5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
38 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
70 #define PERL_IN_REGEXEC_C
73 #ifdef PERL_IN_XSUB_RE
79 #define RF_tainted 1 /* tainted information used? */
80 #define RF_warned 2 /* warned about big count? */
81 #define RF_evaled 4 /* Did an EVAL with setting? */
82 #define RF_utf8 8 /* String contains multibyte chars? */
84 #define UTF ((PL_reg_flags & RF_utf8) != 0)
86 #define RS_init 1 /* eval environment created */
87 #define RS_set 2 /* replsv value is set */
93 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
99 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
100 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
102 #define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) ((char*) \
106 ((PL_reg_match_utf8) \
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
113 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
114 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
116 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
123 /* for use after a quantifier and before an EXACT-like node -- japhy */
124 #define JUMPABLE(rn) ( \
125 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
126 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
127 OP(rn) == PLUS || OP(rn) == MINMOD || \
128 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
131 #define HAS_TEXT(rn) ( \
132 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
136 Search for mandatory following text node; for lookahead, the text must
137 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139 #define FIND_NEXT_IMPT(rn) STMT_START { \
140 while (JUMPABLE(rn)) \
141 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
142 rn = NEXTOPER(NEXTOPER(rn)); \
143 else if (OP(rn) == PLUS) \
145 else if (OP(rn) == IFMATCH) \
146 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
147 else rn += NEXT_OFF(rn); \
150 static void restore_pos(pTHX_ void *arg);
153 S_regcppush(pTHX_ I32 parenfloor)
156 const int retval = PL_savestack_ix;
157 #define REGCP_PAREN_ELEMS 4
158 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
161 if (paren_elems_to_push < 0)
162 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
164 #define REGCP_OTHER_ELEMS 6
165 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
166 for (p = PL_regsize; p > parenfloor; p--) {
167 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
168 SSPUSHINT(PL_regendp[p]);
169 SSPUSHINT(PL_regstartp[p]);
170 SSPUSHPTR(PL_reg_start_tmp[p]);
173 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
174 SSPUSHINT(PL_regsize);
175 SSPUSHINT(*PL_reglastparen);
176 SSPUSHINT(*PL_reglastcloseparen);
177 SSPUSHPTR(PL_reginput);
178 #define REGCP_FRAME_ELEMS 2
179 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
180 * are needed for the regexp context stack bookkeeping. */
181 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
182 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
187 /* These are needed since we do not localize EVAL nodes: */
188 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
189 " Setting an EVAL scope, savestack=%"IVdf"\n", \
190 (IV)PL_savestack_ix)); cp = PL_savestack_ix
192 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
193 PerlIO_printf(Perl_debug_log, \
194 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
195 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
198 S_regcppop(pTHX_ const regexp *rex)
204 GET_RE_DEBUG_FLAGS_DECL;
206 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
208 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
209 i = SSPOPINT; /* Parentheses elements to pop. */
210 input = (char *) SSPOPPTR;
211 *PL_reglastcloseparen = SSPOPINT;
212 *PL_reglastparen = SSPOPINT;
213 PL_regsize = SSPOPINT;
215 /* Now restore the parentheses context. */
216 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
217 i > 0; i -= REGCP_PAREN_ELEMS) {
219 U32 paren = (U32)SSPOPINT;
220 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
221 PL_regstartp[paren] = SSPOPINT;
223 if (paren <= *PL_reglastparen)
224 PL_regendp[paren] = tmps;
226 PerlIO_printf(Perl_debug_log,
227 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
228 (UV)paren, (IV)PL_regstartp[paren],
229 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
230 (IV)PL_regendp[paren],
231 (paren > *PL_reglastparen ? "(no)" : ""));
235 if (*PL_reglastparen + 1 <= rex->nparens) {
236 PerlIO_printf(Perl_debug_log,
237 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
238 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
242 /* It would seem that the similar code in regtry()
243 * already takes care of this, and in fact it is in
244 * a better location to since this code can #if 0-ed out
245 * but the code in regtry() is needed or otherwise tests
246 * requiring null fields (pat.t#187 and split.t#{13,14}
247 * (as of patchlevel 7877) will fail. Then again,
248 * this code seems to be necessary or otherwise
249 * building DynaLoader will fail:
250 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
252 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
254 PL_regstartp[i] = -1;
261 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
263 #define TRYPAREN(paren, n, input, where) { \
266 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
267 PL_regendp[paren] = input - PL_bostr; \
270 PL_regendp[paren] = -1; \
272 REGMATCH(next, where); \
276 PL_regendp[paren] = -1; \
281 * pregexec and friends
284 #ifndef PERL_IN_XSUB_RE
286 - pregexec - match a regexp against a string
289 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
290 char *strbeg, I32 minend, SV *screamer, U32 nosave)
291 /* strend: pointer to null at end of string */
292 /* strbeg: real beginning of string */
293 /* minend: end of match must be >=minend after stringarg. */
294 /* nosave: For optimizations. */
297 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
298 nosave ? 0 : REXEC_COPY_STR);
303 * Need to implement the following flags for reg_anch:
305 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
307 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
308 * INTUIT_AUTORITATIVE_ML
309 * INTUIT_ONCE_NOML - Intuit can match in one location only.
312 * Another flag for this function: SECOND_TIME (so that float substrs
313 * with giant delta may be not rechecked).
316 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
318 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
319 Otherwise, only SvCUR(sv) is used to get strbeg. */
321 /* XXXX We assume that strpos is strbeg unless sv. */
323 /* XXXX Some places assume that there is a fixed substring.
324 An update may be needed if optimizer marks as "INTUITable"
325 RExen without fixed substrings. Similarly, it is assumed that
326 lengths of all the strings are no more than minlen, thus they
327 cannot come from lookahead.
328 (Or minlen should take into account lookahead.) */
330 /* A failure to find a constant substring means that there is no need to make
331 an expensive call to REx engine, thus we celebrate a failure. Similarly,
332 finding a substring too deep into the string means that less calls to
333 regtry() should be needed.
335 REx compiler's optimizer found 4 possible hints:
336 a) Anchored substring;
338 c) Whether we are anchored (beginning-of-line or \G);
339 d) First node (of those at offset 0) which may distingush positions;
340 We use a)b)d) and multiline-part of c), and try to find a position in the
341 string which does not contradict any of them.
344 /* Most of decisions we do here should have been done at compile time.
345 The nodes of the REx which we used for the search should have been
346 deleted from the finite automaton. */
349 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
350 char *strend, U32 flags, re_scream_pos_data *data)
353 register I32 start_shift = 0;
354 /* Should be nonnegative! */
355 register I32 end_shift = 0;
360 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
362 register char *other_last = NULL; /* other substr checked before this */
363 char *check_at = NULL; /* check substr found at this pos */
364 const I32 multiline = prog->reganch & PMf_MULTILINE;
366 const char * const i_strpos = strpos;
367 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
370 GET_RE_DEBUG_FLAGS_DECL;
372 RX_MATCH_UTF8_set(prog,do_utf8);
374 if (prog->reganch & ROPT_UTF8) {
375 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
376 "UTF-8 regex...\n"));
377 PL_reg_flags |= RF_utf8;
381 const char *s = PL_reg_match_utf8 ?
382 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
384 const int len = PL_reg_match_utf8 ?
385 (int)strlen(s) : strend - strpos;
388 if (PL_reg_match_utf8)
389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
390 "UTF-8 target...\n"));
391 PerlIO_printf(Perl_debug_log,
392 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
393 PL_colors[4], PL_colors[5], PL_colors[0],
396 (strlen(prog->precomp) > 60 ? "..." : ""),
398 (int)(len > 60 ? 60 : len),
400 (len > 60 ? "..." : "")
404 /* CHR_DIST() would be more correct here but it makes things slow. */
405 if (prog->minlen > strend - strpos) {
406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
407 "String too short... [re_intuit_start]\n"));
410 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
413 if (!prog->check_utf8 && prog->check_substr)
414 to_utf8_substr(prog);
415 check = prog->check_utf8;
417 if (!prog->check_substr && prog->check_utf8)
418 to_byte_substr(prog);
419 check = prog->check_substr;
421 if (check == &PL_sv_undef) {
422 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
423 "Non-utf string cannot match utf check string\n"));
426 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
427 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
428 || ( (prog->reganch & ROPT_ANCH_BOL)
429 && !multiline ) ); /* Check after \n? */
432 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
433 | ROPT_IMPLICIT)) /* not a real BOL */
434 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
436 && (strpos != strbeg)) {
437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
440 if (prog->check_offset_min == prog->check_offset_max &&
441 !(prog->reganch & ROPT_CANY_SEEN)) {
442 /* Substring at constant offset from beg-of-str... */
445 s = HOP3c(strpos, prog->check_offset_min, strend);
447 slen = SvCUR(check); /* >= 1 */
449 if ( strend - s > slen || strend - s < slen - 1
450 || (strend - s == slen && strend[-1] != '\n')) {
451 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
454 /* Now should match s[0..slen-2] */
456 if (slen && (*SvPVX_const(check) != *s
458 && memNE(SvPVX_const(check), s, slen)))) {
460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
464 else if (*SvPVX_const(check) != *s
465 || ((slen = SvCUR(check)) > 1
466 && memNE(SvPVX_const(check), s, slen)))
469 goto success_at_start;
472 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
474 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
475 end_shift = prog->minlen - start_shift -
476 CHR_SVLEN(check) + (SvTAIL(check) != 0);
478 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
479 - (SvTAIL(check) != 0);
480 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
482 if (end_shift < eshift)
486 else { /* Can match at random position */
489 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
490 /* Should be nonnegative! */
491 end_shift = prog->minlen - start_shift -
492 CHR_SVLEN(check) + (SvTAIL(check) != 0);
495 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
497 Perl_croak(aTHX_ "panic: end_shift");
501 /* Find a possible match in the region s..strend by looking for
502 the "check" substring in the region corrected by start/end_shift. */
503 if (flags & REXEC_SCREAM) {
504 I32 p = -1; /* Internal iterator of scream. */
505 I32 * const pp = data ? data->scream_pos : &p;
507 if (PL_screamfirst[BmRARE(check)] >= 0
508 || ( BmRARE(check) == '\n'
509 && (BmPREVIOUS(check) == SvCUR(check) - 1)
511 s = screaminstr(sv, check,
512 start_shift + (s - strbeg), end_shift, pp, 0);
515 /* we may be pointing at the wrong string */
516 if (s && RX_MATCH_COPIED(prog))
517 s = strbeg + (s - SvPVX_const(sv));
519 *data->scream_olds = s;
521 else if (prog->reganch & ROPT_CANY_SEEN)
522 s = fbm_instr((U8*)(s + start_shift),
523 (U8*)(strend - end_shift),
524 check, multiline ? FBMrf_MULTILINE : 0);
526 s = fbm_instr(HOP3(s, start_shift, strend),
527 HOP3(strend, -end_shift, strbeg),
528 check, multiline ? FBMrf_MULTILINE : 0);
530 /* Update the count-of-usability, remove useless subpatterns,
533 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
534 (s ? "Found" : "Did not find"),
535 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
537 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
539 PL_colors[1], (SvTAIL(check) ? "$" : ""),
540 (s ? " at offset " : "...\n") ) );
547 /* Finish the diagnostic message */
548 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
550 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
551 Start with the other substr.
552 XXXX no SCREAM optimization yet - and a very coarse implementation
553 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
554 *always* match. Probably should be marked during compile...
555 Probably it is right to do no SCREAM here...
558 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
559 /* Take into account the "other" substring. */
560 /* XXXX May be hopelessly wrong for UTF... */
563 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
566 char * const last = HOP3c(s, -start_shift, strbeg);
571 t = s - prog->check_offset_max;
572 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
574 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
579 t = HOP3c(t, prog->anchored_offset, strend);
580 if (t < other_last) /* These positions already checked */
582 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
585 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
586 /* On end-of-str: see comment below. */
587 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
588 if (must == &PL_sv_undef) {
590 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
595 HOP3(HOP3(last1, prog->anchored_offset, strend)
596 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
598 multiline ? FBMrf_MULTILINE : 0
600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
601 "%s anchored substr \"%s%.*s%s\"%s",
602 (s ? "Found" : "Contradicts"),
605 - (SvTAIL(must)!=0)),
607 PL_colors[1], (SvTAIL(must) ? "$" : "")));
609 if (last1 >= last2) {
610 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
611 ", giving up...\n"));
614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
615 ", trying floating at offset %ld...\n",
616 (long)(HOP3c(s1, 1, strend) - i_strpos)));
617 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
618 s = HOP3c(last, 1, strend);
622 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
623 (long)(s - i_strpos)));
624 t = HOP3c(s, -prog->anchored_offset, strbeg);
625 other_last = HOP3c(s, 1, strend);
633 else { /* Take into account the floating substring. */
638 t = HOP3c(s, -start_shift, strbeg);
640 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
641 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
642 last = HOP3c(t, prog->float_max_offset, strend);
643 s = HOP3c(t, prog->float_min_offset, strend);
646 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
647 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
648 /* fbm_instr() takes into account exact value of end-of-str
649 if the check is SvTAIL(ed). Since false positives are OK,
650 and end-of-str is not later than strend we are OK. */
651 if (must == &PL_sv_undef) {
653 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
656 s = fbm_instr((unsigned char*)s,
657 (unsigned char*)last + SvCUR(must)
659 must, multiline ? FBMrf_MULTILINE : 0);
660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
661 (s ? "Found" : "Contradicts"),
663 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
665 PL_colors[1], (SvTAIL(must) ? "$" : "")));
668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
669 ", giving up...\n"));
672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
673 ", trying anchored starting at offset %ld...\n",
674 (long)(s1 + 1 - i_strpos)));
676 s = HOP3c(t, 1, strend);
680 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
681 (long)(s - i_strpos)));
682 other_last = s; /* Fix this later. --Hugo */
691 t = s - prog->check_offset_max;
692 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
694 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
696 /* Fixed substring is found far enough so that the match
697 cannot start at strpos. */
699 if (ml_anch && t[-1] != '\n') {
700 /* Eventually fbm_*() should handle this, but often
701 anchored_offset is not 0, so this check will not be wasted. */
702 /* XXXX In the code below we prefer to look for "^" even in
703 presence of anchored substrings. And we search even
704 beyond the found float position. These pessimizations
705 are historical artefacts only. */
707 while (t < strend - prog->minlen) {
709 if (t < check_at - prog->check_offset_min) {
710 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
711 /* Since we moved from the found position,
712 we definitely contradict the found anchored
713 substr. Due to the above check we do not
714 contradict "check" substr.
715 Thus we can arrive here only if check substr
716 is float. Redo checking for "other"=="fixed".
719 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
720 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
721 goto do_other_anchored;
723 /* We don't contradict the found floating substring. */
724 /* XXXX Why not check for STCLASS? */
726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
727 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
730 /* Position contradicts check-string */
731 /* XXXX probably better to look for check-string
732 than for "\n", so one should lower the limit for t? */
733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
734 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
735 other_last = strpos = s = t + 1;
740 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
741 PL_colors[0], PL_colors[1]));
745 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
746 PL_colors[0], PL_colors[1]));
750 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
753 /* The found string does not prohibit matching at strpos,
754 - no optimization of calling REx engine can be performed,
755 unless it was an MBOL and we are not after MBOL,
756 or a future STCLASS check will fail this. */
758 /* Even in this situation we may use MBOL flag if strpos is offset
759 wrt the start of the string. */
760 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
761 && (strpos != strbeg) && strpos[-1] != '\n'
762 /* May be due to an implicit anchor of m{.*foo} */
763 && !(prog->reganch & ROPT_IMPLICIT))
768 DEBUG_EXECUTE_r( if (ml_anch)
769 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
770 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
773 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
775 prog->check_utf8 /* Could be deleted already */
776 && --BmUSEFUL(prog->check_utf8) < 0
777 && (prog->check_utf8 == prog->float_utf8)
779 prog->check_substr /* Could be deleted already */
780 && --BmUSEFUL(prog->check_substr) < 0
781 && (prog->check_substr == prog->float_substr)
784 /* If flags & SOMETHING - do not do it many times on the same match */
785 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
786 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
787 if (do_utf8 ? prog->check_substr : prog->check_utf8)
788 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
789 prog->check_substr = prog->check_utf8 = NULL; /* disable */
790 prog->float_substr = prog->float_utf8 = NULL; /* clear */
791 check = NULL; /* abort */
793 /* XXXX This is a remnant of the old implementation. It
794 looks wasteful, since now INTUIT can use many
796 prog->reganch &= ~RE_USE_INTUIT;
803 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
804 if (prog->regstclass) {
805 /* minlen == 0 is possible if regstclass is \b or \B,
806 and the fixed substr is ''$.
807 Since minlen is already taken into account, s+1 is before strend;
808 accidentally, minlen >= 1 guaranties no false positives at s + 1
809 even for \b or \B. But (minlen? 1 : 0) below assumes that
810 regstclass does not come from lookahead... */
811 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
812 This leaves EXACTF only, which is dealt with in find_byclass(). */
813 const U8* const str = (U8*)STRING(prog->regstclass);
814 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
815 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
817 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
818 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
819 : (prog->float_substr || prog->float_utf8
820 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
825 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
828 const char *what = NULL;
830 if (endpos == strend) {
831 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
832 "Could not match STCLASS...\n") );
835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
836 "This position contradicts STCLASS...\n") );
837 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
839 /* Contradict one of substrings */
840 if (prog->anchored_substr || prog->anchored_utf8) {
841 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
842 DEBUG_EXECUTE_r( what = "anchored" );
844 s = HOP3c(t, 1, strend);
845 if (s + start_shift + end_shift > strend) {
846 /* XXXX Should be taken into account earlier? */
847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
848 "Could not match STCLASS...\n") );
853 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
854 "Looking for %s substr starting at offset %ld...\n",
855 what, (long)(s + start_shift - i_strpos)) );
858 /* Have both, check_string is floating */
859 if (t + start_shift >= check_at) /* Contradicts floating=check */
860 goto retry_floating_check;
861 /* Recheck anchored substring, but not floating... */
865 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
866 "Looking for anchored substr starting at offset %ld...\n",
867 (long)(other_last - i_strpos)) );
868 goto do_other_anchored;
870 /* Another way we could have checked stclass at the
871 current position only: */
876 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
877 "Looking for /%s^%s/m starting at offset %ld...\n",
878 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
881 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
883 /* Check is floating subtring. */
884 retry_floating_check:
885 t = check_at - start_shift;
886 DEBUG_EXECUTE_r( what = "floating" );
887 goto hop_and_restart;
890 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
891 "By STCLASS: moving %ld --> %ld\n",
892 (long)(t - i_strpos), (long)(s - i_strpos))
896 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
897 "Does not contradict STCLASS...\n");
902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
903 PL_colors[4], (check ? "Guessed" : "Giving up"),
904 PL_colors[5], (long)(s - i_strpos)) );
907 fail_finish: /* Substring not found */
908 if (prog->check_substr || prog->check_utf8) /* could be removed already */
909 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
911 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
912 PL_colors[4], PL_colors[5]));
916 /* We know what class REx starts with. Try to find this position... */
917 /* if reginfo is NULL, its a dryrun */
920 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char
921 *strend, const regmatch_info *reginfo)
924 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
928 register STRLEN uskip;
932 register I32 tmp = 1; /* Scratch variable? */
933 register const bool do_utf8 = PL_reg_match_utf8;
935 /* We know what class it must start with. */
939 while (s + (uskip = UTF8SKIP(s)) <= strend) {
940 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
941 !UTF8_IS_INVARIANT((U8)s[0]) ?
942 reginclass(prog, c, (U8*)s, 0, do_utf8) :
943 REGINCLASS(prog, c, (U8*)s)) {
944 if (tmp && (!reginfo || regtry(reginfo, s)))
958 if (REGINCLASS(prog, c, (U8*)s) ||
959 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
960 /* The assignment of 2 is intentional:
961 * for the folded sharp s, the skip is 2. */
962 (skip = SHARP_S_SKIP))) {
963 if (tmp && (!reginfo || regtry(reginfo, s)))
976 if (tmp && (!reginfo || regtry(reginfo, s)))
985 ln = STR_LEN(c); /* length to match in octets/bytes */
986 lnc = (I32) ln; /* length to match in characters */
990 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
991 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
992 const U32 uniflags = UTF8_ALLOW_DEFAULT;
994 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
995 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
997 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
999 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1002 while (sm < ((U8 *) m + ln)) {
1017 c2 = PL_fold_locale[c1];
1019 e = HOP3c(strend, -((I32)lnc), s);
1021 if (!reginfo && e < s)
1022 e = s; /* Due to minlen logic of intuit() */
1024 /* The idea in the EXACTF* cases is to first find the
1025 * first character of the EXACTF* node and then, if
1026 * necessary, case-insensitively compare the full
1027 * text of the node. The c1 and c2 are the first
1028 * characters (though in Unicode it gets a bit
1029 * more complicated because there are more cases
1030 * than just upper and lower: one needs to use
1031 * the so-called folding case for case-insensitive
1032 * matching (called "loose matching" in Unicode).
1033 * ibcmp_utf8() will do just that. */
1037 U8 tmpbuf [UTF8_MAXBYTES+1];
1038 STRLEN len, foldlen;
1039 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1041 /* Upper and lower of 1st char are equal -
1042 * probably not a "letter". */
1044 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1048 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1049 m, (char **)0, ln, (bool)UTF))
1050 && (!reginfo || regtry(reginfo, s)) )
1053 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1054 uvchr_to_utf8(tmpbuf, c);
1055 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1057 && (f == c1 || f == c2)
1058 && (ln == foldlen ||
1059 !ibcmp_utf8((char *) foldbuf,
1060 (char **)0, foldlen, do_utf8,
1062 (char **)0, ln, (bool)UTF))
1063 && (!reginfo || regtry(reginfo, s)) )
1071 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1074 /* Handle some of the three Greek sigmas cases.
1075 * Note that not all the possible combinations
1076 * are handled here: some of them are handled
1077 * by the standard folding rules, and some of
1078 * them (the character class or ANYOF cases)
1079 * are handled during compiletime in
1080 * regexec.c:S_regclass(). */
1081 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1082 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1083 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1085 if ( (c == c1 || c == c2)
1087 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1088 m, (char **)0, ln, (bool)UTF))
1089 && (!reginfo || regtry(reginfo, s)) )
1092 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1093 uvchr_to_utf8(tmpbuf, c);
1094 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1096 && (f == c1 || f == c2)
1097 && (ln == foldlen ||
1098 !ibcmp_utf8((char *) foldbuf,
1099 (char **)0, foldlen, do_utf8,
1101 (char **)0, ln, (bool)UTF))
1102 && (!reginfo || regtry(reginfo, s)) )
1113 && (ln == 1 || !(OP(c) == EXACTF
1115 : ibcmp_locale(s, m, ln)))
1116 && (!reginfo || regtry(reginfo, s)) )
1122 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1123 && (ln == 1 || !(OP(c) == EXACTF
1125 : ibcmp_locale(s, m, ln)))
1126 && (!reginfo || regtry(reginfo, s)) )
1133 PL_reg_flags |= RF_tainted;
1140 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1141 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1143 tmp = ((OP(c) == BOUND ?
1144 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1145 LOAD_UTF8_CHARCLASS_ALNUM();
1146 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1147 if (tmp == !(OP(c) == BOUND ?
1148 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1149 isALNUM_LC_utf8((U8*)s)))
1152 if ((!reginfo || regtry(reginfo, s)))
1159 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1160 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1161 while (s < strend) {
1163 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1165 if ((!reginfo || regtry(reginfo, s)))
1171 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1175 PL_reg_flags |= RF_tainted;
1182 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1183 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1185 tmp = ((OP(c) == NBOUND ?
1186 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1187 LOAD_UTF8_CHARCLASS_ALNUM();
1188 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1189 if (tmp == !(OP(c) == NBOUND ?
1190 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1191 isALNUM_LC_utf8((U8*)s)))
1193 else if ((!reginfo || regtry(reginfo, s)))
1199 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1200 tmp = ((OP(c) == NBOUND ?
1201 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1202 while (s < strend) {
1204 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1206 else if ((!reginfo || regtry(reginfo, s)))
1211 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1216 LOAD_UTF8_CHARCLASS_ALNUM();
1217 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1218 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1219 if (tmp && (!reginfo || regtry(reginfo, s)))
1230 while (s < strend) {
1232 if (tmp && (!reginfo || regtry(reginfo, s)))
1244 PL_reg_flags |= RF_tainted;
1246 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1247 if (isALNUM_LC_utf8((U8*)s)) {
1248 if (tmp && (!reginfo || regtry(reginfo, s)))
1259 while (s < strend) {
1260 if (isALNUM_LC(*s)) {
1261 if (tmp && (!reginfo || regtry(reginfo, s)))
1274 LOAD_UTF8_CHARCLASS_ALNUM();
1275 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1276 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1277 if (tmp && (!reginfo || regtry(reginfo, s)))
1288 while (s < strend) {
1290 if (tmp && (!reginfo || regtry(reginfo, s)))
1302 PL_reg_flags |= RF_tainted;
1304 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1305 if (!isALNUM_LC_utf8((U8*)s)) {
1306 if (tmp && (!reginfo || regtry(reginfo, s)))
1317 while (s < strend) {
1318 if (!isALNUM_LC(*s)) {
1319 if (tmp && (!reginfo || regtry(reginfo, s)))
1332 LOAD_UTF8_CHARCLASS_SPACE();
1333 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1334 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1335 if (tmp && (!reginfo || regtry(reginfo, s)))
1346 while (s < strend) {
1348 if (tmp && (!reginfo || regtry(reginfo, s)))
1360 PL_reg_flags |= RF_tainted;
1362 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1363 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1364 if (tmp && (!reginfo || regtry(reginfo, s)))
1375 while (s < strend) {
1376 if (isSPACE_LC(*s)) {
1377 if (tmp && (!reginfo || regtry(reginfo, s)))
1390 LOAD_UTF8_CHARCLASS_SPACE();
1391 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1392 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1393 if (tmp && (!reginfo || regtry(reginfo, s)))
1404 while (s < strend) {
1406 if (tmp && (!reginfo || regtry(reginfo, s)))
1418 PL_reg_flags |= RF_tainted;
1420 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1421 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1422 if (tmp && (!reginfo || regtry(reginfo, s)))
1433 while (s < strend) {
1434 if (!isSPACE_LC(*s)) {
1435 if (tmp && (!reginfo || regtry(reginfo, s)))
1448 LOAD_UTF8_CHARCLASS_DIGIT();
1449 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1450 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1451 if (tmp && (!reginfo || regtry(reginfo, s)))
1462 while (s < strend) {
1464 if (tmp && (!reginfo || regtry(reginfo, s)))
1476 PL_reg_flags |= RF_tainted;
1478 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1479 if (isDIGIT_LC_utf8((U8*)s)) {
1480 if (tmp && (!reginfo || regtry(reginfo, s)))
1491 while (s < strend) {
1492 if (isDIGIT_LC(*s)) {
1493 if (tmp && (!reginfo || regtry(reginfo, s)))
1506 LOAD_UTF8_CHARCLASS_DIGIT();
1507 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1508 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1509 if (tmp && (!reginfo || regtry(reginfo, s)))
1520 while (s < strend) {
1522 if (tmp && (!reginfo || regtry(reginfo, s)))
1534 PL_reg_flags |= RF_tainted;
1536 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1537 if (!isDIGIT_LC_utf8((U8*)s)) {
1538 if (tmp && (!reginfo || regtry(reginfo, s)))
1549 while (s < strend) {
1550 if (!isDIGIT_LC(*s)) {
1551 if (tmp && (!reginfo || regtry(reginfo, s)))
1563 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1572 - regexec_flags - match a regexp against a string
1575 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1576 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1577 /* strend: pointer to null at end of string */
1578 /* strbeg: real beginning of string */
1579 /* minend: end of match must be >=minend after stringarg. */
1580 /* data: May be used for some additional optimizations. */
1581 /* nosave: For optimizations. */
1585 register regnode *c;
1586 register char *startpos = stringarg;
1587 I32 minlen; /* must match at least this many chars */
1588 I32 dontbother = 0; /* how many characters not to try at end */
1589 I32 end_shift = 0; /* Same for the end. */ /* CC */
1590 I32 scream_pos = -1; /* Internal iterator of scream. */
1591 char *scream_olds = NULL;
1592 SV* oreplsv = GvSV(PL_replgv);
1593 const bool do_utf8 = DO_UTF8(sv);
1599 regmatch_info reginfo; /* create some info to pass to regtry etc */
1601 GET_RE_DEBUG_FLAGS_DECL;
1603 PERL_UNUSED_ARG(data);
1605 /* Be paranoid... */
1606 if (prog == NULL || startpos == NULL) {
1607 Perl_croak(aTHX_ "NULL regexp parameter");
1611 multiline = prog->reganch & PMf_MULTILINE;
1612 reginfo.prog = prog;
1615 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1616 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1619 RX_MATCH_UTF8_set(prog, do_utf8);
1621 minlen = prog->minlen;
1622 if (strend - startpos < minlen) {
1623 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1624 "String too short [regexec_flags]...\n"));
1628 /* Check validity of program. */
1629 if (UCHARAT(prog->program) != REG_MAGIC) {
1630 Perl_croak(aTHX_ "corrupted regexp program");
1634 PL_reg_eval_set = 0;
1637 if (prog->reganch & ROPT_UTF8)
1638 PL_reg_flags |= RF_utf8;
1640 /* Mark beginning of line for ^ and lookbehind. */
1641 reginfo.bol = startpos; /* XXX not used ??? */
1645 /* Mark end of line for $ (and such) */
1648 /* see how far we have to get to not match where we matched before */
1649 reginfo.till = startpos+minend;
1651 /* If there is a "must appear" string, look for it. */
1654 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1657 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1658 reginfo.ganch = startpos;
1659 else if (sv && SvTYPE(sv) >= SVt_PVMG
1661 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1662 && mg->mg_len >= 0) {
1663 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1664 if (prog->reganch & ROPT_ANCH_GPOS) {
1665 if (s > reginfo.ganch)
1670 else /* pos() not defined */
1671 reginfo.ganch = strbeg;
1674 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1675 re_scream_pos_data d;
1677 d.scream_olds = &scream_olds;
1678 d.scream_pos = &scream_pos;
1679 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1681 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1682 goto phooey; /* not present */
1687 const char * const s0 = UTF
1688 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1691 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1692 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1693 UNI_DISPLAY_REGEX) : startpos;
1694 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
1697 PerlIO_printf(Perl_debug_log,
1698 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1699 PL_colors[4], PL_colors[5], PL_colors[0],
1702 len0 > 60 ? "..." : "",
1704 (int)(len1 > 60 ? 60 : len1),
1706 (len1 > 60 ? "..." : "")
1710 /* Simplest case: anchored match need be tried only once. */
1711 /* [unless only anchor is BOL and multiline is set] */
1712 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1713 if (s == startpos && regtry(®info, startpos))
1715 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1716 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1721 dontbother = minlen - 1;
1722 end = HOP3c(strend, -dontbother, strbeg) - 1;
1723 /* for multiline we only have to try after newlines */
1724 if (prog->check_substr || prog->check_utf8) {
1728 if (regtry(®info, s))
1733 if (prog->reganch & RE_USE_INTUIT) {
1734 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1745 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1746 if (regtry(®info, s))
1753 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1754 if (regtry(®info, reginfo.ganch))
1759 /* Messy cases: unanchored match. */
1760 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1761 /* we have /x+whatever/ */
1762 /* it must be a one character string (XXXX Except UTF?) */
1767 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1768 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1769 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1772 while (s < strend) {
1774 DEBUG_EXECUTE_r( did_match = 1 );
1775 if (regtry(®info, s)) goto got_it;
1777 while (s < strend && *s == ch)
1784 while (s < strend) {
1786 DEBUG_EXECUTE_r( did_match = 1 );
1787 if (regtry(®info, s)) goto got_it;
1789 while (s < strend && *s == ch)
1795 DEBUG_EXECUTE_r(if (!did_match)
1796 PerlIO_printf(Perl_debug_log,
1797 "Did not find anchored character...\n")
1800 else if (prog->anchored_substr != NULL
1801 || prog->anchored_utf8 != NULL
1802 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1803 && prog->float_max_offset < strend - s)) {
1808 char *last1; /* Last position checked before */
1812 if (prog->anchored_substr || prog->anchored_utf8) {
1813 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1814 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1815 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1816 back_max = back_min = prog->anchored_offset;
1818 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1819 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1820 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1821 back_max = prog->float_max_offset;
1822 back_min = prog->float_min_offset;
1824 if (must == &PL_sv_undef)
1825 /* could not downgrade utf8 check substring, so must fail */
1828 last = HOP3c(strend, /* Cannot start after this */
1829 -(I32)(CHR_SVLEN(must)
1830 - (SvTAIL(must) != 0) + back_min), strbeg);
1833 last1 = HOPc(s, -1);
1835 last1 = s - 1; /* bogus */
1837 /* XXXX check_substr already used to find "s", can optimize if
1838 check_substr==must. */
1840 dontbother = end_shift;
1841 strend = HOPc(strend, -dontbother);
1842 while ( (s <= last) &&
1843 ((flags & REXEC_SCREAM)
1844 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1845 end_shift, &scream_pos, 0))
1846 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1847 (unsigned char*)strend, must,
1848 multiline ? FBMrf_MULTILINE : 0))) ) {
1849 /* we may be pointing at the wrong string */
1850 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1851 s = strbeg + (s - SvPVX_const(sv));
1852 DEBUG_EXECUTE_r( did_match = 1 );
1853 if (HOPc(s, -back_max) > last1) {
1854 last1 = HOPc(s, -back_min);
1855 s = HOPc(s, -back_max);
1858 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1860 last1 = HOPc(s, -back_min);
1864 while (s <= last1) {
1865 if (regtry(®info, s))
1871 while (s <= last1) {
1872 if (regtry(®info, s))
1878 DEBUG_EXECUTE_r(if (!did_match)
1879 PerlIO_printf(Perl_debug_log,
1880 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1881 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1882 ? "anchored" : "floating"),
1884 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1886 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1890 else if ((c = prog->regstclass)) {
1892 I32 op = (U8)OP(prog->regstclass);
1893 /* don't bother with what can't match */
1894 if (PL_regkind[op] != EXACT && op != CANY)
1895 strend = HOPc(strend, -(minlen - 1));
1898 SV *prop = sv_newmortal();
1904 regprop(prog, prop, c);
1906 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1907 UNI_DISPLAY_REGEX) :
1909 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1911 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1912 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
1913 PerlIO_printf(Perl_debug_log,
1914 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1918 if (find_byclass(prog, c, s, strend, ®info))
1920 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1924 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1929 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1930 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1931 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1933 if (flags & REXEC_SCREAM) {
1934 last = screaminstr(sv, float_real, s - strbeg,
1935 end_shift, &scream_pos, 1); /* last one */
1937 last = scream_olds; /* Only one occurrence. */
1938 /* we may be pointing at the wrong string */
1939 else if (RX_MATCH_COPIED(prog))
1940 s = strbeg + (s - SvPVX_const(sv));
1944 const char * const little = SvPV_const(float_real, len);
1946 if (SvTAIL(float_real)) {
1947 if (memEQ(strend - len + 1, little, len - 1))
1948 last = strend - len + 1;
1949 else if (!multiline)
1950 last = memEQ(strend - len, little, len)
1951 ? strend - len : NULL;
1957 last = rninstr(s, strend, little, little + len);
1959 last = strend; /* matching "$" */
1963 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1964 "%sCan't trim the tail, match fails (should not happen)%s\n",
1965 PL_colors[4], PL_colors[5]));
1966 goto phooey; /* Should not happen! */
1968 dontbother = strend - last + prog->float_min_offset;
1970 if (minlen && (dontbother < minlen))
1971 dontbother = minlen - 1;
1972 strend -= dontbother; /* this one's always in bytes! */
1973 /* We don't know much -- general case. */
1976 if (regtry(®info, s))
1985 if (regtry(®info, s))
1987 } while (s++ < strend);
1995 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1997 if (PL_reg_eval_set) {
1998 /* Preserve the current value of $^R */
1999 if (oreplsv != GvSV(PL_replgv))
2000 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2001 restored, the value remains
2003 restore_pos(aTHX_ prog);
2006 /* make sure $`, $&, $', and $digit will work later */
2007 if ( !(flags & REXEC_NOT_FIRST) ) {
2008 RX_MATCH_COPY_FREE(prog);
2009 if (flags & REXEC_COPY_STR) {
2010 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2011 #ifdef PERL_OLD_COPY_ON_WRITE
2013 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2015 PerlIO_printf(Perl_debug_log,
2016 "Copy on write: regexp capture, type %d\n",
2019 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2020 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2021 assert (SvPOKp(prog->saved_copy));
2025 RX_MATCH_COPIED_on(prog);
2026 s = savepvn(strbeg, i);
2032 prog->subbeg = strbeg;
2033 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2040 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2041 PL_colors[4], PL_colors[5]));
2042 if (PL_reg_eval_set)
2043 restore_pos(aTHX_ prog);
2048 - regtry - try match at specific point
2050 STATIC I32 /* 0 failure, 1 success */
2051 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2057 regexp *prog = reginfo->prog;
2058 GET_RE_DEBUG_FLAGS_DECL;
2061 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2063 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2066 PL_reg_eval_set = RS_init;
2067 DEBUG_EXECUTE_r(DEBUG_s(
2068 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2069 (IV)(PL_stack_sp - PL_stack_base));
2071 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2072 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2073 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2075 /* Apparently this is not needed, judging by wantarray. */
2076 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2077 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2080 /* Make $_ available to executed code. */
2081 if (reginfo->sv != DEFSV) {
2083 DEFSV = reginfo->sv;
2086 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2087 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2088 /* prepare for quick setting of pos */
2089 #ifdef PERL_OLD_COPY_ON_WRITE
2091 sv_force_normal_flags(sv, 0);
2093 mg = sv_magicext(reginfo->sv, (SV*)0, PERL_MAGIC_regex_global,
2094 &PL_vtbl_mglob, NULL, 0);
2098 PL_reg_oldpos = mg->mg_len;
2099 SAVEDESTRUCTOR_X(restore_pos, prog);
2101 if (!PL_reg_curpm) {
2102 Newxz(PL_reg_curpm, 1, PMOP);
2105 SV* repointer = newSViv(0);
2106 /* so we know which PL_regex_padav element is PL_reg_curpm */
2107 SvFLAGS(repointer) |= SVf_BREAK;
2108 av_push(PL_regex_padav,repointer);
2109 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2110 PL_regex_pad = AvARRAY(PL_regex_padav);
2114 PM_SETRE(PL_reg_curpm, prog);
2115 PL_reg_oldcurpm = PL_curpm;
2116 PL_curpm = PL_reg_curpm;
2117 if (RX_MATCH_COPIED(prog)) {
2118 /* Here is a serious problem: we cannot rewrite subbeg,
2119 since it may be needed if this match fails. Thus
2120 $` inside (?{}) could fail... */
2121 PL_reg_oldsaved = prog->subbeg;
2122 PL_reg_oldsavedlen = prog->sublen;
2123 #ifdef PERL_OLD_COPY_ON_WRITE
2124 PL_nrs = prog->saved_copy;
2126 RX_MATCH_COPIED_off(prog);
2129 PL_reg_oldsaved = NULL;
2130 prog->subbeg = PL_bostr;
2131 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2133 prog->startp[0] = startpos - PL_bostr;
2134 PL_reginput = startpos;
2135 PL_regstartp = prog->startp;
2136 PL_regendp = prog->endp;
2137 PL_reglastparen = &prog->lastparen;
2138 PL_reglastcloseparen = &prog->lastcloseparen;
2139 prog->lastparen = 0;
2140 prog->lastcloseparen = 0;
2142 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2143 if (PL_reg_start_tmpl <= prog->nparens) {
2144 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2145 if(PL_reg_start_tmp)
2146 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2148 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2151 /* XXXX What this code is doing here?!!! There should be no need
2152 to do this again and again, PL_reglastparen should take care of
2155 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2156 * Actually, the code in regcppop() (which Ilya may be meaning by
2157 * PL_reglastparen), is not needed at all by the test suite
2158 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2159 * enough, for building DynaLoader, or otherwise this
2160 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2161 * will happen. Meanwhile, this code *is* needed for the
2162 * above-mentioned test suite tests to succeed. The common theme
2163 * on those tests seems to be returning null fields from matches.
2168 if (prog->nparens) {
2170 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2177 if (regmatch(reginfo, prog->program + 1)) {
2178 prog->endp[0] = PL_reginput - PL_bostr;
2181 REGCP_UNWIND(lastcp);
2185 #define RE_UNWIND_BRANCH 1
2186 #define RE_UNWIND_BRANCHJ 2
2190 typedef struct { /* XX: makes sense to enlarge it... */
2194 } re_unwind_generic_t;
2208 } re_unwind_branch_t;
2210 typedef union re_unwind_t {
2212 re_unwind_generic_t generic;
2213 re_unwind_branch_t branch;
2216 #define sayYES goto yes
2217 #define sayNO goto no
2218 #define sayNO_ANYOF goto no_anyof
2219 #define sayYES_FINAL goto yes_final
2220 #define sayNO_FINAL goto no_final
2221 #define sayNO_SILENT goto do_no
2222 #define saySAME(x) if (x) goto yes; else goto no
2224 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2225 #define POSCACHE_SEEN 1 /* we know what we're caching */
2226 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2228 #define CACHEsayYES STMT_START { \
2229 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2230 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2231 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2232 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2234 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2235 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2238 /* cache records failure, but this is success */ \
2240 PerlIO_printf(Perl_debug_log, \
2241 "%*s (remove success from failure cache)\n", \
2242 REPORT_CODE_OFF+PL_regindent*2, "") \
2244 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2250 #define CACHEsayNO STMT_START { \
2251 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2252 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2253 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2254 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2256 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2257 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2260 /* cache records success, but this is failure */ \
2262 PerlIO_printf(Perl_debug_log, \
2263 "%*s (remove failure from success cache)\n", \
2264 REPORT_CODE_OFF+PL_regindent*2, "") \
2266 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2272 /* this is used to determine how far from the left messages like
2273 'failed...' are printed. Currently 29 makes these messages line
2274 up with the opcode they refer to. Earlier perls used 25 which
2275 left these messages outdented making reviewing a debug output
2278 #define REPORT_CODE_OFF 29
2281 /* Make sure there is a test for this +1 options in re_tests */
2282 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2284 /* this value indiciates that the c1/c2 "next char" test should be skipped */
2285 #define CHRTEST_VOID -1000
2287 #define SLAB_FIRST(s) (&(s)->states[0])
2288 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2290 /* grab a new slab and return the first slot in it */
2292 STATIC regmatch_state *
2295 #if PERL_VERSION < 9
2298 regmatch_slab *s = PL_regmatch_slab->next;
2300 Newx(s, 1, regmatch_slab);
2301 s->prev = PL_regmatch_slab;
2303 PL_regmatch_slab->next = s;
2305 PL_regmatch_slab = s;
2306 return SLAB_FIRST(s);
2309 /* simulate a recursive call to regmatch */
2311 #define REGMATCH(ns, where) \
2314 st->resume_state = resume_##where; \
2315 goto start_recurse; \
2316 resume_point_##where:
2319 /* push a new regex state. Set newst to point to it */
2321 #define PUSH_STATE(newst, resume) \
2323 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2327 st->locinput = locinput; \
2328 st->resume_state = resume; \
2330 if (newst > SLAB_LAST(PL_regmatch_slab)) \
2331 newst = S_push_slab(aTHX); \
2332 PL_regmatch_state = newst; \
2334 newst->minmod = 0; \
2336 newst->logical = 0; \
2337 newst->unwind = 0; \
2338 locinput = PL_reginput; \
2339 nextchr = UCHARAT(locinput);
2342 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2345 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
2346 PL_regmatch_slab = PL_regmatch_slab->prev; \
2347 st = SLAB_LAST(PL_regmatch_slab); \
2349 PL_regmatch_state = st; \
2353 locinput = st->locinput; \
2354 nextchr = UCHARAT(locinput);
2357 - regmatch - main matching routine
2359 * Conceptually the strategy is simple: check to see whether the current
2360 * node matches, call self recursively to see whether the rest matches,
2361 * and then act accordingly. In practice we make some effort to avoid
2362 * recursion, in particular by going through "ordinary" nodes (that don't
2363 * need to know whether the rest of the match failed) by a loop instead of
2366 /* [lwall] I've hoisted the register declarations to the outer block in order to
2367 * maybe save a little bit of pushing and popping on the stack. It also takes
2368 * advantage of machines that use a register save mask on subroutine entry.
2370 * This function used to be heavily recursive, but since this had the
2371 * effect of blowing the CPU stack on complex regexes, it has been
2372 * restructured to be iterative, and to save state onto the heap rather
2373 * than the stack. Essentially whereever regmatch() used to be called, it
2374 * pushes the current state, notes where to return, then jumps back into
2377 * Originally the structure of this function used to look something like
2382 while (scan != NULL) {
2383 a++; // do stuff with a and b
2389 if (regmatch(...)) // recurse
2399 * Now it looks something like this:
2407 regmatch_state *st = new();
2409 st->a++; // do stuff with a and b
2411 while (scan != NULL) {
2419 st->resume_state = resume_FOO;
2420 goto start_recurse; // recurse
2429 st = new(); push a new state
2430 st->a = 1; st->b = 2;
2437 switch (resume_state) {
2439 goto resume_point_FOO;
2446 * WARNING: this means that any line in this function that contains a
2447 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2448 * regmatch() using gotos instead. Thus the values of any local variables
2449 * not saved in the regmatch_state structure will have been lost when
2450 * execution resumes on the next line .
2452 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2453 * PL_regmatch_state always points to the currently active state, and
2454 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2455 * The first time regmatch is called, the first slab is allocated, and is
2456 * never freed until interpreter desctruction. When the slab is full,
2457 * a new one is allocated chained to the end. At exit from regmatch, slabs
2458 * allocated since entry are freed.
2462 STATIC I32 /* 0 failure, 1 success */
2463 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2465 #if PERL_VERSION < 9
2469 register const bool do_utf8 = PL_reg_match_utf8;
2470 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2472 regexp *rex = reginfo->prog;
2474 regmatch_slab *orig_slab;
2475 regmatch_state *orig_state;
2477 /* the current state. This is a cached copy of PL_regmatch_state */
2478 register regmatch_state *st;
2480 /* cache heavy used fields of st in registers */
2481 register regnode *scan;
2482 register regnode *next;
2483 register I32 n = 0; /* initialize to shut up compiler warning */
2484 register char *locinput = PL_reginput;
2486 /* these variables are NOT saved during a recusive RFEGMATCH: */
2487 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2488 bool result; /* return value of S_regmatch */
2489 regnode *inner; /* Next node in internal branch. */
2490 int depth = 0; /* depth of recursion */
2491 regmatch_state *newst; /* when pushing a state, this is the new one */
2492 regmatch_state *yes_state = NULL; /* state to pop to on success of
2496 SV *re_debug_flags = NULL;
2501 /* on first ever call to regmatch, allocate first slab */
2502 if (!PL_regmatch_slab) {
2503 Newx(PL_regmatch_slab, 1, regmatch_slab);
2504 PL_regmatch_slab->prev = NULL;
2505 PL_regmatch_slab->next = NULL;
2506 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2509 /* remember current high-water mark for exit */
2510 /* XXX this should be done with SAVE* instead */
2511 orig_slab = PL_regmatch_slab;
2512 orig_state = PL_regmatch_state;
2514 /* grab next free state slot */
2515 st = ++PL_regmatch_state;
2516 if (st > SLAB_LAST(PL_regmatch_slab))
2517 st = PL_regmatch_state = S_push_slab(aTHX);
2524 /* Note that nextchr is a byte even in UTF */
2525 nextchr = UCHARAT(locinput);
2527 while (scan != NULL) {
2530 SV * const prop = sv_newmortal();
2531 const int docolor = *PL_colors[0];
2532 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2533 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2534 /* The part of the string before starttry has one color
2535 (pref0_len chars), between starttry and current
2536 position another one (pref_len - pref0_len chars),
2537 after the current position the third one.
2538 We assume that pref0_len <= pref_len, otherwise we
2539 decrease pref0_len. */
2540 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2541 ? (5 + taill) - l : locinput - PL_bostr;
2544 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2546 pref0_len = pref_len - (locinput - PL_reg_starttry);
2547 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2548 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2549 ? (5 + taill) - pref_len : PL_regeol - locinput);
2550 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2554 if (pref0_len > pref_len)
2555 pref0_len = pref_len;
2556 regprop(rex, prop, scan);
2558 const char * const s0 =
2559 do_utf8 && OP(scan) != CANY ?
2560 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2561 pref0_len, 60, UNI_DISPLAY_REGEX) :
2562 locinput - pref_len;
2563 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2564 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2565 pv_uni_display(PERL_DEBUG_PAD(1),
2566 (U8*)(locinput - pref_len + pref0_len),
2567 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2568 locinput - pref_len + pref0_len;
2569 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2570 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2571 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2572 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2574 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2575 PerlIO_printf(Perl_debug_log,
2576 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2577 (IV)(locinput - PL_bostr),
2584 (docolor ? "" : "> <"),
2588 15 - l - pref_len + 1,
2590 (IV)(scan - rex->program), PL_regindent*2, "",
2595 next = scan + NEXT_OFF(scan);
2601 if (locinput == PL_bostr)
2603 /* reginfo->till = reginfo->bol; */
2608 if (locinput == PL_bostr ||
2609 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2615 if (locinput == PL_bostr)
2619 if (locinput == reginfo->ganch)
2625 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2630 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2632 if (PL_regeol - locinput > 1)
2636 if (PL_regeol != locinput)
2640 if (!nextchr && locinput >= PL_regeol)
2643 locinput += PL_utf8skip[nextchr];
2644 if (locinput > PL_regeol)
2646 nextchr = UCHARAT(locinput);
2649 nextchr = UCHARAT(++locinput);
2652 if (!nextchr && locinput >= PL_regeol)
2654 nextchr = UCHARAT(++locinput);
2657 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2660 locinput += PL_utf8skip[nextchr];
2661 if (locinput > PL_regeol)
2663 nextchr = UCHARAT(locinput);
2666 nextchr = UCHARAT(++locinput);
2672 traverse the TRIE keeping track of all accepting states
2673 we transition through until we get to a failing node.
2681 U8 *uc = ( U8* )locinput;
2688 U8 *uscan = (U8*)NULL;
2690 SV *sv_accept_buff = NULL;
2691 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2692 trie_type = do_utf8 ?
2693 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2696 /* what trie are we using right now */
2698 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2699 st->u.trie.accepted = 0; /* how many accepting states we have seen */
2702 while ( state && uc <= (U8*)PL_regeol ) {
2704 if (trie->states[ state ].wordnum) {
2705 if (!st->u.trie.accepted ) {
2708 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2709 sv_accept_buff=newSV(bufflen *
2710 sizeof(reg_trie_accepted) - 1);
2711 SvCUR_set(sv_accept_buff,
2712 sizeof(reg_trie_accepted));
2713 SvPOK_on(sv_accept_buff);
2714 sv_2mortal(sv_accept_buff);
2715 st->u.trie.accept_buff =
2716 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2719 if (st->u.trie.accepted >= bufflen) {
2721 st->u.trie.accept_buff =(reg_trie_accepted*)
2722 SvGROW(sv_accept_buff,
2723 bufflen * sizeof(reg_trie_accepted));
2725 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2726 + sizeof(reg_trie_accepted));
2728 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2729 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2730 ++st->u.trie.accepted;
2733 base = trie->states[ state ].trans.base;
2735 DEBUG_TRIE_EXECUTE_r(
2736 PerlIO_printf( Perl_debug_log,
2737 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2738 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2739 (UV)state, (UV)base, (UV)st->u.trie.accepted );
2743 switch (trie_type) {
2744 case trie_uft8_fold:
2746 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2751 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2752 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2753 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2754 foldlen -= UNISKIP( uvc );
2755 uscan = foldbuf + UNISKIP( uvc );
2759 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2768 charid = trie->charmap[ uvc ];
2772 if (trie->widecharmap) {
2773 SV** svpp = (SV**)NULL;
2774 svpp = hv_fetch(trie->widecharmap,
2775 (char*)&uvc, sizeof(UV), 0);
2777 charid = (U16)SvIV(*svpp);
2782 (base + charid > trie->uniquecharcount )
2783 && (base + charid - 1 - trie->uniquecharcount
2785 && trie->trans[base + charid - 1 -
2786 trie->uniquecharcount].check == state)
2788 state = trie->trans[base + charid - 1 -
2789 trie->uniquecharcount ].next;
2800 DEBUG_TRIE_EXECUTE_r(
2801 PerlIO_printf( Perl_debug_log,
2802 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2803 charid, uvc, (UV)state, PL_colors[5] );
2806 if (!st->u.trie.accepted )
2810 There was at least one accepting state that we
2811 transitioned through. Presumably the number of accepting
2812 states is going to be low, typically one or two. So we
2813 simply scan through to find the one with lowest wordnum.
2814 Once we find it, we swap the last state into its place
2815 and decrement the size. We then try to match the rest of
2816 the pattern at the point where the word ends, if we
2817 succeed then we end the loop, otherwise the loop
2818 eventually terminates once all of the accepting states
2822 if ( st->u.trie.accepted == 1 ) {
2824 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
2825 PerlIO_printf( Perl_debug_log,
2826 "%*s %sonly one match : #%d <%s>%s\n",
2827 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2828 st->u.trie.accept_buff[ 0 ].wordnum,
2829 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2832 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
2833 /* in this case we free tmps/leave before we call regmatch
2834 as we wont be using accept_buff again. */
2837 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2838 /*** all unsaved local vars undefined at this point */
2841 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2842 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
2845 while ( !result && st->u.trie.accepted-- ) {
2848 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
2849 DEBUG_TRIE_EXECUTE_r(
2850 PerlIO_printf( Perl_debug_log,
2851 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2852 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2853 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2854 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
2857 if (st->u.trie.accept_buff[cur].wordnum <
2858 st->u.trie.accept_buff[best].wordnum)
2862 reg_trie_data * const trie = (reg_trie_data*)
2863 rex->data->data[ARG(scan)];
2864 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
2865 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2866 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2867 st->u.trie.accept_buff[best].wordnum,
2868 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", (void*)scan,
2871 if ( best<st->u.trie.accepted ) {
2872 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2873 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2874 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2875 best = st->u.trie.accepted;
2877 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
2880 as far as I can tell we only need the SAVETMPS/FREETMPS
2881 for re's with EVAL in them but I'm leaving them in for
2882 all until I can be sure.
2885 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2886 /*** all unsaved local vars undefined at this point */
2899 /* unreached codepoint */
2901 char *s = STRING(scan);
2902 st->ln = STR_LEN(scan);
2903 if (do_utf8 != UTF) {
2904 /* The target and the pattern have differing utf8ness. */
2906 const char *e = s + st->ln;
2909 /* The target is utf8, the pattern is not utf8. */
2914 if (NATIVE_TO_UNI(*(U8*)s) !=
2915 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2923 /* The target is not utf8, the pattern is utf8. */
2928 if (NATIVE_TO_UNI(*((U8*)l)) !=
2929 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2937 nextchr = UCHARAT(locinput);
2940 /* The target and the pattern have the same utf8ness. */
2941 /* Inline the first character, for speed. */
2942 if (UCHARAT(s) != nextchr)
2944 if (PL_regeol - locinput < st->ln)
2946 if (st->ln > 1 && memNE(s, locinput, st->ln))
2949 nextchr = UCHARAT(locinput);
2953 PL_reg_flags |= RF_tainted;
2956 char *s = STRING(scan);
2957 st->ln = STR_LEN(scan);
2959 if (do_utf8 || UTF) {
2960 /* Either target or the pattern are utf8. */
2962 char *e = PL_regeol;
2964 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2965 l, &e, 0, do_utf8)) {
2966 /* One more case for the sharp s:
2967 * pack("U0U*", 0xDF) =~ /ss/i,
2968 * the 0xC3 0x9F are the UTF-8
2969 * byte sequence for the U+00DF. */
2971 toLOWER(s[0]) == 's' &&
2973 toLOWER(s[1]) == 's' &&
2980 nextchr = UCHARAT(locinput);
2984 /* Neither the target and the pattern are utf8. */
2986 /* Inline the first character, for speed. */
2987 if (UCHARAT(s) != nextchr &&
2988 UCHARAT(s) != ((OP(scan) == EXACTF)
2989 ? PL_fold : PL_fold_locale)[nextchr])
2991 if (PL_regeol - locinput < st->ln)
2993 if (st->ln > 1 && (OP(scan) == EXACTF
2994 ? ibcmp(s, locinput, st->ln)
2995 : ibcmp_locale(s, locinput, st->ln)))
2998 nextchr = UCHARAT(locinput);
3003 STRLEN inclasslen = PL_regeol - locinput;
3005 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3007 if (locinput >= PL_regeol)
3009 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3010 nextchr = UCHARAT(locinput);
3015 nextchr = UCHARAT(locinput);
3016 if (!REGINCLASS(rex, scan, (U8*)locinput))
3018 if (!nextchr && locinput >= PL_regeol)
3020 nextchr = UCHARAT(++locinput);
3024 /* If we might have the case of the German sharp s
3025 * in a casefolding Unicode character class. */
3027 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3028 locinput += SHARP_S_SKIP;
3029 nextchr = UCHARAT(locinput);
3035 PL_reg_flags |= RF_tainted;
3041 LOAD_UTF8_CHARCLASS_ALNUM();
3042 if (!(OP(scan) == ALNUM
3043 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3044 : isALNUM_LC_utf8((U8*)locinput)))
3048 locinput += PL_utf8skip[nextchr];
3049 nextchr = UCHARAT(locinput);
3052 if (!(OP(scan) == ALNUM
3053 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3055 nextchr = UCHARAT(++locinput);
3058 PL_reg_flags |= RF_tainted;
3061 if (!nextchr && locinput >= PL_regeol)
3064 LOAD_UTF8_CHARCLASS_ALNUM();
3065 if (OP(scan) == NALNUM
3066 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3067 : isALNUM_LC_utf8((U8*)locinput))
3071 locinput += PL_utf8skip[nextchr];
3072 nextchr = UCHARAT(locinput);
3075 if (OP(scan) == NALNUM
3076 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3078 nextchr = UCHARAT(++locinput);
3082 PL_reg_flags |= RF_tainted;
3086 /* was last char in word? */
3088 if (locinput == PL_bostr)
3091 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3093 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3095 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3096 st->ln = isALNUM_uni(st->ln);
3097 LOAD_UTF8_CHARCLASS_ALNUM();
3098 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3101 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3102 n = isALNUM_LC_utf8((U8*)locinput);
3106 st->ln = (locinput != PL_bostr) ?
3107 UCHARAT(locinput - 1) : '\n';
3108 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3109 st->ln = isALNUM(st->ln);
3110 n = isALNUM(nextchr);
3113 st->ln = isALNUM_LC(st->ln);
3114 n = isALNUM_LC(nextchr);
3117 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3118 OP(scan) == BOUNDL))
3122 PL_reg_flags |= RF_tainted;
3128 if (UTF8_IS_CONTINUED(nextchr)) {
3129 LOAD_UTF8_CHARCLASS_SPACE();
3130 if (!(OP(scan) == SPACE
3131 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3132 : isSPACE_LC_utf8((U8*)locinput)))
3136 locinput += PL_utf8skip[nextchr];
3137 nextchr = UCHARAT(locinput);
3140 if (!(OP(scan) == SPACE
3141 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3143 nextchr = UCHARAT(++locinput);
3146 if (!(OP(scan) == SPACE
3147 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3149 nextchr = UCHARAT(++locinput);
3153 PL_reg_flags |= RF_tainted;
3156 if (!nextchr && locinput >= PL_regeol)
3159 LOAD_UTF8_CHARCLASS_SPACE();
3160 if (OP(scan) == NSPACE
3161 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3162 : isSPACE_LC_utf8((U8*)locinput))
3166 locinput += PL_utf8skip[nextchr];
3167 nextchr = UCHARAT(locinput);
3170 if (OP(scan) == NSPACE
3171 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3173 nextchr = UCHARAT(++locinput);
3176 PL_reg_flags |= RF_tainted;
3182 LOAD_UTF8_CHARCLASS_DIGIT();
3183 if (!(OP(scan) == DIGIT
3184 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3185 : isDIGIT_LC_utf8((U8*)locinput)))
3189 locinput += PL_utf8skip[nextchr];
3190 nextchr = UCHARAT(locinput);
3193 if (!(OP(scan) == DIGIT
3194 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3196 nextchr = UCHARAT(++locinput);
3199 PL_reg_flags |= RF_tainted;
3202 if (!nextchr && locinput >= PL_regeol)
3205 LOAD_UTF8_CHARCLASS_DIGIT();
3206 if (OP(scan) == NDIGIT
3207 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3208 : isDIGIT_LC_utf8((U8*)locinput))
3212 locinput += PL_utf8skip[nextchr];
3213 nextchr = UCHARAT(locinput);
3216 if (OP(scan) == NDIGIT
3217 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3219 nextchr = UCHARAT(++locinput);
3222 if (locinput >= PL_regeol)
3225 LOAD_UTF8_CHARCLASS_MARK();
3226 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3228 locinput += PL_utf8skip[nextchr];
3229 while (locinput < PL_regeol &&
3230 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3231 locinput += UTF8SKIP(locinput);
3232 if (locinput > PL_regeol)
3237 nextchr = UCHARAT(locinput);
3240 PL_reg_flags |= RF_tainted;
3245 n = ARG(scan); /* which paren pair */
3246 st->ln = PL_regstartp[n];
3247 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3248 if ((I32)*PL_reglastparen < n || st->ln == -1)
3249 sayNO; /* Do not match unless seen CLOSEn. */
3250 if (st->ln == PL_regendp[n])
3253 s = PL_bostr + st->ln;
3254 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3256 const char *e = PL_bostr + PL_regendp[n];
3258 * Note that we can't do the "other character" lookup trick as
3259 * in the 8-bit case (no pun intended) because in Unicode we
3260 * have to map both upper and title case to lower case.
3262 if (OP(scan) == REFF) {
3264 STRLEN ulen1, ulen2;
3265 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3266 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3270 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3271 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3272 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3279 nextchr = UCHARAT(locinput);
3283 /* Inline the first character, for speed. */
3284 if (UCHARAT(s) != nextchr &&
3286 (UCHARAT(s) != ((OP(scan) == REFF
3287 ? PL_fold : PL_fold_locale)[nextchr]))))
3289 st->ln = PL_regendp[n] - st->ln;
3290 if (locinput + st->ln > PL_regeol)
3292 if (st->ln > 1 && (OP(scan) == REF
3293 ? memNE(s, locinput, st->ln)
3295 ? ibcmp(s, locinput, st->ln)
3296 : ibcmp_locale(s, locinput, st->ln))))
3299 nextchr = UCHARAT(locinput);
3312 /* execute the code in the {...} */
3314 SV ** const before = SP;
3315 OP_4tree * const oop = PL_op;
3316 COP * const ocurcop = PL_curcop;
3320 PL_op = (OP_4tree*)rex->data->data[n];
3321 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3322 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3323 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3325 CALLRUNOPS(aTHX); /* Scalar context. */
3328 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3335 PAD_RESTORE_LOCAL(old_comppad);
3336 PL_curcop = ocurcop;
3339 sv_setsv(save_scalar(PL_replgv), ret);
3343 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3346 /* extract RE object from returned value; compiling if
3351 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3352 mg = mg_find(sv, PERL_MAGIC_qr);
3353 else if (SvSMAGICAL(ret)) {
3354 if (SvGMAGICAL(ret))
3355 sv_unmagic(ret, PERL_MAGIC_qr);
3357 mg = mg_find(ret, PERL_MAGIC_qr);
3361 re = (regexp *)mg->mg_obj;
3362 (void)ReREFCNT_inc(re);
3366 const char * const t = SvPV_const(ret, len);
3368 const I32 osize = PL_regsize;
3371 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3372 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3374 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3376 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3382 /* run the pattern returned from (??{...}) */
3385 PerlIO_printf(Perl_debug_log,
3386 "Entering embedded \"%s%.60s%s%s\"\n",
3390 (strlen(re->precomp) > 60 ? "..." : ""))
3393 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3394 REGCP_SET(st->u.eval.lastcp);
3395 *PL_reglastparen = 0;
3396 *PL_reglastcloseparen = 0;
3397 PL_reginput = locinput;
3399 /* XXXX This is too dramatic a measure... */
3403 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3404 ((re->reganch & ROPT_UTF8) != 0);
3405 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3406 st->u.eval.prev_rex = rex;
3409 /* resume to current state on success */
3410 st->u.yes.prev_yes_state = yes_state;
3412 PUSH_STATE(newst, resume_EVAL);
3415 /* now continue from first node in postoned RE */
3416 next = re->program + 1;
3420 /* /(?(?{...})X|Y)/ */
3421 st->sw = SvTRUE(ret);
3426 n = ARG(scan); /* which paren pair */
3427 PL_reg_start_tmp[n] = locinput;
3432 n = ARG(scan); /* which paren pair */
3433 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3434 PL_regendp[n] = locinput - PL_bostr;
3435 if (n > (I32)*PL_reglastparen)
3436 *PL_reglastparen = n;
3437 *PL_reglastcloseparen = n;
3440 n = ARG(scan); /* which paren pair */
3441 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3444 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3446 next = NEXTOPER(NEXTOPER(scan));
3448 next = scan + ARG(scan);
3449 if (OP(next) == IFTHEN) /* Fake one. */
3450 next = NEXTOPER(NEXTOPER(next));
3454 st->logical = scan->flags;
3456 /*******************************************************************
3457 cc points to the regmatch_state associated with the most recent CURLYX.
3458 This struct contains info about the innermost (...)* loop (an
3459 "infoblock"), and a pointer to the next outer cc.
3461 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3463 1) After matching Y, regnode for CURLYX is processed;
3465 2) This regnode populates cc, and calls regmatch() recursively
3466 with the starting point at WHILEM node;
3468 3) Each hit of WHILEM node tries to match A and Z (in the order
3469 depending on the current iteration, min/max of {min,max} and
3470 greediness). The information about where are nodes for "A"
3471 and "Z" is read from cc, as is info on how many times "A"
3472 was already matched, and greediness.
3474 4) After A matches, the same WHILEM node is hit again.
3476 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3477 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3478 resets cc, since this Y(A)*Z can be a part of some other loop:
3479 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3480 of the external loop.
3482 Currently present infoblocks form a tree with a stem formed by st->cc
3483 and whatever it mentions via ->next, and additional attached trees
3484 corresponding to temporarily unset infoblocks as in "5" above.
3486 In the following picture, infoblocks for outer loop of
3487 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3488 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3489 infoblocks are drawn below the "reset" infoblock.
3491 In fact in the picture below we do not show failed matches for Z and T
3492 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3493 more obvious *why* one needs to *temporary* unset infoblocks.]
3495 Matched REx position InfoBlocks Comment
3499 Y A)*?Z)*?T x <- O <- I
3500 YA )*?Z)*?T x <- O <- I
3501 YA A)*?Z)*?T x <- O <- I
3502 YAA )*?Z)*?T x <- O <- I
3503 YAA Z)*?T x <- O # Temporary unset I
3506 YAAZ Y(A)*?Z)*?T x <- O
3509 YAAZY (A)*?Z)*?T x <- O
3512 YAAZY A)*?Z)*?T x <- O <- I
3515 YAAZYA )*?Z)*?T x <- O <- I
3518 YAAZYA Z)*?T x <- O # Temporary unset I
3524 YAAZYAZ T x # Temporary unset O
3531 *******************************************************************/
3534 /* No need to save/restore up to this paren */
3535 I32 parenfloor = scan->flags;
3539 CURLYX and WHILEM are always paired: they're the moral
3540 equivalent of pp_enteriter anbd pp_iter.
3542 The only time next could be null is if the node tree is
3543 corrupt. This was mentioned on p5p a few days ago.
3545 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3546 So we'll assert that this is true:
3549 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3551 /* XXXX Probably it is better to teach regpush to support
3552 parenfloor > PL_regsize... */
3553 if (parenfloor > (I32)*PL_reglastparen)
3554 parenfloor = *PL_reglastparen; /* Pessimization... */
3556 st->u.curlyx.cp = PL_savestack_ix;
3557 st->u.curlyx.outercc = st->cc;
3559 /* these fields contain the state of the current curly.
3560 * they are accessed by subsequent WHILEMs;
3561 * cur and lastloc are also updated by WHILEM */
3562 st->u.curlyx.parenfloor = parenfloor;
3563 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3564 st->u.curlyx.min = ARG1(scan);
3565 st->u.curlyx.max = ARG2(scan);
3566 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3567 st->u.curlyx.lastloc = 0;
3568 /* st->next and st->minmod are also read by WHILEM */
3570 PL_reginput = locinput;
3571 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3572 /*** all unsaved local vars undefined at this point */
3573 regcpblow(st->u.curlyx.cp);
3574 st->cc = st->u.curlyx.outercc;
3580 * This is really hard to understand, because after we match
3581 * what we're trying to match, we must make sure the rest of
3582 * the REx is going to match for sure, and to do that we have
3583 * to go back UP the parse tree by recursing ever deeper. And
3584 * if it fails, we have to reset our parent's current state
3585 * that we can try again after backing off.
3590 st->cc gets initialised by CURLYX ready for use by WHILEM.
3591 So again, unless somethings been corrupted, st->cc cannot
3592 be null at that point in WHILEM.
3594 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3595 So we'll assert that this is true:
3598 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3599 st->u.whilem.cache_offset = 0;
3600 st->u.whilem.cache_bit = 0;
3602 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3603 PL_reginput = locinput;
3606 PerlIO_printf(Perl_debug_log,
3607 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3608 REPORT_CODE_OFF+PL_regindent*2, "",
3609 (long)n, (long)st->cc->u.curlyx.min,
3610 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3613 /* If degenerate scan matches "", assume scan done. */
3615 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3616 st->u.whilem.savecc = st->cc;
3617 st->cc = st->cc->u.curlyx.outercc;
3619 st->ln = st->cc->u.curlyx.cur;
3621 PerlIO_printf(Perl_debug_log,
3622 "%*s empty match detected, try continuation...\n",
3623 REPORT_CODE_OFF+PL_regindent*2, "")
3625 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3626 /*** all unsaved local vars undefined at this point */
3627 st->cc = st->u.whilem.savecc;
3630 if (st->cc->u.curlyx.outercc)
3631 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3635 /* First just match a string of min scans. */
3637 if (n < st->cc->u.curlyx.min) {
3638 st->cc->u.curlyx.cur = n;
3639 st->cc->u.curlyx.lastloc = locinput;
3640 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3641 /*** all unsaved local vars undefined at this point */
3644 st->cc->u.curlyx.cur = n - 1;
3645 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3650 /* Check whether we already were at this position.
3651 Postpone detection until we know the match is not
3652 *that* much linear. */
3653 if (!PL_reg_maxiter) {
3654 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3655 /* possible overflow for long strings and many CURLYX's */
3656 if (PL_reg_maxiter < 0)
3657 PL_reg_maxiter = I32_MAX;
3658 PL_reg_leftiter = PL_reg_maxiter;
3660 if (PL_reg_leftiter-- == 0) {
3661 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3662 if (PL_reg_poscache) {
3663 if ((I32)PL_reg_poscache_size < size) {
3664 Renew(PL_reg_poscache, size, char);
3665 PL_reg_poscache_size = size;
3667 Zero(PL_reg_poscache, size, char);
3670 PL_reg_poscache_size = size;
3671 Newxz(PL_reg_poscache, size, char);
3674 PerlIO_printf(Perl_debug_log,
3675 "%sDetected a super-linear match, switching on caching%s...\n",
3676 PL_colors[4], PL_colors[5])
3679 if (PL_reg_leftiter < 0) {
3680 st->u.whilem.cache_offset = locinput - PL_bostr;
3682 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3683 + st->u.whilem.cache_offset * (scan->flags>>4);
3684 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3685 st->u.whilem.cache_offset /= 8;
3686 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3688 PerlIO_printf(Perl_debug_log,
3689 "%*s already tried at this position...\n",
3690 REPORT_CODE_OFF+PL_regindent*2, "")
3692 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3693 /* cache records success */
3696 /* cache records failure */
3702 /* Prefer next over scan for minimal matching. */
3704 if (st->cc->minmod) {
3705 st->u.whilem.savecc = st->cc;
3706 st->cc = st->cc->u.curlyx.outercc;
3708 st->ln = st->cc->u.curlyx.cur;
3709 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3710 REGCP_SET(st->u.whilem.lastcp);
3711 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3712 /*** all unsaved local vars undefined at this point */
3713 st->cc = st->u.whilem.savecc;
3715 regcpblow(st->u.whilem.cp);
3716 CACHEsayYES; /* All done. */
3718 REGCP_UNWIND(st->u.whilem.lastcp);
3720 if (st->cc->u.curlyx.outercc)
3721 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3723 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3724 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3725 && !(PL_reg_flags & RF_warned)) {
3726 PL_reg_flags |= RF_warned;
3727 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3728 "Complex regular subexpression recursion",
3735 PerlIO_printf(Perl_debug_log,
3736 "%*s trying longer...\n",
3737 REPORT_CODE_OFF+PL_regindent*2, "")
3739 /* Try scanning more and see if it helps. */
3740 PL_reginput = locinput;
3741 st->cc->u.curlyx.cur = n;
3742 st->cc->u.curlyx.lastloc = locinput;
3743 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3744 REGCP_SET(st->u.whilem.lastcp);
3745 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3746 /*** all unsaved local vars undefined at this point */
3748 regcpblow(st->u.whilem.cp);
3751 REGCP_UNWIND(st->u.whilem.lastcp);
3753 st->cc->u.curlyx.cur = n - 1;
3754 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3758 /* Prefer scan over next for maximal matching. */
3760 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3761 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3762 st->cc->u.curlyx.cur = n;
3763 st->cc->u.curlyx.lastloc = locinput;
3764 REGCP_SET(st->u.whilem.lastcp);
3765 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3766 /*** all unsaved local vars undefined at this point */
3768 regcpblow(st->u.whilem.cp);
3771 REGCP_UNWIND(st->u.whilem.lastcp);
3772 regcppop(rex); /* Restore some previous $<digit>s? */
3773 PL_reginput = locinput;
3775 PerlIO_printf(Perl_debug_log,
3776 "%*s failed, try continuation...\n",
3777 REPORT_CODE_OFF+PL_regindent*2, "")
3780 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3781 && !(PL_reg_flags & RF_warned)) {
3782 PL_reg_flags |= RF_warned;
3783 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3784 "Complex regular subexpression recursion",
3788 /* Failed deeper matches of scan, so see if this one works. */
3789 st->u.whilem.savecc = st->cc;
3790 st->cc = st->cc->u.curlyx.outercc;
3792 st->ln = st->cc->u.curlyx.cur;
3793 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3794 /*** all unsaved local vars undefined at this point */
3795 st->cc = st->u.whilem.savecc;
3798 if (st->cc->u.curlyx.outercc)
3799 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3800 st->cc->u.curlyx.cur = n - 1;
3801 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3806 next = scan + ARG(scan);
3809 inner = NEXTOPER(NEXTOPER(scan));
3812 inner = NEXTOPER(scan);
3817 if (!next || OP(next) != type) /* No choice. */
3818 next = inner; /* Avoid recursion. */
3820 const I32 lastparen = *PL_reglastparen;
3821 /* Put unwinding data on stack */
3822 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3823 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3825 uw->prev = st->unwind;
3826 st->unwind = unwind1;
3827 uw->type = ((type == BRANCH)
3829 : RE_UNWIND_BRANCHJ);
3830 uw->lastparen = lastparen;
3832 uw->locinput = locinput;
3833 uw->nextchr = nextchr;
3834 uw->minmod = st->minmod;
3836 uw->regindent = ++PL_regindent;
3839 REGCP_SET(uw->lastcp);
3841 /* Now go into the first branch */
3851 st->u.curlym.l = st->u.curlym.matches = 0;
3853 /* We suppose that the next guy does not need
3854 backtracking: in particular, it is of constant non-zero length,
3855 and has no parenths to influence future backrefs. */
3856 st->ln = ARG1(scan); /* min to match */
3857 n = ARG2(scan); /* max to match */
3858 st->u.curlym.paren = scan->flags;
3859 if (st->u.curlym.paren) {
3860 if (st->u.curlym.paren > PL_regsize)
3861 PL_regsize = st->u.curlym.paren;
3862 if (st->u.curlym.paren > (I32)*PL_reglastparen)
3863 *PL_reglastparen = st->u.curlym.paren;
3865 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3866 if (st->u.curlym.paren)
3867 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3868 PL_reginput = locinput;
3869 st->u.curlym.maxwanted = st->minmod ? st->ln : n;
3870 while (PL_reginput < PL_regeol && st->u.curlym.matches < st->u.curlym.maxwanted) {
3871 /* resume to current state on success */
3872 st->u.yes.prev_yes_state = yes_state;
3874 REGMATCH(scan, CURLYM1);
3875 yes_state = st->u.yes.prev_yes_state;
3876 /*** all unsaved local vars undefined at this point */
3879 /* on first match, determine length, u.curlym.l */
3880 if (!st->u.curlym.matches++) {
3881 if (PL_reg_match_utf8) {
3883 while (s < PL_reginput) {
3889 st->u.curlym.l = PL_reginput - locinput;
3891 if (st->u.curlym.l == 0) {
3892 st->u.curlym.matches = st->u.curlym.maxwanted;
3896 locinput = PL_reginput;
3899 PL_reginput = locinput;
3900 if (st->u.curlym.matches < st->ln) {
3906 PerlIO_printf(Perl_debug_log,
3907 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3908 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3909 (IV) st->u.curlym.matches, (IV)st->u.curlym.l)
3912 /* calculate c1 and c1 for possible match of 1st char
3913 * following curly */
3914 st->u.curlym.c1 = st->u.curlym.c2 = CHRTEST_VOID;
3915 if (HAS_TEXT(next) || JUMPABLE(next)) {
3916 regnode *text_node = next;
3917 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3918 if (HAS_TEXT(text_node)
3919 && PL_regkind[(U8)OP(text_node)] != REF)
3921 st->u.curlym.c1 = (U8)*STRING(text_node);
3923 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3924 ? PL_fold[st->u.curlym.c1]
3925 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3926 ? PL_fold_locale[st->u.curlym.c1]
3931 REGCP_SET(st->u.curlym.lastcp);
3933 st->u.curlym.minmod = st->minmod;
3935 while (st->u.curlym.matches >= st->ln
3936 && (st->u.curlym.matches <= n
3937 /* for REG_INFTY, ln could overflow to negative */
3938 || (n == REG_INFTY && st->u.curlym.matches >= 0)))
3940 /* If it could work, try it. */
3941 if (st->u.curlym.c1 == CHRTEST_VOID ||
3942 UCHARAT(PL_reginput) == st->u.curlym.c1 ||
3943 UCHARAT(PL_reginput) == st->u.curlym.c2)
3946 PerlIO_printf(Perl_debug_log,
3947 "%*s trying tail with matches=%"IVdf"...\n",
3948 (int)(REPORT_CODE_OFF+PL_regindent*2),
3949 "", (IV)st->u.curlym.matches)
3951 if (st->u.curlym.paren) {
3952 if (st->u.curlym.matches) {
3953 PL_regstartp[st->u.curlym.paren]
3954 = HOPc(PL_reginput, -st->u.curlym.l) - PL_bostr;
3955 PL_regendp[st->u.curlym.paren] = PL_reginput - PL_bostr;
3958 PL_regendp[st->u.curlym.paren] = -1;
3960 /* resume to current state on success */
3961 st->u.yes.prev_yes_state = yes_state;
3963 REGMATCH(next, CURLYM2);
3964 yes_state = st->u.yes.prev_yes_state;
3965 /*** all unsaved local vars undefined at this point */
3967 /* XXX tmp sayYES; */
3969 REGCP_UNWIND(st->u.curlym.lastcp);
3971 /* Couldn't or didn't -- move forward/backward. */
3972 if (st->u.curlym.minmod) {
3973 PL_reginput = locinput;
3974 /* resume to current state on success */
3975 st->u.yes.prev_yes_state = yes_state;
3977 REGMATCH(scan, CURLYM3);
3978 yes_state = st->u.yes.prev_yes_state;
3979 /*** all unsaved local vars undefined at this point */
3981 st->u.curlym.matches++;
3982 locinput = PL_reginput;
3988 st->u.curlym.matches--;
3989 locinput = HOPc(locinput, -st->u.curlym.l);
3990 PL_reginput = locinput;
3998 st->u.plus.paren = scan->flags; /* Which paren to set */
3999 if (st->u.plus.paren > PL_regsize)
4000 PL_regsize = st->u.plus.paren;
4001 if (st->u.plus.paren > (I32)*PL_reglastparen)
4002 *PL_reglastparen = st->u.plus.paren;
4003 st->ln = ARG1(scan); /* min to match */
4004 n = ARG2(scan); /* max to match */
4005 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4008 st->u.plus.paren = 0;
4009 st->ln = ARG1(scan); /* min to match */
4010 n = ARG2(scan); /* max to match */
4011 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4016 scan = NEXTOPER(scan);
4017 st->u.plus.paren = 0;
4022 scan = NEXTOPER(scan);
4023 st->u.plus.paren = 0;
4026 * Lookahead to avoid useless match attempts
4027 * when we know what character comes next.
4031 * Used to only do .*x and .*?x, but now it allows
4032 * for )'s, ('s and (?{ ... })'s to be in the way
4033 * of the quantifier and the EXACT-like node. -- japhy
4036 if (HAS_TEXT(next) || JUMPABLE(next)) {
4038 regnode *text_node = next;
4040 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
4042 if (! HAS_TEXT(text_node))
4043 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
4045 if (PL_regkind[(U8)OP(text_node)] == REF) {
4046 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
4047 goto assume_ok_easy;
4049 else { s = (U8*)STRING(text_node); }
4052 st->u.plus.c2 = st->u.plus.c1 = *s;
4053 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4054 st->u.plus.c2 = PL_fold[st->u.plus.c1];
4055 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4056 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
4059 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4060 STRLEN ulen1, ulen2;
4061 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4062 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4064 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4065 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4067 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4069 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4073 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4080 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
4082 PL_reginput = locinput;
4085 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4087 locinput = PL_reginput;
4088 REGCP_SET(st->u.plus.lastcp);
4089 if (st->u.plus.c1 != CHRTEST_VOID) {
4090 st->u.plus.old = locinput;
4091 st->u.plus.count = 0;
4093 if (n == REG_INFTY) {
4094 st->u.plus.e = PL_regeol - 1;
4096 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4101 for (st->u.plus.e = locinput;
4102 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4103 st->u.plus.e += UTF8SKIP(st->u.plus.e);
4106 st->u.plus.e = locinput + n - st->ln;
4107 if (st->u.plus.e >= PL_regeol)
4108 st->u.plus.e = PL_regeol - 1;
4111 /* Find place 'next' could work */
4113 if (st->u.plus.c1 == st->u.plus.c2) {
4114 while (locinput <= st->u.plus.e &&
4115 UCHARAT(locinput) != st->u.plus.c1)
4118 while (locinput <= st->u.plus.e
4119 && UCHARAT(locinput) != st->u.plus.c1
4120 && UCHARAT(locinput) != st->u.plus.c2)
4123 st->u.plus.count = locinput - st->u.plus.old;
4126 if (st->u.plus.c1 == st->u.plus.c2) {
4128 /* count initialised to
4129 * utf8_distance(old, locinput) */
4130 while (locinput <= st->u.plus.e &&
4131 utf8n_to_uvchr((U8*)locinput,
4132 UTF8_MAXBYTES, &len,
4133 uniflags) != (UV)st->u.plus.c1) {
4138 /* count initialised to
4139 * utf8_distance(old, locinput) */
4140 while (locinput <= st->u.plus.e) {
4142 const UV c = utf8n_to_uvchr((U8*)locinput,
4143 UTF8_MAXBYTES, &len,
4145 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
4152 if (locinput > st->u.plus.e)
4154 /* PL_reginput == old now */
4155 if (locinput != st->u.plus.old) {
4156 st->ln = 1; /* Did some */
4157 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
4160 /* PL_reginput == locinput now */
4161 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
4162 /*** all unsaved local vars undefined at this point */
4163 PL_reginput = locinput; /* Could be reset... */
4164 REGCP_UNWIND(st->u.plus.lastcp);
4165 /* Couldn't or didn't -- move forward. */
4166 st->u.plus.old = locinput;
4168 locinput += UTF8SKIP(locinput);
4171 st->u.plus.count = 1;
4175 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
4177 if (st->u.plus.c1 != CHRTEST_VOID) {
4179 c = utf8n_to_uvchr((U8*)PL_reginput,
4183 c = UCHARAT(PL_reginput);
4184 /* If it could work, try it. */
4185 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
4187 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
4188 /*** all unsaved local vars undefined at this point */
4189 REGCP_UNWIND(st->u.plus.lastcp);
4192 /* If it could work, try it. */
4193 else if (st->u.plus.c1 == CHRTEST_VOID)
4195 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
4196 /*** all unsaved local vars undefined at this point */
4197 REGCP_UNWIND(st->u.plus.lastcp);
4199 /* Couldn't or didn't -- move forward. */
4200 PL_reginput = locinput;
4201 if (regrepeat(rex, scan, 1)) {
4203 locinput = PL_reginput;
4210 n = regrepeat(rex, scan, n);
4211 locinput = PL_reginput;
4212 if (st->ln < n && PL_regkind[(U8)OP(next)] == EOL &&
4213 (OP(next) != MEOL ||
4214 OP(next) == SEOL || OP(next) == EOS))
4216 st->ln = n; /* why back off? */
4217 /* ...because $ and \Z can match before *and* after
4218 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4219 We should back off by one in this case. */
4220 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4223 REGCP_SET(st->u.plus.lastcp);
4226 while (n >= st->ln) {
4227 if (st->u.plus.c1 != CHRTEST_VOID) {
4229 c = utf8n_to_uvchr((U8*)PL_reginput,
4233 c = UCHARAT(PL_reginput);
4235 /* If it could work, try it. */
4236 if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
4238 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
4239 /*** all unsaved local vars undefined at this point */
4240 REGCP_UNWIND(st->u.plus.lastcp);
4242 /* Couldn't or didn't -- back up. */
4244 PL_reginput = locinput = HOPc(locinput, -1);
4251 if (locinput < reginfo->till) {
4252 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4253 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4255 (long)(locinput - PL_reg_starttry),
4256 (long)(reginfo->till - PL_reg_starttry),
4258 sayNO_FINAL; /* Cannot match: too short. */
4260 PL_reginput = locinput; /* put where regtry can find it */
4261 sayYES_FINAL; /* Success! */
4263 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4265 PerlIO_printf(Perl_debug_log,
4266 "%*s %ssubpattern success...%s\n",
4267 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
4268 PL_reginput = locinput; /* put where regtry can find it */
4269 sayYES_FINAL; /* Success! */
4271 case SUSPEND: /* (?>FOO) */
4272 st->u.ifmatch.wanted = 1;
4273 PL_reginput = locinput;
4276 case UNLESSM: /* -ve lookaround: (?!FOO), or with flags, (?<!foo) */
4277 st->u.ifmatch.wanted = 0;
4278 goto ifmatch_trivial_fail_test;
4280 case IFMATCH: /* +ve lookaround: (?=FOO), or with flags, (?<=foo) */
4281 st->u.ifmatch.wanted = 1;
4282 ifmatch_trivial_fail_test:
4284 char * const s = HOPBACKc(locinput, scan->flags);
4289 st->sw = 1 - st->u.ifmatch.wanted;
4291 else if (st->u.ifmatch.wanted)
4293 next = scan + ARG(scan);
4301 PL_reginput = locinput;
4304 /* resume to current state on success */
4305 st->u.yes.prev_yes_state = yes_state;
4307 PUSH_STATE(newst, resume_IFMATCH);
4309 next = NEXTOPER(NEXTOPER(scan));
4313 next = scan + ARG(scan);
4318 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4319 PTR2UV(scan), OP(scan));
4320 Perl_croak(aTHX_ "regexp memory corruption");
4328 /* simulate recursively calling regmatch(), but without actually
4329 * recursing - ie save the current state on the heap rather than on
4330 * the stack, then re-enter the loop. This avoids complex regexes
4331 * blowing the processor stack */
4335 /* push new state */
4336 regmatch_state *oldst = st;
4340 /* grab the next free state slot */
4342 if (st > SLAB_LAST(PL_regmatch_slab))
4343 st = S_push_slab(aTHX);
4344 PL_regmatch_state = st;
4348 oldst->locinput = locinput;
4351 locinput = PL_reginput;
4352 nextchr = UCHARAT(locinput);
4366 * We get here only if there's trouble -- normally "case END" is
4367 * the terminating point.
4369 Perl_croak(aTHX_ "corrupted regexp pointers");
4376 /* we have successfully completed a subexpression, but we must now
4377 * pop to the state marked by yes_state and continue from there */
4379 /*XXX tmp for CURLYM*/
4380 regmatch_slab * const oslab = PL_regmatch_slab;
4381 regmatch_state * const ost = st;
4382 regmatch_state * const oys = yes_state;
4385 assert(st != yes_state);
4386 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4387 || yes_state > SLAB_LAST(PL_regmatch_slab))
4389 /* not in this slab, pop slab */
4390 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4391 PL_regmatch_slab = PL_regmatch_slab->prev;
4392 st = SLAB_LAST(PL_regmatch_slab);
4394 depth -= (st - yes_state);
4395 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE TO (%d)\n", depth));
4397 yes_state = st->u.yes.prev_yes_state;
4398 PL_regmatch_state = st;
4400 switch (st->resume_state) {
4402 if (st->u.eval.toggleutf)
4403 PL_reg_flags ^= RF_utf8;
4405 rex = st->u.eval.prev_rex;
4406 /* XXXX This is too dramatic a measure... */
4408 /* Restore parens of the caller without popping the
4411 const I32 tmp = PL_savestack_ix;
4412 PL_savestack_ix = st->u.eval.lastcp;
4414 PL_savestack_ix = tmp;
4416 PL_reginput = locinput;
4417 /* continue at the node following the (??{...}) */
4421 case resume_IFMATCH:
4424 st->sw = st->u.ifmatch.wanted;
4426 else if (!st->u.ifmatch.wanted)
4429 if (OP(st->scan) == SUSPEND)
4430 locinput = PL_reginput;
4432 locinput = PL_reginput = st->locinput;
4433 nextchr = UCHARAT(locinput);
4435 next = st->scan + ARG(st->scan);
4436 if (next == st->scan)
4440 /* XXX tmp don't handle yes_state yet */
4441 case resume_CURLYM1:
4442 case resume_CURLYM2:
4443 case resume_CURLYM3:
4444 PL_regmatch_slab =oslab;
4446 PL_regmatch_state = st;
4449 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "XXX revering a CURLYM\n"));
4453 Perl_croak(aTHX_ "unexpected yes reume state");
4457 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4458 PL_colors[4], PL_colors[5]));
4465 /* XXX this is duplicate(ish) code to that in the do_no section.
4466 * eventually a yes should just pop the stack back to the current
4469 /* restore previous state and re-enter */
4472 switch (st->resume_state) {
4474 goto resume_point_TRIE1;
4476 goto resume_point_TRIE2;
4478 goto resume_point_CURLYX;
4479 case resume_WHILEM1:
4480 goto resume_point_WHILEM1;
4481 case resume_WHILEM2:
4482 goto resume_point_WHILEM2;
4483 case resume_WHILEM3:
4484 goto resume_point_WHILEM3;
4485 case resume_WHILEM4:
4486 goto resume_point_WHILEM4;
4487 case resume_WHILEM5:
4488 goto resume_point_WHILEM5;
4489 case resume_WHILEM6:
4490 goto resume_point_WHILEM6;
4491 case resume_CURLYM1:
4492 goto resume_point_CURLYM1;
4493 case resume_CURLYM2:
4494 goto resume_point_CURLYM2;
4495 case resume_CURLYM3:
4496 goto resume_point_CURLYM3;
4498 goto resume_point_PLUS1;
4500 goto resume_point_PLUS2;
4502 goto resume_point_PLUS3;
4504 goto resume_point_PLUS4;
4506 case resume_IFMATCH:
4509 Perl_croak(aTHX_ "regexp resume memory corruption");
4516 PerlIO_printf(Perl_debug_log,
4517 "%*s %sfailed...%s\n",
4518 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
4524 re_unwind_t * const uw = SSPTRt(st->unwind,re_unwind_t);
4527 case RE_UNWIND_BRANCH:
4528 case RE_UNWIND_BRANCHJ:
4530 re_unwind_branch_t * const uwb = &(uw->branch);
4531 const I32 lastparen = uwb->lastparen;
4533 REGCP_UNWIND(uwb->lastcp);
4534 for (n = *PL_reglastparen; n > lastparen; n--)
4536 *PL_reglastparen = n;
4537 scan = next = uwb->next;
4538 st->minmod = uwb->minmod;
4540 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
4541 ? BRANCH : BRANCHJ) ) { /* Failure */
4542 st->unwind = uwb->prev;
4548 /* Have more choice yet. Reuse the same uwb. */
4549 if ((n = (uwb->type == RE_UNWIND_BRANCH
4550 ? NEXT_OFF(next) : ARG(next))))
4553 next = NULL; /* XXXX Needn't unwinding in this case... */
4555 next = NEXTOPER(scan);
4556 if (uwb->type == RE_UNWIND_BRANCHJ)
4557 next = NEXTOPER(next);
4558 locinput = uwb->locinput;
4559 nextchr = uwb->nextchr;
4561 PL_regindent = uwb->regindent;
4568 Perl_croak(aTHX_ "regexp unwind memory corruption");
4579 /* there's a previous state to backtrack to */
4581 switch (st->resume_state) {
4583 goto resume_point_TRIE1;
4585 goto resume_point_TRIE2;
4587 /* we have failed an (??{...}). Restore state to the outer re
4588 * then re-throw the failure */
4589 if (st->u.eval.toggleutf)
4590 PL_reg_flags ^= RF_utf8;
4592 rex = st->u.eval.prev_rex;
4593 yes_state = st->u.yes.prev_yes_state;
4595 /* XXXX This is too dramatic a measure... */
4598 PL_reginput = locinput;
4599 REGCP_UNWIND(st->u.eval.lastcp);
4604 goto resume_point_CURLYX;
4605 case resume_WHILEM1:
4606 goto resume_point_WHILEM1;
4607 case resume_WHILEM2:
4608 goto resume_point_WHILEM2;
4609 case resume_WHILEM3:
4610 goto resume_point_WHILEM3;
4611 case resume_WHILEM4:
4612 goto resume_point_WHILEM4;
4613 case resume_WHILEM5:
4614 goto resume_point_WHILEM5;
4615 case resume_WHILEM6:
4616 goto resume_point_WHILEM6;
4617 case resume_CURLYM1:
4618 goto resume_point_CURLYM1;
4619 case resume_CURLYM2:
4620 goto resume_point_CURLYM2;
4621 case resume_CURLYM3:
4622 goto resume_point_CURLYM3;
4623 case resume_IFMATCH:
4624 yes_state = st->u.yes.prev_yes_state;
4627 st->sw = !st->u.ifmatch.wanted;
4629 else if (st->u.ifmatch.wanted)
4632 assert(OP(scan) != SUSPEND); /* XXX DAPM tmp */
4633 locinput = PL_reginput = st->locinput;
4634 nextchr = UCHARAT(locinput);
4635 next = scan + ARG(scan);
4641 goto resume_point_PLUS1;
4643 goto resume_point_PLUS2;
4645 goto resume_point_PLUS3;
4647 goto resume_point_PLUS4;
4649 Perl_croak(aTHX_ "regexp resume memory corruption");
4655 /* restore original high-water mark */
4656 PL_regmatch_slab = orig_slab;
4657 PL_regmatch_state = orig_state;
4659 /* free all slabs above current one */
4660 if (orig_slab->next) {
4661 regmatch_slab *sl = orig_slab->next;
4662 orig_slab->next = NULL;
4664 regmatch_slab * const osl = sl;
4675 - regrepeat - repeatedly match something simple, report how many
4678 * [This routine now assumes that it will only match on things of length 1.
4679 * That was true before, but now we assume scan - reginput is the count,
4680 * rather than incrementing count on every character. [Er, except utf8.]]
4683 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4686 register char *scan;
4688 register char *loceol = PL_regeol;
4689 register I32 hardcount = 0;
4690 register bool do_utf8 = PL_reg_match_utf8;
4693 if (max == REG_INFTY)
4695 else if (max < loceol - scan)
4696 loceol = scan + max;
4701 while (scan < loceol && hardcount < max && *scan != '\n') {
4702 scan += UTF8SKIP(scan);
4706 while (scan < loceol && *scan != '\n')
4713 while (scan < loceol && hardcount < max) {
4714 scan += UTF8SKIP(scan);
4724 case EXACT: /* length of string is 1 */
4726 while (scan < loceol && UCHARAT(scan) == c)
4729 case EXACTF: /* length of string is 1 */
4731 while (scan < loceol &&
4732 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4735 case EXACTFL: /* length of string is 1 */
4736 PL_reg_flags |= RF_tainted;
4738 while (scan < loceol &&
4739 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4745 while (hardcount < max && scan < loceol &&
4746 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4747 scan += UTF8SKIP(scan);
4751 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4758 LOAD_UTF8_CHARCLASS_ALNUM();
4759 while (hardcount < max && scan < loceol &&
4760 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4761 scan += UTF8SKIP(scan);
4765 while (scan < loceol && isALNUM(*scan))
4770 PL_reg_flags |= RF_tainted;
4773 while (hardcount < max && scan < loceol &&
4774 isALNUM_LC_utf8((U8*)scan)) {
4775 scan += UTF8SKIP(scan);
4779 while (scan < loceol && isALNUM_LC(*scan))
4786 LOAD_UTF8_CHARCLASS_ALNUM();
4787 while (hardcount < max && scan < loceol &&
4788 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4789 scan += UTF8SKIP(scan);
4793 while (scan < loceol && !isALNUM(*scan))
4798 PL_reg_flags |= RF_tainted;
4801 while (hardcount < max && scan < loceol &&
4802 !isALNUM_LC_utf8((U8*)scan)) {
4803 scan += UTF8SKIP(scan);
4807 while (scan < loceol && !isALNUM_LC(*scan))
4814 LOAD_UTF8_CHARCLASS_SPACE();
4815 while (hardcount < max && scan < loceol &&
4817 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4818 scan += UTF8SKIP(scan);
4822 while (scan < loceol && isSPACE(*scan))
4827 PL_reg_flags |= RF_tainted;
4830 while (hardcount < max && scan < loceol &&
4831 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4832 scan += UTF8SKIP(scan);
4836 while (scan < loceol && isSPACE_LC(*scan))
4843 LOAD_UTF8_CHARCLASS_SPACE();
4844 while (hardcount < max && scan < loceol &&
4846 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4847 scan += UTF8SKIP(scan);
4851 while (scan < loceol && !isSPACE(*scan))
4856 PL_reg_flags |= RF_tainted;
4859 while (hardcount < max && scan < loceol &&
4860 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4861 scan += UTF8SKIP(scan);
4865 while (scan < loceol && !isSPACE_LC(*scan))
4872 LOAD_UTF8_CHARCLASS_DIGIT();
4873 while (hardcount < max && scan < loceol &&
4874 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4875 scan += UTF8SKIP(scan);
4879 while (scan < loceol && isDIGIT(*scan))
4886 LOAD_UTF8_CHARCLASS_DIGIT();
4887 while (hardcount < max && scan < loceol &&
4888 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4889 scan += UTF8SKIP(scan);
4893 while (scan < loceol && !isDIGIT(*scan))
4897 default: /* Called on something of 0 width. */
4898 break; /* So match right here or not at all. */
4904 c = scan - PL_reginput;
4908 SV *re_debug_flags = NULL;
4909 SV * const prop = sv_newmortal();
4912 regprop(prog, prop, p);
4913 PerlIO_printf(Perl_debug_log,
4914 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4915 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4923 #ifndef PERL_IN_XSUB_RE
4925 - regclass_swash - prepare the utf8 swash
4929 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4935 const struct reg_data *data = prog ? prog->data : NULL;
4937 if (data && data->count) {
4938 const U32 n = ARG(node);
4940 if (data->what[n] == 's') {
4941 SV * const rv = (SV*)data->data[n];
4942 AV * const av = (AV*)SvRV((SV*)rv);
4943 SV **const ary = AvARRAY(av);
4946 /* See the end of regcomp.c:S_regclass() for
4947 * documentation of these array elements. */
4950 a = SvROK(ary[1]) ? &ary[1] : 0;
4951 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4955 else if (si && doinit) {
4956 sw = swash_init("utf8", "", si, 1, 0);
4957 (void)av_store(av, 1, sw);
4974 - reginclass - determine if a character falls into a character class
4976 The n is the ANYOF regnode, the p is the target string, lenp
4977 is pointer to the maximum length of how far to go in the p
4978 (if the lenp is zero, UTF8SKIP(p) is used),
4979 do_utf8 tells whether the target string is in UTF-8.
4984 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4987 const char flags = ANYOF_FLAGS(n);
4993 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4994 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4995 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4996 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
4997 if (len == (STRLEN)-1)
4998 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5001 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5002 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5005 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5006 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5009 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5013 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5016 if (swash_fetch(sw, p, do_utf8))
5018 else if (flags & ANYOF_FOLD) {
5019 if (!match && lenp && av) {
5021 for (i = 0; i <= av_len(av); i++) {
5022 SV* const sv = *av_fetch(av, i, FALSE);
5024 const char * const s = SvPV_const(sv, len);
5026 if (len <= plen && memEQ(s, (char*)p, len)) {
5034 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5037 to_utf8_fold(p, tmpbuf, &tmplen);
5038 if (swash_fetch(sw, tmpbuf, do_utf8))
5044 if (match && lenp && *lenp == 0)
5045 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5047 if (!match && c < 256) {
5048 if (ANYOF_BITMAP_TEST(n, c))
5050 else if (flags & ANYOF_FOLD) {
5053 if (flags & ANYOF_LOCALE) {
5054 PL_reg_flags |= RF_tainted;
5055 f = PL_fold_locale[c];
5059 if (f != c && ANYOF_BITMAP_TEST(n, f))
5063 if (!match && (flags & ANYOF_CLASS)) {
5064 PL_reg_flags |= RF_tainted;
5066 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5067 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5068 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5069 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5070 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5071 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5072 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5073 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5074 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5075 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5076 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5077 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5078 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5079 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5080 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5081 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5082 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5083 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5084 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5085 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5086 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5087 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5088 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5089 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5090 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5091 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5092 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5093 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5094 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5095 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5096 ) /* How's that for a conditional? */
5103 return (flags & ANYOF_INVERT) ? !match : match;
5107 S_reghop3(U8 *s, I32 off, const U8* lim)
5111 while (off-- && s < lim) {
5112 /* XXX could check well-formedness here */
5120 if (UTF8_IS_CONTINUED(*s)) {
5121 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5124 /* XXX could check well-formedness here */
5132 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5136 while (off-- && s < lim) {
5137 /* XXX could check well-formedness here */
5147 if (UTF8_IS_CONTINUED(*s)) {
5148 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5151 /* XXX could check well-formedness here */
5163 restore_pos(pTHX_ void *arg)
5166 regexp * const rex = (regexp *)arg;
5167 if (PL_reg_eval_set) {
5168 if (PL_reg_oldsaved) {
5169 rex->subbeg = PL_reg_oldsaved;
5170 rex->sublen = PL_reg_oldsavedlen;
5171 #ifdef PERL_OLD_COPY_ON_WRITE
5172 rex->saved_copy = PL_nrs;
5174 RX_MATCH_COPIED_on(rex);
5176 PL_reg_magic->mg_len = PL_reg_oldpos;
5177 PL_reg_eval_set = 0;
5178 PL_curpm = PL_reg_oldcurpm;
5183 S_to_utf8_substr(pTHX_ register regexp *prog)
5185 if (prog->float_substr && !prog->float_utf8) {
5186 SV* const sv = newSVsv(prog->float_substr);
5187 prog->float_utf8 = sv;
5188 sv_utf8_upgrade(sv);
5189 if (SvTAIL(prog->float_substr))
5191 if (prog->float_substr == prog->check_substr)
5192 prog->check_utf8 = sv;
5194 if (prog->anchored_substr && !prog->anchored_utf8) {
5195 SV* const sv = newSVsv(prog->anchored_substr);
5196 prog->anchored_utf8 = sv;
5197 sv_utf8_upgrade(sv);
5198 if (SvTAIL(prog->anchored_substr))
5200 if (prog->anchored_substr == prog->check_substr)
5201 prog->check_utf8 = sv;
5206 S_to_byte_substr(pTHX_ register regexp *prog)
5209 if (prog->float_utf8 && !prog->float_substr) {
5210 SV* sv = newSVsv(prog->float_utf8);
5211 prog->float_substr = sv;
5212 if (sv_utf8_downgrade(sv, TRUE)) {
5213 if (SvTAIL(prog->float_utf8))
5217 prog->float_substr = sv = &PL_sv_undef;
5219 if (prog->float_utf8 == prog->check_utf8)
5220 prog->check_substr = sv;
5222 if (prog->anchored_utf8 && !prog->anchored_substr) {
5223 SV* sv = newSVsv(prog->anchored_utf8);
5224 prog->anchored_substr = sv;
5225 if (sv_utf8_downgrade(sv, TRUE)) {
5226 if (SvTAIL(prog->anchored_utf8))
5230 prog->anchored_substr = sv = &PL_sv_undef;
5232 if (prog->anchored_utf8 == prog->check_utf8)
5233 prog->check_substr = sv;
5239 * c-indentation-style: bsd
5241 * indent-tabs-mode: t
5244 * ex: set ts=8 sts=4 sw=4 noet: