5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
80 #define RF_evaled 4 /* Did an EVAL with setting? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
153 static void restore_pos(pTHX_ void *arg);
156 S_regcppush(pTHX_ I32 parenfloor)
159 const int retval = PL_savestack_ix;
160 #define REGCP_PAREN_ELEMS 4
161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
163 GET_RE_DEBUG_FLAGS_DECL;
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168 #define REGCP_OTHER_ELEMS 6
169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
170 for (p = PL_regsize; p > parenfloor; p--) {
171 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
174 SSPUSHPTR(PL_reg_start_tmp[p]);
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
183 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
186 SSPUSHINT(*PL_reglastcloseparen);
187 SSPUSHPTR(PL_reginput);
188 #define REGCP_FRAME_ELEMS 2
189 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
197 /* These are needed since we do not localize EVAL nodes: */
198 #define REGCP_SET(cp) \
200 if (cp != PL_savestack_ix) \
201 PerlIO_printf(Perl_debug_log, \
202 " Setting an EVAL scope, savestack=%"IVdf"\n", \
203 (IV)PL_savestack_ix)); \
206 #define REGCP_UNWIND(cp) \
208 if (cp != PL_savestack_ix) \
209 PerlIO_printf(Perl_debug_log, \
210 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
211 (IV)(cp), (IV)PL_savestack_ix)); \
215 S_regcppop(pTHX_ const regexp *rex)
221 GET_RE_DEBUG_FLAGS_DECL;
223 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
225 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
226 i = SSPOPINT; /* Parentheses elements to pop. */
227 input = (char *) SSPOPPTR;
228 *PL_reglastcloseparen = SSPOPINT;
229 *PL_reglastparen = SSPOPINT;
230 PL_regsize = SSPOPINT;
232 /* Now restore the parentheses context. */
233 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
234 i > 0; i -= REGCP_PAREN_ELEMS) {
236 U32 paren = (U32)SSPOPINT;
237 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
238 PL_regstartp[paren] = SSPOPINT;
240 if (paren <= *PL_reglastparen)
241 PL_regendp[paren] = tmps;
243 PerlIO_printf(Perl_debug_log,
244 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
245 (UV)paren, (IV)PL_regstartp[paren],
246 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
247 (IV)PL_regendp[paren],
248 (paren > *PL_reglastparen ? "(no)" : ""));
252 if (*PL_reglastparen + 1 <= rex->nparens) {
253 PerlIO_printf(Perl_debug_log,
254 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
255 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
259 /* It would seem that the similar code in regtry()
260 * already takes care of this, and in fact it is in
261 * a better location to since this code can #if 0-ed out
262 * but the code in regtry() is needed or otherwise tests
263 * requiring null fields (pat.t#187 and split.t#{13,14}
264 * (as of patchlevel 7877) will fail. Then again,
265 * this code seems to be necessary or otherwise
266 * building DynaLoader will fail:
267 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
269 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
271 PL_regstartp[i] = -1;
278 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
281 * pregexec and friends
284 #ifndef PERL_IN_XSUB_RE
286 - pregexec - match a regexp against a string
289 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
290 char *strbeg, I32 minend, SV *screamer, U32 nosave)
291 /* strend: pointer to null at end of string */
292 /* strbeg: real beginning of string */
293 /* minend: end of match must be >=minend after stringarg. */
294 /* nosave: For optimizations. */
297 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
298 nosave ? 0 : REXEC_COPY_STR);
303 * Need to implement the following flags for reg_anch:
305 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
307 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
308 * INTUIT_AUTORITATIVE_ML
309 * INTUIT_ONCE_NOML - Intuit can match in one location only.
312 * Another flag for this function: SECOND_TIME (so that float substrs
313 * with giant delta may be not rechecked).
316 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
318 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
319 Otherwise, only SvCUR(sv) is used to get strbeg. */
321 /* XXXX We assume that strpos is strbeg unless sv. */
323 /* XXXX Some places assume that there is a fixed substring.
324 An update may be needed if optimizer marks as "INTUITable"
325 RExen without fixed substrings. Similarly, it is assumed that
326 lengths of all the strings are no more than minlen, thus they
327 cannot come from lookahead.
328 (Or minlen should take into account lookahead.) */
330 /* A failure to find a constant substring means that there is no need to make
331 an expensive call to REx engine, thus we celebrate a failure. Similarly,
332 finding a substring too deep into the string means that less calls to
333 regtry() should be needed.
335 REx compiler's optimizer found 4 possible hints:
336 a) Anchored substring;
338 c) Whether we are anchored (beginning-of-line or \G);
339 d) First node (of those at offset 0) which may distingush positions;
340 We use a)b)d) and multiline-part of c), and try to find a position in the
341 string which does not contradict any of them.
344 /* Most of decisions we do here should have been done at compile time.
345 The nodes of the REx which we used for the search should have been
346 deleted from the finite automaton. */
349 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
350 char *strend, U32 flags, re_scream_pos_data *data)
353 register I32 start_shift = 0;
354 /* Should be nonnegative! */
355 register I32 end_shift = 0;
360 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
362 register char *other_last = NULL; /* other substr checked before this */
363 char *check_at = NULL; /* check substr found at this pos */
364 const I32 multiline = prog->reganch & PMf_MULTILINE;
366 const char * const i_strpos = strpos;
369 GET_RE_DEBUG_FLAGS_DECL;
371 RX_MATCH_UTF8_set(prog,do_utf8);
373 if (prog->reganch & ROPT_UTF8) {
374 PL_reg_flags |= RF_utf8;
377 debug_start_match(prog, do_utf8, strpos, strend,
378 "Guessing start of match for");
381 /* CHR_DIST() would be more correct here but it makes things slow. */
382 if (prog->minlen > strend - strpos) {
383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
384 "String too short... [re_intuit_start]\n"));
387 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
390 if (!prog->check_utf8 && prog->check_substr)
391 to_utf8_substr(prog);
392 check = prog->check_utf8;
394 if (!prog->check_substr && prog->check_utf8)
395 to_byte_substr(prog);
396 check = prog->check_substr;
398 if (check == &PL_sv_undef) {
399 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
400 "Non-utf string cannot match utf check string\n"));
403 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
404 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
405 || ( (prog->reganch & ROPT_ANCH_BOL)
406 && !multiline ) ); /* Check after \n? */
409 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
410 | ROPT_IMPLICIT)) /* not a real BOL */
411 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
413 && (strpos != strbeg)) {
414 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
417 if (prog->check_offset_min == prog->check_offset_max &&
418 !(prog->reganch & ROPT_CANY_SEEN)) {
419 /* Substring at constant offset from beg-of-str... */
422 s = HOP3c(strpos, prog->check_offset_min, strend);
424 slen = SvCUR(check); /* >= 1 */
426 if ( strend - s > slen || strend - s < slen - 1
427 || (strend - s == slen && strend[-1] != '\n')) {
428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
431 /* Now should match s[0..slen-2] */
433 if (slen && (*SvPVX_const(check) != *s
435 && memNE(SvPVX_const(check), s, slen)))) {
437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
441 else if (*SvPVX_const(check) != *s
442 || ((slen = SvCUR(check)) > 1
443 && memNE(SvPVX_const(check), s, slen)))
446 goto success_at_start;
449 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
451 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
452 end_shift = prog->minlen - start_shift -
453 CHR_SVLEN(check) + (SvTAIL(check) != 0);
455 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
456 - (SvTAIL(check) != 0);
457 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
459 if (end_shift < eshift)
463 else { /* Can match at random position */
466 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
467 /* Should be nonnegative! */
468 end_shift = prog->minlen - start_shift -
469 CHR_SVLEN(check) + (SvTAIL(check) != 0);
472 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
474 Perl_croak(aTHX_ "panic: end_shift");
478 /* Find a possible match in the region s..strend by looking for
479 the "check" substring in the region corrected by start/end_shift. */
480 if (flags & REXEC_SCREAM) {
481 I32 p = -1; /* Internal iterator of scream. */
482 I32 * const pp = data ? data->scream_pos : &p;
484 if (PL_screamfirst[BmRARE(check)] >= 0
485 || ( BmRARE(check) == '\n'
486 && (BmPREVIOUS(check) == SvCUR(check) - 1)
488 s = screaminstr(sv, check,
489 start_shift + (s - strbeg), end_shift, pp, 0);
492 /* we may be pointing at the wrong string */
493 if (s && RX_MATCH_COPIED(prog))
494 s = strbeg + (s - SvPVX_const(sv));
496 *data->scream_olds = s;
498 else if (prog->reganch & ROPT_CANY_SEEN)
499 s = fbm_instr((U8*)(s + start_shift),
500 (U8*)(strend - end_shift),
501 check, multiline ? FBMrf_MULTILINE : 0);
503 s = fbm_instr(HOP3(s, start_shift, strend),
504 HOP3(strend, -end_shift, strbeg),
505 check, multiline ? FBMrf_MULTILINE : 0);
507 /* Update the count-of-usability, remove useless subpatterns,
511 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
512 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
513 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
514 (s ? "Found" : "Did not find"),
515 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
516 ? "anchored" : "floating"),
519 (s ? " at offset " : "...\n") );
527 /* Finish the diagnostic message */
528 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
530 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
531 Start with the other substr.
532 XXXX no SCREAM optimization yet - and a very coarse implementation
533 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
534 *always* match. Probably should be marked during compile...
535 Probably it is right to do no SCREAM here...
538 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
539 /* Take into account the "other" substring. */
540 /* XXXX May be hopelessly wrong for UTF... */
543 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
546 char * const last = HOP3c(s, -start_shift, strbeg);
548 char * const saved_s = s;
551 t = s - prog->check_offset_max;
552 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
554 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
559 t = HOP3c(t, prog->anchored_offset, strend);
560 if (t < other_last) /* These positions already checked */
562 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
565 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
566 /* On end-of-str: see comment below. */
567 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
568 if (must == &PL_sv_undef) {
570 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
575 HOP3(HOP3(last1, prog->anchored_offset, strend)
576 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
578 multiline ? FBMrf_MULTILINE : 0
581 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
582 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
583 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
584 (s ? "Found" : "Contradicts"),
585 quoted, RE_SV_TAIL(must));
590 if (last1 >= last2) {
591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
592 ", giving up...\n"));
595 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
596 ", trying floating at offset %ld...\n",
597 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
598 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
599 s = HOP3c(last, 1, strend);
603 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
604 (long)(s - i_strpos)));
605 t = HOP3c(s, -prog->anchored_offset, strbeg);
606 other_last = HOP3c(s, 1, strend);
614 else { /* Take into account the floating substring. */
616 char * const saved_s = s;
619 t = HOP3c(s, -start_shift, strbeg);
621 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
622 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
623 last = HOP3c(t, prog->float_max_offset, strend);
624 s = HOP3c(t, prog->float_min_offset, strend);
627 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
628 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
629 /* fbm_instr() takes into account exact value of end-of-str
630 if the check is SvTAIL(ed). Since false positives are OK,
631 and end-of-str is not later than strend we are OK. */
632 if (must == &PL_sv_undef) {
634 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
637 s = fbm_instr((unsigned char*)s,
638 (unsigned char*)last + SvCUR(must)
640 must, multiline ? FBMrf_MULTILINE : 0);
642 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
643 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
644 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
645 (s ? "Found" : "Contradicts"),
646 quoted, RE_SV_TAIL(must));
650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
651 ", giving up...\n"));
654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
655 ", trying anchored starting at offset %ld...\n",
656 (long)(saved_s + 1 - i_strpos)));
658 s = HOP3c(t, 1, strend);
662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
663 (long)(s - i_strpos)));
664 other_last = s; /* Fix this later. --Hugo */
673 t = s - prog->check_offset_max;
674 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
676 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
678 /* Fixed substring is found far enough so that the match
679 cannot start at strpos. */
681 if (ml_anch && t[-1] != '\n') {
682 /* Eventually fbm_*() should handle this, but often
683 anchored_offset is not 0, so this check will not be wasted. */
684 /* XXXX In the code below we prefer to look for "^" even in
685 presence of anchored substrings. And we search even
686 beyond the found float position. These pessimizations
687 are historical artefacts only. */
689 while (t < strend - prog->minlen) {
691 if (t < check_at - prog->check_offset_min) {
692 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
693 /* Since we moved from the found position,
694 we definitely contradict the found anchored
695 substr. Due to the above check we do not
696 contradict "check" substr.
697 Thus we can arrive here only if check substr
698 is float. Redo checking for "other"=="fixed".
701 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
702 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
703 goto do_other_anchored;
705 /* We don't contradict the found floating substring. */
706 /* XXXX Why not check for STCLASS? */
708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
709 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
712 /* Position contradicts check-string */
713 /* XXXX probably better to look for check-string
714 than for "\n", so one should lower the limit for t? */
715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
716 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
717 other_last = strpos = s = t + 1;
722 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
723 PL_colors[0], PL_colors[1]));
727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
728 PL_colors[0], PL_colors[1]));
732 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
735 /* The found string does not prohibit matching at strpos,
736 - no optimization of calling REx engine can be performed,
737 unless it was an MBOL and we are not after MBOL,
738 or a future STCLASS check will fail this. */
740 /* Even in this situation we may use MBOL flag if strpos is offset
741 wrt the start of the string. */
742 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
743 && (strpos != strbeg) && strpos[-1] != '\n'
744 /* May be due to an implicit anchor of m{.*foo} */
745 && !(prog->reganch & ROPT_IMPLICIT))
750 DEBUG_EXECUTE_r( if (ml_anch)
751 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
752 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
755 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
757 prog->check_utf8 /* Could be deleted already */
758 && --BmUSEFUL(prog->check_utf8) < 0
759 && (prog->check_utf8 == prog->float_utf8)
761 prog->check_substr /* Could be deleted already */
762 && --BmUSEFUL(prog->check_substr) < 0
763 && (prog->check_substr == prog->float_substr)
766 /* If flags & SOMETHING - do not do it many times on the same match */
767 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
768 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
769 if (do_utf8 ? prog->check_substr : prog->check_utf8)
770 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
771 prog->check_substr = prog->check_utf8 = NULL; /* disable */
772 prog->float_substr = prog->float_utf8 = NULL; /* clear */
773 check = NULL; /* abort */
775 /* XXXX This is a remnant of the old implementation. It
776 looks wasteful, since now INTUIT can use many
778 prog->reganch &= ~RE_USE_INTUIT;
785 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
786 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
787 /* minlen == 0 is possible if regstclass is \b or \B,
788 and the fixed substr is ''$.
789 Since minlen is already taken into account, s+1 is before strend;
790 accidentally, minlen >= 1 guaranties no false positives at s + 1
791 even for \b or \B. But (minlen? 1 : 0) below assumes that
792 regstclass does not come from lookahead... */
793 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
794 This leaves EXACTF only, which is dealt with in find_byclass(). */
795 const U8* const str = (U8*)STRING(prog->regstclass);
796 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
797 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
799 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
800 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
801 : (prog->float_substr || prog->float_utf8
802 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
805 /*if (OP(prog->regstclass) == TRIE)
808 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
811 const char *what = NULL;
813 if (endpos == strend) {
814 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
815 "Could not match STCLASS...\n") );
818 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
819 "This position contradicts STCLASS...\n") );
820 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
822 /* Contradict one of substrings */
823 if (prog->anchored_substr || prog->anchored_utf8) {
824 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
825 DEBUG_EXECUTE_r( what = "anchored" );
827 s = HOP3c(t, 1, strend);
828 if (s + start_shift + end_shift > strend) {
829 /* XXXX Should be taken into account earlier? */
830 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
831 "Could not match STCLASS...\n") );
836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
837 "Looking for %s substr starting at offset %ld...\n",
838 what, (long)(s + start_shift - i_strpos)) );
841 /* Have both, check_string is floating */
842 if (t + start_shift >= check_at) /* Contradicts floating=check */
843 goto retry_floating_check;
844 /* Recheck anchored substring, but not floating... */
848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
849 "Looking for anchored substr starting at offset %ld...\n",
850 (long)(other_last - i_strpos)) );
851 goto do_other_anchored;
853 /* Another way we could have checked stclass at the
854 current position only: */
859 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
860 "Looking for /%s^%s/m starting at offset %ld...\n",
861 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
864 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
866 /* Check is floating subtring. */
867 retry_floating_check:
868 t = check_at - start_shift;
869 DEBUG_EXECUTE_r( what = "floating" );
870 goto hop_and_restart;
873 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
874 "By STCLASS: moving %ld --> %ld\n",
875 (long)(t - i_strpos), (long)(s - i_strpos))
879 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
880 "Does not contradict STCLASS...\n");
885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
886 PL_colors[4], (check ? "Guessed" : "Giving up"),
887 PL_colors[5], (long)(s - i_strpos)) );
890 fail_finish: /* Substring not found */
891 if (prog->check_substr || prog->check_utf8) /* could be removed already */
892 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
895 PL_colors[4], PL_colors[5]));
899 /* We know what class REx starts with. Try to find this position... */
900 /* if reginfo is NULL, its a dryrun */
901 /* annoyingly all the vars in this routine have different names from their counterparts
902 in regmatch. /grrr */
904 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
905 foldlen, foldbuf, uniflags) STMT_START { \
906 switch (trie_type) { \
907 case trie_utf8_fold: \
909 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
914 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
915 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
916 foldlen -= UNISKIP( uvc ); \
917 uscan = foldbuf + UNISKIP( uvc ); \
921 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
929 charid = trie->charmap[ uvc ]; \
933 if (trie->widecharmap) { \
934 SV** const svpp = hv_fetch(trie->widecharmap, \
935 (char*)&uvc, sizeof(UV), 0); \
937 charid = (U16)SvIV(*svpp); \
942 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
945 ibcmp_utf8(s, NULL, 0, do_utf8, \
946 m, NULL, ln, (bool)UTF)) \
947 && (!reginfo || regtry(reginfo, s)) ) \
950 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
951 uvchr_to_utf8(tmpbuf, c); \
952 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
954 && (f == c1 || f == c2) \
955 && (ln == foldlen || \
956 !ibcmp_utf8((char *) foldbuf, \
957 NULL, foldlen, do_utf8, \
959 NULL, ln, (bool)UTF)) \
960 && (!reginfo || regtry(reginfo, s)) ) \
965 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
969 && (ln == 1 || !(OP(c) == EXACTF \
971 : ibcmp_locale(s, m, ln))) \
972 && (!reginfo || regtry(reginfo, s)) ) \
978 #define REXEC_FBC_UTF8_SCAN(CoDe) \
980 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
986 #define REXEC_FBC_SCAN(CoDe) \
988 while (s < strend) { \
994 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
995 REXEC_FBC_UTF8_SCAN( \
997 if (tmp && (!reginfo || regtry(reginfo, s))) \
1006 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1009 if (tmp && (!reginfo || regtry(reginfo, s))) \
1018 #define REXEC_FBC_TRYIT \
1019 if ((!reginfo || regtry(reginfo, s))) \
1022 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1025 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1028 REXEC_FBC_CLASS_SCAN(CoNd); \
1032 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1033 PL_reg_flags |= RF_tainted; \
1035 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1038 REXEC_FBC_CLASS_SCAN(CoNd); \
1043 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1044 const char *strend, const regmatch_info *reginfo)
1047 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1051 register STRLEN uskip;
1055 register I32 tmp = 1; /* Scratch variable? */
1056 register const bool do_utf8 = PL_reg_match_utf8;
1058 /* We know what class it must start with. */
1062 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1063 !UTF8_IS_INVARIANT((U8)s[0]) ?
1064 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1065 REGINCLASS(prog, c, (U8*)s));
1068 while (s < strend) {
1071 if (REGINCLASS(prog, c, (U8*)s) ||
1072 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1073 /* The assignment of 2 is intentional:
1074 * for the folded sharp s, the skip is 2. */
1075 (skip = SHARP_S_SKIP))) {
1076 if (tmp && (!reginfo || regtry(reginfo, s)))
1089 if (tmp && (!reginfo || regtry(reginfo, s)))
1097 ln = STR_LEN(c); /* length to match in octets/bytes */
1098 lnc = (I32) ln; /* length to match in characters */
1100 STRLEN ulen1, ulen2;
1102 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1103 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1104 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1106 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1107 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1109 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1111 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1114 while (sm < ((U8 *) m + ln)) {
1129 c2 = PL_fold_locale[c1];
1131 e = HOP3c(strend, -((I32)lnc), s);
1133 if (!reginfo && e < s)
1134 e = s; /* Due to minlen logic of intuit() */
1136 /* The idea in the EXACTF* cases is to first find the
1137 * first character of the EXACTF* node and then, if
1138 * necessary, case-insensitively compare the full
1139 * text of the node. The c1 and c2 are the first
1140 * characters (though in Unicode it gets a bit
1141 * more complicated because there are more cases
1142 * than just upper and lower: one needs to use
1143 * the so-called folding case for case-insensitive
1144 * matching (called "loose matching" in Unicode).
1145 * ibcmp_utf8() will do just that. */
1149 U8 tmpbuf [UTF8_MAXBYTES+1];
1150 STRLEN len, foldlen;
1151 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1153 /* Upper and lower of 1st char are equal -
1154 * probably not a "letter". */
1156 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1158 REXEC_FBC_EXACTISH_CHECK(c == c1);
1163 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1166 /* Handle some of the three Greek sigmas cases.
1167 * Note that not all the possible combinations
1168 * are handled here: some of them are handled
1169 * by the standard folding rules, and some of
1170 * them (the character class or ANYOF cases)
1171 * are handled during compiletime in
1172 * regexec.c:S_regclass(). */
1173 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1174 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1175 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1177 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1183 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1185 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1189 PL_reg_flags |= RF_tainted;
1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1199 tmp = ((OP(c) == BOUND ?
1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1201 LOAD_UTF8_CHARCLASS_ALNUM();
1202 REXEC_FBC_UTF8_SCAN(
1203 if (tmp == !(OP(c) == BOUND ?
1204 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1205 isALNUM_LC_utf8((U8*)s)))
1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1214 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1217 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1223 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1227 PL_reg_flags |= RF_tainted;
1234 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1235 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1237 tmp = ((OP(c) == NBOUND ?
1238 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1239 LOAD_UTF8_CHARCLASS_ALNUM();
1240 REXEC_FBC_UTF8_SCAN(
1241 if (tmp == !(OP(c) == NBOUND ?
1242 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1243 isALNUM_LC_utf8((U8*)s)))
1245 else REXEC_FBC_TRYIT;
1249 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1250 tmp = ((OP(c) == NBOUND ?
1251 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1254 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1256 else REXEC_FBC_TRYIT;
1259 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1263 REXEC_FBC_CSCAN_PRELOAD(
1264 LOAD_UTF8_CHARCLASS_ALNUM(),
1265 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1269 REXEC_FBC_CSCAN_TAINT(
1270 isALNUM_LC_utf8((U8*)s),
1274 REXEC_FBC_CSCAN_PRELOAD(
1275 LOAD_UTF8_CHARCLASS_ALNUM(),
1276 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1280 REXEC_FBC_CSCAN_TAINT(
1281 !isALNUM_LC_utf8((U8*)s),
1285 REXEC_FBC_CSCAN_PRELOAD(
1286 LOAD_UTF8_CHARCLASS_SPACE(),
1287 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1291 REXEC_FBC_CSCAN_TAINT(
1292 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1296 REXEC_FBC_CSCAN_PRELOAD(
1297 LOAD_UTF8_CHARCLASS_SPACE(),
1298 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1302 REXEC_FBC_CSCAN_TAINT(
1303 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1307 REXEC_FBC_CSCAN_PRELOAD(
1308 LOAD_UTF8_CHARCLASS_DIGIT(),
1309 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1313 REXEC_FBC_CSCAN_TAINT(
1314 isDIGIT_LC_utf8((U8*)s),
1318 REXEC_FBC_CSCAN_PRELOAD(
1319 LOAD_UTF8_CHARCLASS_DIGIT(),
1320 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1324 REXEC_FBC_CSCAN_TAINT(
1325 !isDIGIT_LC_utf8((U8*)s),
1329 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1331 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1332 trie_type = do_utf8 ?
1333 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1335 /* what trie are we using right now */
1337 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1338 reg_trie_data *trie=aho->trie;
1340 const char *last_start = strend - trie->minlen;
1342 const char *real_start = s;
1344 STRLEN maxlen = trie->maxlen;
1346 U8 **points; /* map of where we were in the input string
1347 when reading a given string. For ASCII this
1348 is unnecessary overhead as the relationship
1349 is always 1:1, but for unicode, especially
1350 case folded unicode this is not true. */
1351 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1353 GET_RE_DEBUG_FLAGS_DECL;
1355 /* We can't just allocate points here. We need to wrap it in
1356 * an SV so it gets freed properly if there is a croak while
1357 * running the match */
1360 sv_points=newSV(maxlen * sizeof(U8 *));
1361 SvCUR_set(sv_points,
1362 maxlen * sizeof(U8 *));
1363 SvPOK_on(sv_points);
1364 sv_2mortal(sv_points);
1365 points=(U8**)SvPV_nolen(sv_points );
1367 if (trie->bitmap && trie_type != trie_utf8_fold) {
1368 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
1373 while (s <= last_start) {
1374 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1382 U8 *uscan = (U8*)NULL;
1383 U8 *leftmost = NULL;
1387 while ( state && uc <= (U8*)strend ) {
1389 if (aho->states[ state ].wordnum) {
1390 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1391 if (!leftmost || lpos < leftmost)
1395 points[pointpos++ % maxlen]= uc;
1396 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1397 uvc, charid, foldlen, foldbuf, uniflags);
1398 DEBUG_TRIE_EXECUTE_r(
1399 PerlIO_printf(Perl_debug_log,
1400 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1401 (int)((const char*)uc - real_start), charid, uvc)
1407 U32 word = aho->states[ state ].wordnum;
1409 base = aho->states[ state ].trans.base;
1411 DEBUG_TRIE_EXECUTE_r(
1412 PerlIO_printf( Perl_debug_log,
1413 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1414 failed ? "Fail transition to " : "",
1415 state, base, uvc, word)
1420 (base + charid > trie->uniquecharcount )
1421 && (base + charid - 1 - trie->uniquecharcount
1423 && trie->trans[base + charid - 1 -
1424 trie->uniquecharcount].check == state
1425 && (tmp=trie->trans[base + charid - 1 -
1426 trie->uniquecharcount ].next))
1436 state = aho->fail[state];
1440 /* we must be accepting here */
1448 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1449 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
1455 if ( aho->states[ state ].wordnum ) {
1456 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1457 if (!leftmost || lpos < leftmost)
1460 DEBUG_TRIE_EXECUTE_r(
1461 PerlIO_printf( Perl_debug_log,
1462 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1467 s = (char*)leftmost;
1468 if (!reginfo || regtry(reginfo, s)) {
1483 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1492 - regexec_flags - match a regexp against a string
1495 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1496 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1497 /* strend: pointer to null at end of string */
1498 /* strbeg: real beginning of string */
1499 /* minend: end of match must be >=minend after stringarg. */
1500 /* data: May be used for some additional optimizations. */
1501 /* nosave: For optimizations. */
1505 register regnode *c;
1506 register char *startpos = stringarg;
1507 I32 minlen; /* must match at least this many chars */
1508 I32 dontbother = 0; /* how many characters not to try at end */
1509 I32 end_shift = 0; /* Same for the end. */ /* CC */
1510 I32 scream_pos = -1; /* Internal iterator of scream. */
1511 char *scream_olds = NULL;
1512 SV* const oreplsv = GvSV(PL_replgv);
1513 const bool do_utf8 = DO_UTF8(sv);
1516 regmatch_info reginfo; /* create some info to pass to regtry etc */
1518 GET_RE_DEBUG_FLAGS_DECL;
1520 PERL_UNUSED_ARG(data);
1522 /* Be paranoid... */
1523 if (prog == NULL || startpos == NULL) {
1524 Perl_croak(aTHX_ "NULL regexp parameter");
1528 multiline = prog->reganch & PMf_MULTILINE;
1529 reginfo.prog = prog;
1531 RX_MATCH_UTF8_set(prog, do_utf8);
1533 minlen = prog->minlen;
1534 if (strend - startpos < minlen) {
1535 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1536 "String too short [regexec_flags]...\n"));
1540 /* Check validity of program. */
1541 if (UCHARAT(prog->program) != REG_MAGIC) {
1542 Perl_croak(aTHX_ "corrupted regexp program");
1546 PL_reg_eval_set = 0;
1549 if (prog->reganch & ROPT_UTF8)
1550 PL_reg_flags |= RF_utf8;
1552 /* Mark beginning of line for ^ and lookbehind. */
1553 reginfo.bol = startpos; /* XXX not used ??? */
1557 /* Mark end of line for $ (and such) */
1560 /* see how far we have to get to not match where we matched before */
1561 reginfo.till = startpos+minend;
1563 /* If there is a "must appear" string, look for it. */
1566 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1569 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1570 reginfo.ganch = startpos;
1571 else if (sv && SvTYPE(sv) >= SVt_PVMG
1573 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1574 && mg->mg_len >= 0) {
1575 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1576 if (prog->reganch & ROPT_ANCH_GPOS) {
1577 if (s > reginfo.ganch)
1582 else /* pos() not defined */
1583 reginfo.ganch = strbeg;
1586 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1587 re_scream_pos_data d;
1589 d.scream_olds = &scream_olds;
1590 d.scream_pos = &scream_pos;
1591 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1593 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1594 goto phooey; /* not present */
1599 debug_start_match(prog, do_utf8, startpos, strend,
1603 /* Simplest case: anchored match need be tried only once. */
1604 /* [unless only anchor is BOL and multiline is set] */
1605 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1606 if (s == startpos && regtry(®info, startpos))
1608 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1609 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1614 dontbother = minlen - 1;
1615 end = HOP3c(strend, -dontbother, strbeg) - 1;
1616 /* for multiline we only have to try after newlines */
1617 if (prog->check_substr || prog->check_utf8) {
1621 if (regtry(®info, s))
1626 if (prog->reganch & RE_USE_INTUIT) {
1627 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1638 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1639 if (regtry(®info, s))
1646 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1647 if (regtry(®info, reginfo.ganch))
1652 /* Messy cases: unanchored match. */
1653 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1654 /* we have /x+whatever/ */
1655 /* it must be a one character string (XXXX Except UTF?) */
1660 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1661 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1662 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1667 DEBUG_EXECUTE_r( did_match = 1 );
1668 if (regtry(®info, s)) goto got_it;
1670 while (s < strend && *s == ch)
1678 DEBUG_EXECUTE_r( did_match = 1 );
1679 if (regtry(®info, s)) goto got_it;
1681 while (s < strend && *s == ch)
1686 DEBUG_EXECUTE_r(if (!did_match)
1687 PerlIO_printf(Perl_debug_log,
1688 "Did not find anchored character...\n")
1691 else if (prog->anchored_substr != NULL
1692 || prog->anchored_utf8 != NULL
1693 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1694 && prog->float_max_offset < strend - s)) {
1699 char *last1; /* Last position checked before */
1703 if (prog->anchored_substr || prog->anchored_utf8) {
1704 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1705 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1706 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1707 back_max = back_min = prog->anchored_offset;
1709 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1710 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1711 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1712 back_max = prog->float_max_offset;
1713 back_min = prog->float_min_offset;
1715 if (must == &PL_sv_undef)
1716 /* could not downgrade utf8 check substring, so must fail */
1719 last = HOP3c(strend, /* Cannot start after this */
1720 -(I32)(CHR_SVLEN(must)
1721 - (SvTAIL(must) != 0) + back_min), strbeg);
1724 last1 = HOPc(s, -1);
1726 last1 = s - 1; /* bogus */
1728 /* XXXX check_substr already used to find "s", can optimize if
1729 check_substr==must. */
1731 dontbother = end_shift;
1732 strend = HOPc(strend, -dontbother);
1733 while ( (s <= last) &&
1734 ((flags & REXEC_SCREAM)
1735 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1736 end_shift, &scream_pos, 0))
1737 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1738 (unsigned char*)strend, must,
1739 multiline ? FBMrf_MULTILINE : 0))) ) {
1740 /* we may be pointing at the wrong string */
1741 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1742 s = strbeg + (s - SvPVX_const(sv));
1743 DEBUG_EXECUTE_r( did_match = 1 );
1744 if (HOPc(s, -back_max) > last1) {
1745 last1 = HOPc(s, -back_min);
1746 s = HOPc(s, -back_max);
1749 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1751 last1 = HOPc(s, -back_min);
1755 while (s <= last1) {
1756 if (regtry(®info, s))
1762 while (s <= last1) {
1763 if (regtry(®info, s))
1769 DEBUG_EXECUTE_r(if (!did_match) {
1770 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1771 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1772 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1773 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1774 ? "anchored" : "floating"),
1775 quoted, RE_SV_TAIL(must));
1779 else if ((c = prog->regstclass)) {
1781 const OPCODE op = OP(prog->regstclass);
1782 /* don't bother with what can't match */
1783 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
1784 strend = HOPc(strend, -(minlen - 1));
1787 SV * const prop = sv_newmortal();
1788 regprop(prog, prop, c);
1790 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1792 PerlIO_printf(Perl_debug_log,
1793 "Matching stclass %.*s against %s (%d chars)\n",
1794 SvCUR(prop), SvPVX_const(prop),
1795 quoted, (int)(strend - s));
1798 if (find_byclass(prog, c, s, strend, ®info))
1800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1804 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1809 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1810 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1811 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1813 if (flags & REXEC_SCREAM) {
1814 last = screaminstr(sv, float_real, s - strbeg,
1815 end_shift, &scream_pos, 1); /* last one */
1817 last = scream_olds; /* Only one occurrence. */
1818 /* we may be pointing at the wrong string */
1819 else if (RX_MATCH_COPIED(prog))
1820 s = strbeg + (s - SvPVX_const(sv));
1824 const char * const little = SvPV_const(float_real, len);
1826 if (SvTAIL(float_real)) {
1827 if (memEQ(strend - len + 1, little, len - 1))
1828 last = strend - len + 1;
1829 else if (!multiline)
1830 last = memEQ(strend - len, little, len)
1831 ? strend - len : NULL;
1837 last = rninstr(s, strend, little, little + len);
1839 last = strend; /* matching "$" */
1843 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1844 "%sCan't trim the tail, match fails (should not happen)%s\n",
1845 PL_colors[4], PL_colors[5]));
1846 goto phooey; /* Should not happen! */
1848 dontbother = strend - last + prog->float_min_offset;
1850 if (minlen && (dontbother < minlen))
1851 dontbother = minlen - 1;
1852 strend -= dontbother; /* this one's always in bytes! */
1853 /* We don't know much -- general case. */
1856 if (regtry(®info, s))
1865 if (regtry(®info, s))
1867 } while (s++ < strend);
1875 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1877 if (PL_reg_eval_set) {
1878 /* Preserve the current value of $^R */
1879 if (oreplsv != GvSV(PL_replgv))
1880 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1881 restored, the value remains
1883 restore_pos(aTHX_ prog);
1886 /* make sure $`, $&, $', and $digit will work later */
1887 if ( !(flags & REXEC_NOT_FIRST) ) {
1888 RX_MATCH_COPY_FREE(prog);
1889 if (flags & REXEC_COPY_STR) {
1890 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
1891 #ifdef PERL_OLD_COPY_ON_WRITE
1893 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1895 PerlIO_printf(Perl_debug_log,
1896 "Copy on write: regexp capture, type %d\n",
1899 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
1900 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
1901 assert (SvPOKp(prog->saved_copy));
1905 RX_MATCH_COPIED_on(prog);
1906 s = savepvn(strbeg, i);
1912 prog->subbeg = strbeg;
1913 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1920 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1921 PL_colors[4], PL_colors[5]));
1922 if (PL_reg_eval_set)
1923 restore_pos(aTHX_ prog);
1928 - regtry - try match at specific point
1930 STATIC I32 /* 0 failure, 1 success */
1931 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
1937 regexp *prog = reginfo->prog;
1938 GET_RE_DEBUG_FLAGS_DECL;
1941 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1943 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1946 PL_reg_eval_set = RS_init;
1947 DEBUG_EXECUTE_r(DEBUG_s(
1948 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1949 (IV)(PL_stack_sp - PL_stack_base));
1951 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1952 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1953 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1955 /* Apparently this is not needed, judging by wantarray. */
1956 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1957 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1960 /* Make $_ available to executed code. */
1961 if (reginfo->sv != DEFSV) {
1963 DEFSV = reginfo->sv;
1966 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1967 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
1968 /* prepare for quick setting of pos */
1969 #ifdef PERL_OLD_COPY_ON_WRITE
1971 sv_force_normal_flags(sv, 0);
1973 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
1974 &PL_vtbl_mglob, NULL, 0);
1978 PL_reg_oldpos = mg->mg_len;
1979 SAVEDESTRUCTOR_X(restore_pos, prog);
1981 if (!PL_reg_curpm) {
1982 Newxz(PL_reg_curpm, 1, PMOP);
1985 SV* const repointer = newSViv(0);
1986 /* so we know which PL_regex_padav element is PL_reg_curpm */
1987 SvFLAGS(repointer) |= SVf_BREAK;
1988 av_push(PL_regex_padav,repointer);
1989 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1990 PL_regex_pad = AvARRAY(PL_regex_padav);
1994 PM_SETRE(PL_reg_curpm, prog);
1995 PL_reg_oldcurpm = PL_curpm;
1996 PL_curpm = PL_reg_curpm;
1997 if (RX_MATCH_COPIED(prog)) {
1998 /* Here is a serious problem: we cannot rewrite subbeg,
1999 since it may be needed if this match fails. Thus
2000 $` inside (?{}) could fail... */
2001 PL_reg_oldsaved = prog->subbeg;
2002 PL_reg_oldsavedlen = prog->sublen;
2003 #ifdef PERL_OLD_COPY_ON_WRITE
2004 PL_nrs = prog->saved_copy;
2006 RX_MATCH_COPIED_off(prog);
2009 PL_reg_oldsaved = NULL;
2010 prog->subbeg = PL_bostr;
2011 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2013 prog->startp[0] = startpos - PL_bostr;
2014 PL_reginput = startpos;
2015 PL_regstartp = prog->startp;
2016 PL_regendp = prog->endp;
2017 PL_reglastparen = &prog->lastparen;
2018 PL_reglastcloseparen = &prog->lastcloseparen;
2019 prog->lastparen = 0;
2020 prog->lastcloseparen = 0;
2022 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2023 if (PL_reg_start_tmpl <= prog->nparens) {
2024 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2025 if(PL_reg_start_tmp)
2026 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2028 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2031 /* XXXX What this code is doing here?!!! There should be no need
2032 to do this again and again, PL_reglastparen should take care of
2035 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2036 * Actually, the code in regcppop() (which Ilya may be meaning by
2037 * PL_reglastparen), is not needed at all by the test suite
2038 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2039 * enough, for building DynaLoader, or otherwise this
2040 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2041 * will happen. Meanwhile, this code *is* needed for the
2042 * above-mentioned test suite tests to succeed. The common theme
2043 * on those tests seems to be returning null fields from matches.
2048 if (prog->nparens) {
2050 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2057 if (regmatch(reginfo, prog->program + 1)) {
2058 prog->endp[0] = PL_reginput - PL_bostr;
2061 REGCP_UNWIND(lastcp);
2066 #define sayYES goto yes
2067 #define sayNO goto no
2068 #define sayNO_ANYOF goto no_anyof
2069 #define sayYES_FINAL goto yes_final
2070 #define sayNO_FINAL goto no_final
2071 #define sayNO_SILENT goto do_no
2072 #define saySAME(x) if (x) goto yes; else goto no
2074 #define CACHEsayNO STMT_START { \
2075 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2076 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2077 (1<<st->u.whilem.cache_bit); \
2082 /* this is used to determine how far from the left messages like
2083 'failed...' are printed. Currently 29 makes these messages line
2084 up with the opcode they refer to. Earlier perls used 25 which
2085 left these messages outdented making reviewing a debug output
2088 #define REPORT_CODE_OFF 29
2091 /* Make sure there is a test for this +1 options in re_tests */
2092 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2094 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2095 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2097 #define SLAB_FIRST(s) (&(s)->states[0])
2098 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2100 /* grab a new slab and return the first slot in it */
2102 STATIC regmatch_state *
2105 #if PERL_VERSION < 9
2108 regmatch_slab *s = PL_regmatch_slab->next;
2110 Newx(s, 1, regmatch_slab);
2111 s->prev = PL_regmatch_slab;
2113 PL_regmatch_slab->next = s;
2115 PL_regmatch_slab = s;
2116 return SLAB_FIRST(s);
2119 /* simulate a recursive call to regmatch */
2121 #define REGMATCH(ns, where) \
2124 st->resume_state = resume_##where; \
2125 goto start_recurse; \
2126 resume_point_##where:
2128 /* push a new state then goto it */
2130 #define PUSH_STATE_GOTO(state, node) \
2132 st->resume_state = state; \
2135 /* push a new state with success backtracking, then goto it */
2137 #define PUSH_YES_STATE_GOTO(state, node) \
2139 st->resume_state = state; \
2140 goto push_yes_state;
2145 - regmatch - main matching routine
2147 * Conceptually the strategy is simple: check to see whether the current
2148 * node matches, call self recursively to see whether the rest matches,
2149 * and then act accordingly. In practice we make some effort to avoid
2150 * recursion, in particular by going through "ordinary" nodes (that don't
2151 * need to know whether the rest of the match failed) by a loop instead of
2154 /* [lwall] I've hoisted the register declarations to the outer block in order to
2155 * maybe save a little bit of pushing and popping on the stack. It also takes
2156 * advantage of machines that use a register save mask on subroutine entry.
2158 * This function used to be heavily recursive, but since this had the
2159 * effect of blowing the CPU stack on complex regexes, it has been
2160 * restructured to be iterative, and to save state onto the heap rather
2161 * than the stack. Essentially whereever regmatch() used to be called, it
2162 * pushes the current state, notes where to return, then jumps back into
2165 * Originally the structure of this function used to look something like
2170 while (scan != NULL) {
2171 a++; // do stuff with a and b
2177 if (regmatch(...)) // recurse
2187 * Now it looks something like this:
2195 regmatch_state *st = new();
2197 st->a++; // do stuff with a and b
2199 while (scan != NULL) {
2207 st->resume_state = resume_FOO;
2208 goto start_recurse; // recurse
2217 st = new(); push a new state
2218 st->a = 1; st->b = 2;
2225 switch (resume_state) {
2227 goto resume_point_FOO;
2234 * WARNING: this means that any line in this function that contains a
2235 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2236 * regmatch() using gotos instead. Thus the values of any local variables
2237 * not saved in the regmatch_state structure will have been lost when
2238 * execution resumes on the next line .
2240 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2241 * PL_regmatch_state always points to the currently active state, and
2242 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2243 * The first time regmatch is called, the first slab is allocated, and is
2244 * never freed until interpreter desctruction. When the slab is full,
2245 * a new one is allocated chained to the end. At exit from regmatch, slabs
2246 * allocated since entry are freed.
2249 /* *** every FOO_fail should = FOO+1 */
2250 #define TRIE_next (REGNODE_MAX+1)
2251 #define TRIE_next_fail (REGNODE_MAX+2)
2252 #define EVAL_A (REGNODE_MAX+3)
2253 #define EVAL_A_fail (REGNODE_MAX+4)
2254 #define resume_CURLYX (REGNODE_MAX+5)
2255 #define resume_WHILEM1 (REGNODE_MAX+6)
2256 #define resume_WHILEM2 (REGNODE_MAX+7)
2257 #define resume_WHILEM3 (REGNODE_MAX+8)
2258 #define resume_WHILEM4 (REGNODE_MAX+9)
2259 #define resume_WHILEM5 (REGNODE_MAX+10)
2260 #define resume_WHILEM6 (REGNODE_MAX+11)
2261 #define BRANCH_next (REGNODE_MAX+12)
2262 #define BRANCH_next_fail (REGNODE_MAX+13)
2263 #define CURLYM_A (REGNODE_MAX+14)
2264 #define CURLYM_A_fail (REGNODE_MAX+15)
2265 #define CURLYM_B (REGNODE_MAX+16)
2266 #define CURLYM_B_fail (REGNODE_MAX+17)
2267 #define IFMATCH_A (REGNODE_MAX+18)
2268 #define IFMATCH_A_fail (REGNODE_MAX+19)
2269 #define CURLY_B_min_known (REGNODE_MAX+20)
2270 #define CURLY_B_min_known_fail (REGNODE_MAX+21)
2271 #define CURLY_B_min (REGNODE_MAX+22)
2272 #define CURLY_B_min_fail (REGNODE_MAX+23)
2273 #define CURLY_B_max (REGNODE_MAX+24)
2274 #define CURLY_B_max_fail (REGNODE_MAX+25)
2277 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2281 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2282 const char *start, const char *end, const char *blurb)
2284 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2288 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2289 prog->precomp, prog->prelen, 60);
2291 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2292 start, end - start, 60);
2294 PerlIO_printf(Perl_debug_log,
2295 "%s%s REx%s %s against %s\n",
2296 PL_colors[4], blurb, PL_colors[5], s0, s1);
2298 if (do_utf8||utf8_pat)
2299 PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
2300 !do_utf8 ? "pattern" : !utf8_pat ? "string" :
2301 "pattern and string"
2307 S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2309 const int docolor = *PL_colors[0];
2310 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2311 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2312 /* The part of the string before starttry has one color
2313 (pref0_len chars), between starttry and current
2314 position another one (pref_len - pref0_len chars),
2315 after the current position the third one.
2316 We assume that pref0_len <= pref_len, otherwise we
2317 decrease pref0_len. */
2318 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2319 ? (5 + taill) - l : locinput - PL_bostr;
2322 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2324 pref0_len = pref_len - (locinput - PL_reg_starttry);
2325 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2326 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2327 ? (5 + taill) - pref_len : PL_regeol - locinput);
2328 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2332 if (pref0_len > pref_len)
2333 pref0_len = pref_len;
2335 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2337 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2338 (locinput - pref_len),pref0_len, 60, 4, 5);
2340 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2341 (locinput - pref_len + pref0_len),
2342 pref_len - pref0_len, 60, 2, 3);
2344 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2345 locinput, PL_regeol - locinput, 60, 0, 1);
2347 PerlIO_printf(Perl_debug_log,
2348 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2349 (IV)(locinput - PL_bostr),
2352 (docolor ? "" : "> <"),
2354 15 - l - pref_len + 1,
2361 STATIC I32 /* 0 failure, 1 success */
2362 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2364 #if PERL_VERSION < 9
2368 register const bool do_utf8 = PL_reg_match_utf8;
2369 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2371 regexp *rex = reginfo->prog;
2373 regmatch_slab *orig_slab;
2374 regmatch_state *orig_state;
2376 /* the current state. This is a cached copy of PL_regmatch_state */
2377 register regmatch_state *st;
2379 /* cache heavy used fields of st in registers */
2380 register regnode *scan;
2381 register regnode *next;
2382 register I32 n = 0; /* initialize to shut up compiler warning */
2383 register char *locinput = PL_reginput;
2385 /* these variables are NOT saved during a recusive RFEGMATCH: */
2386 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2387 bool result; /* return value of S_regmatch */
2388 int depth = 0; /* depth of recursion */
2389 regmatch_state *yes_state = NULL; /* state to pop to on success of
2394 GET_RE_DEBUG_FLAGS_DECL;
2398 /* on first ever call to regmatch, allocate first slab */
2399 if (!PL_regmatch_slab) {
2400 Newx(PL_regmatch_slab, 1, regmatch_slab);
2401 PL_regmatch_slab->prev = NULL;
2402 PL_regmatch_slab->next = NULL;
2403 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2406 /* remember current high-water mark for exit */
2407 /* XXX this should be done with SAVE* instead */
2408 orig_slab = PL_regmatch_slab;
2409 orig_state = PL_regmatch_state;
2411 /* grab next free state slot */
2412 st = ++PL_regmatch_state;
2413 if (st > SLAB_LAST(PL_regmatch_slab))
2414 st = PL_regmatch_state = S_push_slab(aTHX);
2420 /* Note that nextchr is a byte even in UTF */
2421 nextchr = UCHARAT(locinput);
2423 while (scan != NULL) {
2426 SV * const prop = sv_newmortal();
2427 dump_exec_pos( locinput, scan, do_utf8 );
2428 regprop(rex, prop, scan);
2430 PerlIO_printf(Perl_debug_log,
2431 "%3"IVdf":%*s%s(%"IVdf")\n",
2432 (IV)(scan - rex->program), PL_regindent*2, "",
2434 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2437 next = scan + NEXT_OFF(scan);
2440 state_num = OP(scan);
2443 switch (state_num) {
2445 if (locinput == PL_bostr)
2447 /* reginfo->till = reginfo->bol; */
2452 if (locinput == PL_bostr ||
2453 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2459 if (locinput == PL_bostr)
2463 if (locinput == reginfo->ganch)
2469 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2474 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2476 if (PL_regeol - locinput > 1)
2480 if (PL_regeol != locinput)
2484 if (!nextchr && locinput >= PL_regeol)
2487 locinput += PL_utf8skip[nextchr];
2488 if (locinput > PL_regeol)
2490 nextchr = UCHARAT(locinput);
2493 nextchr = UCHARAT(++locinput);
2496 if (!nextchr && locinput >= PL_regeol)
2498 nextchr = UCHARAT(++locinput);
2501 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2504 locinput += PL_utf8skip[nextchr];
2505 if (locinput > PL_regeol)
2507 nextchr = UCHARAT(locinput);
2510 nextchr = UCHARAT(++locinput);
2514 #define ST st->u.trie
2518 /* what type of TRIE am I? (utf8 makes this contextual) */
2519 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2520 trie_type = do_utf8 ?
2521 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2524 /* what trie are we using right now */
2525 reg_trie_data * const trie
2526 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2527 U32 state = trie->startstate;
2529 U8 *uc = ( U8* )locinput;
2535 U8 *uscan = (U8*)NULL;
2537 SV *sv_accept_buff = NULL;
2538 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2540 ST.accepted = 0; /* how many accepting states we have seen */
2546 if (trie->bitmap && trie_type != trie_utf8_fold &&
2547 !TRIE_BITMAP_TEST(trie,*locinput)
2549 if (trie->states[ state ].wordnum) {
2551 PerlIO_printf(Perl_debug_log,
2552 "%*s %smatched empty string...%s\n",
2553 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2558 PerlIO_printf(Perl_debug_log,
2559 "%*s %sfailed to match start class...%s\n",
2560 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2567 traverse the TRIE keeping track of all accepting states
2568 we transition through until we get to a failing node.
2571 while ( state && uc <= (U8*)PL_regeol ) {
2573 if (trie->states[ state ].wordnum) {
2574 if (!ST.accepted ) {
2577 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2578 sv_accept_buff=newSV(bufflen *
2579 sizeof(reg_trie_accepted) - 1);
2580 SvCUR_set(sv_accept_buff,
2581 sizeof(reg_trie_accepted));
2582 SvPOK_on(sv_accept_buff);
2583 sv_2mortal(sv_accept_buff);
2586 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2589 if (ST.accepted >= bufflen) {
2591 ST.accept_buff =(reg_trie_accepted*)
2592 SvGROW(sv_accept_buff,
2593 bufflen * sizeof(reg_trie_accepted));
2595 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2596 + sizeof(reg_trie_accepted));
2598 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2599 ST.accept_buff[ST.accepted].endpos = uc;
2603 base = trie->states[ state ].trans.base;
2605 DEBUG_TRIE_EXECUTE_r({
2606 dump_exec_pos( (char *)uc, scan, do_utf8 );
2607 PerlIO_printf( Perl_debug_log,
2608 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2609 2+PL_regindent * 2, "", PL_colors[4],
2610 (UV)state, (UV)base, (UV)ST.accepted );
2614 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2615 uvc, charid, foldlen, foldbuf, uniflags);
2618 (base + charid > trie->uniquecharcount )
2619 && (base + charid - 1 - trie->uniquecharcount
2621 && trie->trans[base + charid - 1 -
2622 trie->uniquecharcount].check == state)
2624 state = trie->trans[base + charid - 1 -
2625 trie->uniquecharcount ].next;
2636 DEBUG_TRIE_EXECUTE_r(
2637 PerlIO_printf( Perl_debug_log,
2638 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2639 charid, uvc, (UV)state, PL_colors[5] );
2646 PerlIO_printf( Perl_debug_log,
2647 "%*s %sgot %"IVdf" possible matches%s\n",
2648 REPORT_CODE_OFF + PL_regindent * 2, "",
2649 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2655 case TRIE_next_fail: /* we failed - try next alterative */
2657 if ( ST.accepted == 1 ) {
2658 /* only one choice left - just continue */
2660 reg_trie_data * const trie
2661 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2662 SV ** const tmp = RX_DEBUG(reginfo->prog)
2663 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2665 PerlIO_printf( Perl_debug_log,
2666 "%*s %sonly one match left: #%d <%s>%s\n",
2667 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2668 ST.accept_buff[ 0 ].wordnum,
2669 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2672 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2673 /* in this case we free tmps/leave before we call regmatch
2674 as we wont be using accept_buff again. */
2677 locinput = PL_reginput;
2678 nextchr = UCHARAT(locinput);
2680 continue; /* execute rest of RE */
2683 if (!ST.accepted-- ) {
2690 There are at least two accepting states left. Presumably
2691 the number of accepting states is going to be low,
2692 typically two. So we simply scan through to find the one
2693 with lowest wordnum. Once we find it, we swap the last
2694 state into its place and decrement the size. We then try to
2695 match the rest of the pattern at the point where the word
2696 ends. If we succeed, control just continues along the
2697 regex; if we fail we return here to try the next accepting
2704 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2705 DEBUG_TRIE_EXECUTE_r(
2706 PerlIO_printf( Perl_debug_log,
2707 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2708 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2709 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2710 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2713 if (ST.accept_buff[cur].wordnum <
2714 ST.accept_buff[best].wordnum)
2719 reg_trie_data * const trie
2720 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2721 SV ** const tmp = RX_DEBUG(reginfo->prog)
2722 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2724 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2725 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2726 ST.accept_buff[best].wordnum,
2727 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2731 if ( best<ST.accepted ) {
2732 reg_trie_accepted tmp = ST.accept_buff[ best ];
2733 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2734 ST.accept_buff[ ST.accepted ] = tmp;
2737 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2739 PUSH_STATE_GOTO(TRIE_next, ST.B);
2745 char *s = STRING(scan);
2746 st->ln = STR_LEN(scan);
2747 if (do_utf8 != UTF) {
2748 /* The target and the pattern have differing utf8ness. */
2750 const char * const e = s + st->ln;
2753 /* The target is utf8, the pattern is not utf8. */
2758 if (NATIVE_TO_UNI(*(U8*)s) !=
2759 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2767 /* The target is not utf8, the pattern is utf8. */
2772 if (NATIVE_TO_UNI(*((U8*)l)) !=
2773 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2781 nextchr = UCHARAT(locinput);
2784 /* The target and the pattern have the same utf8ness. */
2785 /* Inline the first character, for speed. */
2786 if (UCHARAT(s) != nextchr)
2788 if (PL_regeol - locinput < st->ln)
2790 if (st->ln > 1 && memNE(s, locinput, st->ln))
2793 nextchr = UCHARAT(locinput);
2797 PL_reg_flags |= RF_tainted;
2800 char * const s = STRING(scan);
2801 st->ln = STR_LEN(scan);
2803 if (do_utf8 || UTF) {
2804 /* Either target or the pattern are utf8. */
2805 const char * const l = locinput;
2806 char *e = PL_regeol;
2808 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2809 l, &e, 0, do_utf8)) {
2810 /* One more case for the sharp s:
2811 * pack("U0U*", 0xDF) =~ /ss/i,
2812 * the 0xC3 0x9F are the UTF-8
2813 * byte sequence for the U+00DF. */
2815 toLOWER(s[0]) == 's' &&
2817 toLOWER(s[1]) == 's' &&
2824 nextchr = UCHARAT(locinput);
2828 /* Neither the target and the pattern are utf8. */
2830 /* Inline the first character, for speed. */
2831 if (UCHARAT(s) != nextchr &&
2832 UCHARAT(s) != ((OP(scan) == EXACTF)
2833 ? PL_fold : PL_fold_locale)[nextchr])
2835 if (PL_regeol - locinput < st->ln)
2837 if (st->ln > 1 && (OP(scan) == EXACTF
2838 ? ibcmp(s, locinput, st->ln)
2839 : ibcmp_locale(s, locinput, st->ln)))
2842 nextchr = UCHARAT(locinput);
2847 STRLEN inclasslen = PL_regeol - locinput;
2849 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
2851 if (locinput >= PL_regeol)
2853 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2854 nextchr = UCHARAT(locinput);
2859 nextchr = UCHARAT(locinput);
2860 if (!REGINCLASS(rex, scan, (U8*)locinput))
2862 if (!nextchr && locinput >= PL_regeol)
2864 nextchr = UCHARAT(++locinput);
2868 /* If we might have the case of the German sharp s
2869 * in a casefolding Unicode character class. */
2871 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2872 locinput += SHARP_S_SKIP;
2873 nextchr = UCHARAT(locinput);
2879 PL_reg_flags |= RF_tainted;
2885 LOAD_UTF8_CHARCLASS_ALNUM();
2886 if (!(OP(scan) == ALNUM
2887 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2888 : isALNUM_LC_utf8((U8*)locinput)))
2892 locinput += PL_utf8skip[nextchr];
2893 nextchr = UCHARAT(locinput);
2896 if (!(OP(scan) == ALNUM
2897 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2899 nextchr = UCHARAT(++locinput);
2902 PL_reg_flags |= RF_tainted;
2905 if (!nextchr && locinput >= PL_regeol)
2908 LOAD_UTF8_CHARCLASS_ALNUM();
2909 if (OP(scan) == NALNUM
2910 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2911 : isALNUM_LC_utf8((U8*)locinput))
2915 locinput += PL_utf8skip[nextchr];
2916 nextchr = UCHARAT(locinput);
2919 if (OP(scan) == NALNUM
2920 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2922 nextchr = UCHARAT(++locinput);
2926 PL_reg_flags |= RF_tainted;
2930 /* was last char in word? */
2932 if (locinput == PL_bostr)
2935 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2937 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
2939 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2940 st->ln = isALNUM_uni(st->ln);
2941 LOAD_UTF8_CHARCLASS_ALNUM();
2942 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2945 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
2946 n = isALNUM_LC_utf8((U8*)locinput);
2950 st->ln = (locinput != PL_bostr) ?
2951 UCHARAT(locinput - 1) : '\n';
2952 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2953 st->ln = isALNUM(st->ln);
2954 n = isALNUM(nextchr);
2957 st->ln = isALNUM_LC(st->ln);
2958 n = isALNUM_LC(nextchr);
2961 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
2962 OP(scan) == BOUNDL))
2966 PL_reg_flags |= RF_tainted;
2972 if (UTF8_IS_CONTINUED(nextchr)) {
2973 LOAD_UTF8_CHARCLASS_SPACE();
2974 if (!(OP(scan) == SPACE
2975 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2976 : isSPACE_LC_utf8((U8*)locinput)))
2980 locinput += PL_utf8skip[nextchr];
2981 nextchr = UCHARAT(locinput);
2984 if (!(OP(scan) == SPACE
2985 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2987 nextchr = UCHARAT(++locinput);
2990 if (!(OP(scan) == SPACE
2991 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2993 nextchr = UCHARAT(++locinput);
2997 PL_reg_flags |= RF_tainted;
3000 if (!nextchr && locinput >= PL_regeol)
3003 LOAD_UTF8_CHARCLASS_SPACE();
3004 if (OP(scan) == NSPACE
3005 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3006 : isSPACE_LC_utf8((U8*)locinput))
3010 locinput += PL_utf8skip[nextchr];
3011 nextchr = UCHARAT(locinput);
3014 if (OP(scan) == NSPACE
3015 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3017 nextchr = UCHARAT(++locinput);
3020 PL_reg_flags |= RF_tainted;
3026 LOAD_UTF8_CHARCLASS_DIGIT();
3027 if (!(OP(scan) == DIGIT
3028 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3029 : isDIGIT_LC_utf8((U8*)locinput)))
3033 locinput += PL_utf8skip[nextchr];
3034 nextchr = UCHARAT(locinput);
3037 if (!(OP(scan) == DIGIT
3038 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3040 nextchr = UCHARAT(++locinput);
3043 PL_reg_flags |= RF_tainted;
3046 if (!nextchr && locinput >= PL_regeol)
3049 LOAD_UTF8_CHARCLASS_DIGIT();
3050 if (OP(scan) == NDIGIT
3051 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3052 : isDIGIT_LC_utf8((U8*)locinput))
3056 locinput += PL_utf8skip[nextchr];
3057 nextchr = UCHARAT(locinput);
3060 if (OP(scan) == NDIGIT
3061 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3063 nextchr = UCHARAT(++locinput);
3066 if (locinput >= PL_regeol)
3069 LOAD_UTF8_CHARCLASS_MARK();
3070 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3072 locinput += PL_utf8skip[nextchr];
3073 while (locinput < PL_regeol &&
3074 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3075 locinput += UTF8SKIP(locinput);
3076 if (locinput > PL_regeol)
3081 nextchr = UCHARAT(locinput);
3084 PL_reg_flags |= RF_tainted;
3089 n = ARG(scan); /* which paren pair */
3090 st->ln = PL_regstartp[n];
3091 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3092 if ((I32)*PL_reglastparen < n || st->ln == -1)
3093 sayNO; /* Do not match unless seen CLOSEn. */
3094 if (st->ln == PL_regendp[n])
3097 s = PL_bostr + st->ln;
3098 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3100 const char *e = PL_bostr + PL_regendp[n];
3102 * Note that we can't do the "other character" lookup trick as
3103 * in the 8-bit case (no pun intended) because in Unicode we
3104 * have to map both upper and title case to lower case.
3106 if (OP(scan) == REFF) {
3108 STRLEN ulen1, ulen2;
3109 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3110 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3114 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3115 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3116 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3123 nextchr = UCHARAT(locinput);
3127 /* Inline the first character, for speed. */
3128 if (UCHARAT(s) != nextchr &&
3130 (UCHARAT(s) != ((OP(scan) == REFF
3131 ? PL_fold : PL_fold_locale)[nextchr]))))
3133 st->ln = PL_regendp[n] - st->ln;
3134 if (locinput + st->ln > PL_regeol)
3136 if (st->ln > 1 && (OP(scan) == REF
3137 ? memNE(s, locinput, st->ln)
3139 ? ibcmp(s, locinput, st->ln)
3140 : ibcmp_locale(s, locinput, st->ln))))
3143 nextchr = UCHARAT(locinput);
3154 #define ST st->u.eval
3156 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3160 /* execute the code in the {...} */
3162 SV ** const before = SP;
3163 OP_4tree * const oop = PL_op;
3164 COP * const ocurcop = PL_curcop;
3168 PL_op = (OP_4tree*)rex->data->data[n];
3169 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3170 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3171 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3173 CALLRUNOPS(aTHX); /* Scalar context. */
3176 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3183 PAD_RESTORE_LOCAL(old_comppad);
3184 PL_curcop = ocurcop;
3187 sv_setsv(save_scalar(PL_replgv), ret);
3191 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3194 /* extract RE object from returned value; compiling if
3199 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3200 mg = mg_find(sv, PERL_MAGIC_qr);
3201 else if (SvSMAGICAL(ret)) {
3202 if (SvGMAGICAL(ret))
3203 sv_unmagic(ret, PERL_MAGIC_qr);
3205 mg = mg_find(ret, PERL_MAGIC_qr);
3209 re = (regexp *)mg->mg_obj;
3210 (void)ReREFCNT_inc(re);
3214 const char * const t = SvPV_const(ret, len);
3216 const I32 osize = PL_regsize;
3219 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3220 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3222 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3224 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3230 /* run the pattern returned from (??{...}) */
3232 debug_start_match(re, do_utf8, locinput, PL_regeol,
3233 "Matching embedded");
3236 ST.cp = regcppush(0); /* Save *all* the positions. */
3237 REGCP_SET(ST.lastcp);
3238 *PL_reglastparen = 0;
3239 *PL_reglastcloseparen = 0;
3240 PL_reginput = locinput;
3242 /* XXXX This is too dramatic a measure... */
3246 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3247 ((re->reganch & ROPT_UTF8) != 0);
3248 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3253 /* now continue from first node in postoned RE */
3254 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3257 /* /(?(?{...})X|Y)/ */
3258 st->sw = SvTRUE(ret);
3263 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3265 PL_reg_flags ^= RF_utf8;
3268 /* XXXX This is too dramatic a measure... */
3270 /* Restore parens of the caller without popping the
3273 const I32 tmp = PL_savestack_ix;
3274 PL_savestack_ix = ST.lastcp;
3276 PL_savestack_ix = tmp;
3278 PL_reginput = locinput;
3279 /* continue at the node following the (??{...}) */
3283 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3284 /* Restore state to the outer re then re-throw the failure */
3286 PL_reg_flags ^= RF_utf8;
3290 /* XXXX This is too dramatic a measure... */
3293 PL_reginput = locinput;
3294 REGCP_UNWIND(ST.lastcp);
3301 n = ARG(scan); /* which paren pair */
3302 PL_reg_start_tmp[n] = locinput;
3307 n = ARG(scan); /* which paren pair */
3308 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3309 PL_regendp[n] = locinput - PL_bostr;
3310 if (n > (I32)*PL_reglastparen)
3311 *PL_reglastparen = n;
3312 *PL_reglastcloseparen = n;
3315 n = ARG(scan); /* which paren pair */
3316 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3319 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3321 next = NEXTOPER(NEXTOPER(scan));
3323 next = scan + ARG(scan);
3324 if (OP(next) == IFTHEN) /* Fake one. */
3325 next = NEXTOPER(NEXTOPER(next));
3329 st->logical = scan->flags;
3331 /*******************************************************************
3332 cc points to the regmatch_state associated with the most recent CURLYX.
3333 This struct contains info about the innermost (...)* loop (an
3334 "infoblock"), and a pointer to the next outer cc.
3336 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3338 1) After matching Y, regnode for CURLYX is processed;
3340 2) This regnode populates cc, and calls regmatch() recursively
3341 with the starting point at WHILEM node;
3343 3) Each hit of WHILEM node tries to match A and Z (in the order
3344 depending on the current iteration, min/max of {min,max} and
3345 greediness). The information about where are nodes for "A"
3346 and "Z" is read from cc, as is info on how many times "A"
3347 was already matched, and greediness.
3349 4) After A matches, the same WHILEM node is hit again.
3351 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3352 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3353 resets cc, since this Y(A)*Z can be a part of some other loop:
3354 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3355 of the external loop.
3357 Currently present infoblocks form a tree with a stem formed by st->cc
3358 and whatever it mentions via ->next, and additional attached trees
3359 corresponding to temporarily unset infoblocks as in "5" above.
3361 In the following picture, infoblocks for outer loop of
3362 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3363 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3364 infoblocks are drawn below the "reset" infoblock.
3366 In fact in the picture below we do not show failed matches for Z and T
3367 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3368 more obvious *why* one needs to *temporary* unset infoblocks.]
3370 Matched REx position InfoBlocks Comment
3374 Y A)*?Z)*?T x <- O <- I
3375 YA )*?Z)*?T x <- O <- I
3376 YA A)*?Z)*?T x <- O <- I
3377 YAA )*?Z)*?T x <- O <- I
3378 YAA Z)*?T x <- O # Temporary unset I
3381 YAAZ Y(A)*?Z)*?T x <- O
3384 YAAZY (A)*?Z)*?T x <- O
3387 YAAZY A)*?Z)*?T x <- O <- I
3390 YAAZYA )*?Z)*?T x <- O <- I
3393 YAAZYA Z)*?T x <- O # Temporary unset I
3399 YAAZYAZ T x # Temporary unset O
3406 *******************************************************************/
3409 /* No need to save/restore up to this paren */
3410 I32 parenfloor = scan->flags;
3414 CURLYX and WHILEM are always paired: they're the moral
3415 equivalent of pp_enteriter anbd pp_iter.
3417 The only time next could be null is if the node tree is
3418 corrupt. This was mentioned on p5p a few days ago.
3420 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3421 So we'll assert that this is true:
3424 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3426 /* XXXX Probably it is better to teach regpush to support
3427 parenfloor > PL_regsize... */
3428 if (parenfloor > (I32)*PL_reglastparen)
3429 parenfloor = *PL_reglastparen; /* Pessimization... */
3431 st->u.curlyx.cp = PL_savestack_ix;
3432 st->u.curlyx.outercc = st->cc;
3434 /* these fields contain the state of the current curly.
3435 * they are accessed by subsequent WHILEMs;
3436 * cur and lastloc are also updated by WHILEM */
3437 st->u.curlyx.parenfloor = parenfloor;
3438 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3439 st->u.curlyx.min = ARG1(scan);
3440 st->u.curlyx.max = ARG2(scan);
3441 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3442 st->u.curlyx.lastloc = 0;
3443 /* st->next and st->minmod are also read by WHILEM */
3445 PL_reginput = locinput;
3446 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3447 /*** all unsaved local vars undefined at this point */
3448 regcpblow(st->u.curlyx.cp);
3449 st->cc = st->u.curlyx.outercc;
3455 * This is really hard to understand, because after we match
3456 * what we're trying to match, we must make sure the rest of
3457 * the REx is going to match for sure, and to do that we have
3458 * to go back UP the parse tree by recursing ever deeper. And
3459 * if it fails, we have to reset our parent's current state
3460 * that we can try again after backing off.
3465 st->cc gets initialised by CURLYX ready for use by WHILEM.
3466 So again, unless somethings been corrupted, st->cc cannot
3467 be null at that point in WHILEM.
3469 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3470 So we'll assert that this is true:
3473 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3474 st->u.whilem.cache_offset = 0;
3475 st->u.whilem.cache_bit = 0;
3477 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3478 PL_reginput = locinput;
3481 PerlIO_printf(Perl_debug_log,
3482 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3483 REPORT_CODE_OFF+PL_regindent*2, "",
3484 (long)n, (long)st->cc->u.curlyx.min,
3485 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3488 /* If degenerate scan matches "", assume scan done. */
3490 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3491 st->u.whilem.savecc = st->cc;
3492 st->cc = st->cc->u.curlyx.outercc;
3494 st->ln = st->cc->u.curlyx.cur;
3496 PerlIO_printf(Perl_debug_log,
3497 "%*s empty match detected, try continuation...\n",
3498 REPORT_CODE_OFF+PL_regindent*2, "")
3500 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3501 /*** all unsaved local vars undefined at this point */
3502 st->cc = st->u.whilem.savecc;
3505 if (st->cc->u.curlyx.outercc)
3506 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3510 /* First just match a string of min scans. */
3512 if (n < st->cc->u.curlyx.min) {
3513 st->cc->u.curlyx.cur = n;
3514 st->cc->u.curlyx.lastloc = locinput;
3515 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3516 /*** all unsaved local vars undefined at this point */
3519 st->cc->u.curlyx.cur = n - 1;
3520 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3525 /* Check whether we already were at this position.
3526 Postpone detection until we know the match is not
3527 *that* much linear. */
3528 if (!PL_reg_maxiter) {
3529 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3530 /* possible overflow for long strings and many CURLYX's */
3531 if (PL_reg_maxiter < 0)
3532 PL_reg_maxiter = I32_MAX;
3533 PL_reg_leftiter = PL_reg_maxiter;
3535 if (PL_reg_leftiter-- == 0) {
3536 const I32 size = (PL_reg_maxiter + 7)/8;
3537 if (PL_reg_poscache) {
3538 if ((I32)PL_reg_poscache_size < size) {
3539 Renew(PL_reg_poscache, size, char);
3540 PL_reg_poscache_size = size;
3542 Zero(PL_reg_poscache, size, char);
3545 PL_reg_poscache_size = size;
3546 Newxz(PL_reg_poscache, size, char);
3549 PerlIO_printf(Perl_debug_log,
3550 "%sDetected a super-linear match, switching on caching%s...\n",
3551 PL_colors[4], PL_colors[5])
3554 if (PL_reg_leftiter < 0) {
3555 st->u.whilem.cache_offset = locinput - PL_bostr;
3557 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
3558 + st->u.whilem.cache_offset * (scan->flags>>4);
3559 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3560 st->u.whilem.cache_offset /= 8;
3561 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3563 PerlIO_printf(Perl_debug_log,
3564 "%*s already tried at this position...\n",
3565 REPORT_CODE_OFF+PL_regindent*2, "")
3567 sayNO; /* cache records failure */
3572 /* Prefer next over scan for minimal matching. */
3574 if (st->cc->minmod) {
3575 st->u.whilem.savecc = st->cc;
3576 st->cc = st->cc->u.curlyx.outercc;
3578 st->ln = st->cc->u.curlyx.cur;
3579 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3580 REGCP_SET(st->u.whilem.lastcp);
3581 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3582 /*** all unsaved local vars undefined at this point */
3583 st->cc = st->u.whilem.savecc;
3585 regcpblow(st->u.whilem.cp);
3586 sayYES; /* All done. */
3588 REGCP_UNWIND(st->u.whilem.lastcp);
3590 if (st->cc->u.curlyx.outercc)
3591 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3593 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3594 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3595 && !(PL_reg_flags & RF_warned)) {
3596 PL_reg_flags |= RF_warned;
3597 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3598 "Complex regular subexpression recursion",
3605 PerlIO_printf(Perl_debug_log,
3606 "%*s trying longer...\n",
3607 REPORT_CODE_OFF+PL_regindent*2, "")
3609 /* Try scanning more and see if it helps. */
3610 PL_reginput = locinput;
3611 st->cc->u.curlyx.cur = n;
3612 st->cc->u.curlyx.lastloc = locinput;
3613 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3614 REGCP_SET(st->u.whilem.lastcp);
3615 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3616 /*** all unsaved local vars undefined at this point */
3618 regcpblow(st->u.whilem.cp);
3621 REGCP_UNWIND(st->u.whilem.lastcp);
3623 st->cc->u.curlyx.cur = n - 1;
3624 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3628 /* Prefer scan over next for maximal matching. */
3630 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3631 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3632 st->cc->u.curlyx.cur = n;
3633 st->cc->u.curlyx.lastloc = locinput;
3634 REGCP_SET(st->u.whilem.lastcp);
3635 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3636 /*** all unsaved local vars undefined at this point */
3638 regcpblow(st->u.whilem.cp);
3641 REGCP_UNWIND(st->u.whilem.lastcp);
3642 regcppop(rex); /* Restore some previous $<digit>s? */
3643 PL_reginput = locinput;
3645 PerlIO_printf(Perl_debug_log,
3646 "%*s failed, try continuation...\n",
3647 REPORT_CODE_OFF+PL_regindent*2, "")
3650 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3651 && !(PL_reg_flags & RF_warned)) {
3652 PL_reg_flags |= RF_warned;
3653 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3654 "Complex regular subexpression recursion",
3658 /* Failed deeper matches of scan, so see if this one works. */
3659 st->u.whilem.savecc = st->cc;
3660 st->cc = st->cc->u.curlyx.outercc;
3662 st->ln = st->cc->u.curlyx.cur;
3663 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3664 /*** all unsaved local vars undefined at this point */
3665 st->cc = st->u.whilem.savecc;
3668 if (st->cc->u.curlyx.outercc)
3669 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3670 st->cc->u.curlyx.cur = n - 1;
3671 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3677 #define ST st->u.branch
3679 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3680 next = scan + ARG(scan);
3683 scan = NEXTOPER(scan);
3686 case BRANCH: /* /(...|A|...)/ */
3687 scan = NEXTOPER(scan); /* scan now points to inner node */
3688 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3689 /* last branch; skip state push and jump direct to node */
3691 ST.lastparen = *PL_reglastparen;
3692 ST.next_branch = next;
3694 PL_reginput = locinput;
3696 /* Now go into the branch */
3697 PUSH_STATE_GOTO(BRANCH_next, scan);
3700 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3701 REGCP_UNWIND(ST.cp);
3702 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3704 *PL_reglastparen = n;
3705 scan = ST.next_branch;
3706 /* no more branches? */
3707 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3709 continue; /* execute next BRANCH[J] op */
3717 #define ST st->u.curlym
3719 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3721 /* This is an optimisation of CURLYX that enables us to push
3722 * only a single backtracking state, no matter now many matches
3723 * there are in {m,n}. It relies on the pattern being constant
3724 * length, with no parens to influence future backrefs
3728 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3730 /* if paren positive, emulate an OPEN/CLOSE around A */
3732 I32 paren = ST.me->flags;
3733 if (paren > PL_regsize)
3735 if (paren > (I32)*PL_reglastparen)
3736 *PL_reglastparen = paren;
3737 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3743 ST.minmod = st->minmod;
3745 ST.c1 = CHRTEST_UNINIT;
3748 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3751 curlym_do_A: /* execute the A in /A{m,n}B/ */
3752 PL_reginput = locinput;
3753 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3756 case CURLYM_A: /* we've just matched an A */
3757 locinput = st->locinput;
3758 nextchr = UCHARAT(locinput);
3761 /* after first match, determine A's length: u.curlym.alen */
3762 if (ST.count == 1) {
3763 if (PL_reg_match_utf8) {
3765 while (s < PL_reginput) {
3771 ST.alen = PL_reginput - locinput;
3774 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3777 PerlIO_printf(Perl_debug_log,
3778 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
3779 (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
3780 (IV) ST.count, (IV)ST.alen)
3783 locinput = PL_reginput;
3784 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3785 goto curlym_do_A; /* try to match another A */
3786 goto curlym_do_B; /* try to match B */
3788 case CURLYM_A_fail: /* just failed to match an A */
3789 REGCP_UNWIND(ST.cp);
3790 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3793 curlym_do_B: /* execute the B in /A{m,n}B/ */
3794 PL_reginput = locinput;
3795 if (ST.c1 == CHRTEST_UNINIT) {
3796 /* calculate c1 and c2 for possible match of 1st char
3797 * following curly */
3798 ST.c1 = ST.c2 = CHRTEST_VOID;
3799 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3800 regnode *text_node = ST.B;
3801 if (! HAS_TEXT(text_node))
3802 FIND_NEXT_IMPT(text_node);
3803 if (HAS_TEXT(text_node)
3804 && PL_regkind[OP(text_node)] != REF)
3806 ST.c1 = (U8)*STRING(text_node);
3808 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3810 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3811 ? PL_fold_locale[ST.c1]
3818 PerlIO_printf(Perl_debug_log,
3819 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
3820 (int)(REPORT_CODE_OFF+(PL_regindent*2)),
3823 if (ST.c1 != CHRTEST_VOID
3824 && UCHARAT(PL_reginput) != ST.c1
3825 && UCHARAT(PL_reginput) != ST.c2)
3827 /* simulate B failing */
3828 state_num = CURLYM_B_fail;
3829 goto reenter_switch;
3833 /* mark current A as captured */
3834 I32 paren = ST.me->flags;
3837 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3838 PL_regendp[paren] = PL_reginput - PL_bostr;
3841 PL_regendp[paren] = -1;
3843 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
3846 case CURLYM_B_fail: /* just failed to match a B */
3847 REGCP_UNWIND(ST.cp);
3849 if (ST.count == ARG2(ST.me) /* max */)
3851 goto curlym_do_A; /* try to match a further A */
3853 /* backtrack one A */
3854 if (ST.count == ARG1(ST.me) /* min */)
3857 locinput = HOPc(locinput, -ST.alen);
3858 goto curlym_do_B; /* try to match B */
3861 #define ST st->u.curly
3863 #define CURLY_SETPAREN(paren, success) \
3866 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3867 PL_regendp[paren] = locinput - PL_bostr; \
3870 PL_regendp[paren] = -1; \
3873 case STAR: /* /A*B/ where A is width 1 */
3877 scan = NEXTOPER(scan);
3879 case PLUS: /* /A+B/ where A is width 1 */
3883 scan = NEXTOPER(scan);
3885 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3886 ST.paren = scan->flags; /* Which paren to set */
3887 if (ST.paren > PL_regsize)
3888 PL_regsize = ST.paren;
3889 if (ST.paren > (I32)*PL_reglastparen)
3890 *PL_reglastparen = ST.paren;
3891 ST.min = ARG1(scan); /* min to match */
3892 ST.max = ARG2(scan); /* max to match */
3893 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3895 case CURLY: /* /A{m,n}B/ where A is width 1 */
3897 ST.min = ARG1(scan); /* min to match */
3898 ST.max = ARG2(scan); /* max to match */
3899 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3902 * Lookahead to avoid useless match attempts
3903 * when we know what character comes next.
3905 * Used to only do .*x and .*?x, but now it allows
3906 * for )'s, ('s and (?{ ... })'s to be in the way
3907 * of the quantifier and the EXACT-like node. -- japhy
3910 if (ST.min > ST.max) /* XXX make this a compile-time check? */
3912 if (HAS_TEXT(next) || JUMPABLE(next)) {
3914 regnode *text_node = next;
3916 if (! HAS_TEXT(text_node))
3917 FIND_NEXT_IMPT(text_node);
3919 if (! HAS_TEXT(text_node))
3920 ST.c1 = ST.c2 = CHRTEST_VOID;
3922 if (PL_regkind[OP(text_node)] == REF) {
3923 ST.c1 = ST.c2 = CHRTEST_VOID;
3924 goto assume_ok_easy;
3927 s = (U8*)STRING(text_node);
3931 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3932 ST.c2 = PL_fold[ST.c1];
3933 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3934 ST.c2 = PL_fold_locale[ST.c1];
3937 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3938 STRLEN ulen1, ulen2;
3939 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3940 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3942 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3943 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3945 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
3947 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
3951 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
3958 ST.c1 = ST.c2 = CHRTEST_VOID;
3963 PL_reginput = locinput;
3966 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
3969 locinput = PL_reginput;
3971 if (ST.c1 == CHRTEST_VOID)
3972 goto curly_try_B_min;
3974 ST.oldloc = locinput;
3976 /* set ST.maxpos to the furthest point along the
3977 * string that could possibly match */
3978 if (ST.max == REG_INFTY) {
3979 ST.maxpos = PL_regeol - 1;
3981 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
3985 int m = ST.max - ST.min;
3986 for (ST.maxpos = locinput;
3987 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
3988 ST.maxpos += UTF8SKIP(ST.maxpos);
3991 ST.maxpos = locinput + ST.max - ST.min;
3992 if (ST.maxpos >= PL_regeol)
3993 ST.maxpos = PL_regeol - 1;
3995 goto curly_try_B_min_known;
3999 ST.count = regrepeat(rex, ST.A, ST.max);
4000 locinput = PL_reginput;
4001 if (ST.count < ST.min)
4003 if ((ST.count > ST.min)
4004 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4006 /* A{m,n} must come at the end of the string, there's
4007 * no point in backing off ... */
4009 /* ...except that $ and \Z can match before *and* after
4010 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4011 We may back off by one in this case. */
4012 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4016 goto curly_try_B_max;
4021 case CURLY_B_min_known_fail:
4022 /* failed to find B in a non-greedy match where c1,c2 valid */
4023 if (ST.paren && ST.count)
4024 PL_regendp[ST.paren] = -1;
4026 PL_reginput = locinput; /* Could be reset... */
4027 REGCP_UNWIND(ST.cp);
4028 /* Couldn't or didn't -- move forward. */
4029 ST.oldloc = locinput;
4031 locinput += UTF8SKIP(locinput);
4035 curly_try_B_min_known:
4036 /* find the next place where 'B' could work, then call B */
4040 n = (ST.oldloc == locinput) ? 0 : 1;
4041 if (ST.c1 == ST.c2) {
4043 /* set n to utf8_distance(oldloc, locinput) */
4044 while (locinput <= ST.maxpos &&
4045 utf8n_to_uvchr((U8*)locinput,
4046 UTF8_MAXBYTES, &len,
4047 uniflags) != (UV)ST.c1) {
4053 /* set n to utf8_distance(oldloc, locinput) */
4054 while (locinput <= ST.maxpos) {
4056 const UV c = utf8n_to_uvchr((U8*)locinput,
4057 UTF8_MAXBYTES, &len,
4059 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4067 if (ST.c1 == ST.c2) {
4068 while (locinput <= ST.maxpos &&
4069 UCHARAT(locinput) != ST.c1)
4073 while (locinput <= ST.maxpos
4074 && UCHARAT(locinput) != ST.c1
4075 && UCHARAT(locinput) != ST.c2)
4078 n = locinput - ST.oldloc;
4080 if (locinput > ST.maxpos)
4082 /* PL_reginput == oldloc now */
4085 if (regrepeat(rex, ST.A, n) < n)
4088 PL_reginput = locinput;
4089 CURLY_SETPAREN(ST.paren, ST.count);
4090 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4095 case CURLY_B_min_fail:
4096 /* failed to find B in a non-greedy match where c1,c2 invalid */
4097 if (ST.paren && ST.count)
4098 PL_regendp[ST.paren] = -1;
4100 REGCP_UNWIND(ST.cp);
4101 /* failed -- move forward one */
4102 PL_reginput = locinput;
4103 if (regrepeat(rex, ST.A, 1)) {
4105 locinput = PL_reginput;
4106 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4107 ST.count > 0)) /* count overflow ? */
4110 CURLY_SETPAREN(ST.paren, ST.count);
4111 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4119 /* a successful greedy match: now try to match B */
4122 if (ST.c1 != CHRTEST_VOID)
4123 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4124 UTF8_MAXBYTES, 0, uniflags)
4125 : (UV) UCHARAT(PL_reginput);
4126 /* If it could work, try it. */
4127 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4128 CURLY_SETPAREN(ST.paren, ST.count);
4129 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4134 case CURLY_B_max_fail:
4135 /* failed to find B in a greedy match */
4136 if (ST.paren && ST.count)
4137 PL_regendp[ST.paren] = -1;
4139 REGCP_UNWIND(ST.cp);
4141 if (--ST.count < ST.min)
4143 PL_reginput = locinput = HOPc(locinput, -1);
4144 goto curly_try_B_max;
4150 if (locinput < reginfo->till) {
4151 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4152 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4154 (long)(locinput - PL_reg_starttry),
4155 (long)(reginfo->till - PL_reg_starttry),
4157 sayNO_FINAL; /* Cannot match: too short. */
4159 PL_reginput = locinput; /* put where regtry can find it */
4160 sayYES_FINAL; /* Success! */
4162 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4164 PerlIO_printf(Perl_debug_log,
4165 "%*s %ssubpattern success...%s\n",
4166 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
4167 PL_reginput = locinput; /* put where regtry can find it */
4168 sayYES_FINAL; /* Success! */
4171 #define ST st->u.ifmatch
4173 case SUSPEND: /* (?>A) */
4175 PL_reginput = locinput;
4178 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4180 goto ifmatch_trivial_fail_test;
4182 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4184 ifmatch_trivial_fail_test:
4186 char * const s = HOPBACKc(locinput, scan->flags);
4191 st->sw = 1 - (bool)ST.wanted;
4195 next = scan + ARG(scan);
4203 PL_reginput = locinput;
4207 /* execute body of (?...A) */
4208 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4211 case IFMATCH_A_fail: /* body of (?...A) failed */
4212 ST.wanted = !ST.wanted;
4215 case IFMATCH_A: /* body of (?...A) succeeded */
4218 st->sw = (bool)ST.wanted;
4220 else if (!ST.wanted)
4223 if (OP(ST.me) == SUSPEND)
4224 locinput = PL_reginput;
4226 locinput = PL_reginput = st->locinput;
4227 nextchr = UCHARAT(locinput);
4229 scan = ST.me + ARG(ST.me);
4232 continue; /* execute B */
4237 next = scan + ARG(scan);
4242 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4243 PTR2UV(scan), OP(scan));
4244 Perl_croak(aTHX_ "regexp memory corruption");
4252 /* push a state that backtracks on success */
4253 st->u.yes.prev_yes_state = yes_state;
4257 /* push a new regex state, then continue at scan */
4259 regmatch_state *newst;
4262 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log,
4263 "PUSH STATE(%d)\n", depth));
4264 st->locinput = locinput;
4266 if (newst > SLAB_LAST(PL_regmatch_slab))
4267 newst = S_push_slab(aTHX);
4268 PL_regmatch_state = newst;
4270 /* XXX probably don't need to initialise these */
4275 locinput = PL_reginput;
4276 nextchr = UCHARAT(locinput);
4282 /* simulate recursively calling regmatch(), but without actually
4283 * recursing - ie save the current state on the heap rather than on
4284 * the stack, then re-enter the loop. This avoids complex regexes
4285 * blowing the processor stack */
4289 /* push new state */
4290 regmatch_state *oldst = st;
4293 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
4295 /* grab the next free state slot */
4297 if (st > SLAB_LAST(PL_regmatch_slab))
4298 st = S_push_slab(aTHX);
4299 PL_regmatch_state = st;
4303 oldst->locinput = locinput;
4306 locinput = PL_reginput;
4307 nextchr = UCHARAT(locinput);
4320 * We get here only if there's trouble -- normally "case END" is
4321 * the terminating point.
4323 Perl_croak(aTHX_ "corrupted regexp pointers");
4330 /* we have successfully completed a subexpression, but we must now
4331 * pop to the state marked by yes_state and continue from there */
4333 assert(st != yes_state);
4334 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4335 || yes_state > SLAB_LAST(PL_regmatch_slab))
4337 /* not in this slab, pop slab */
4338 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4339 PL_regmatch_slab = PL_regmatch_slab->prev;
4340 st = SLAB_LAST(PL_regmatch_slab);
4342 depth -= (st - yes_state);
4343 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4344 depth+1, depth+(st - yes_state)));
4346 yes_state = st->u.yes.prev_yes_state;
4347 PL_regmatch_state = st;
4349 switch (st->resume_state) {
4353 state_num = st->resume_state;
4354 goto reenter_switch;
4361 Perl_croak(aTHX_ "unexpected yes resume state");
4365 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4366 PL_colors[4], PL_colors[5]));
4373 /* XXX this is duplicate(ish) code to that in the do_no section.
4374 * will disappear when REGFMATCH goes */
4376 /* restore previous state and re-enter */
4377 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4380 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4381 PL_regmatch_slab = PL_regmatch_slab->prev;
4382 st = SLAB_LAST(PL_regmatch_slab);
4384 PL_regmatch_state = st;
4388 locinput= st->locinput;
4389 nextchr = UCHARAT(locinput);
4391 switch (st->resume_state) {
4393 goto resume_point_CURLYX;
4394 case resume_WHILEM1:
4395 goto resume_point_WHILEM1;
4396 case resume_WHILEM2:
4397 goto resume_point_WHILEM2;
4398 case resume_WHILEM3:
4399 goto resume_point_WHILEM3;
4400 case resume_WHILEM4:
4401 goto resume_point_WHILEM4;
4402 case resume_WHILEM5:
4403 goto resume_point_WHILEM5;
4404 case resume_WHILEM6:
4405 goto resume_point_WHILEM6;
4415 case CURLY_B_min_known:
4419 Perl_croak(aTHX_ "regexp resume memory corruption");
4426 PerlIO_printf(Perl_debug_log,
4427 "%*s %sfailed...%s\n",
4428 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
4439 /* there's a previous state to backtrack to */
4440 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4443 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4444 PL_regmatch_slab = PL_regmatch_slab->prev;
4445 st = SLAB_LAST(PL_regmatch_slab);
4447 PL_regmatch_state = st;
4451 locinput= st->locinput;
4452 nextchr = UCHARAT(locinput);
4454 switch (st->resume_state) {
4456 goto resume_point_CURLYX;
4457 case resume_WHILEM1:
4458 goto resume_point_WHILEM1;
4459 case resume_WHILEM2:
4460 goto resume_point_WHILEM2;
4461 case resume_WHILEM3:
4462 goto resume_point_WHILEM3;
4463 case resume_WHILEM4:
4464 goto resume_point_WHILEM4;
4465 case resume_WHILEM5:
4466 goto resume_point_WHILEM5;
4467 case resume_WHILEM6:
4468 goto resume_point_WHILEM6;
4478 case CURLY_B_min_known:
4479 if (yes_state == st)
4480 yes_state = st->u.yes.prev_yes_state;
4481 state_num = st->resume_state + 1; /* failure = success + 1 */
4482 goto reenter_switch;
4485 Perl_croak(aTHX_ "regexp resume memory corruption");
4491 /* restore original high-water mark */
4492 PL_regmatch_slab = orig_slab;
4493 PL_regmatch_state = orig_state;
4495 /* free all slabs above current one */
4496 if (orig_slab->next) {
4497 regmatch_slab *sl = orig_slab->next;
4498 orig_slab->next = NULL;
4500 regmatch_slab * const osl = sl;
4511 - regrepeat - repeatedly match something simple, report how many
4514 * [This routine now assumes that it will only match on things of length 1.
4515 * That was true before, but now we assume scan - reginput is the count,
4516 * rather than incrementing count on every character. [Er, except utf8.]]
4519 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4522 register char *scan;
4524 register char *loceol = PL_regeol;
4525 register I32 hardcount = 0;
4526 register bool do_utf8 = PL_reg_match_utf8;
4529 if (max == REG_INFTY)
4531 else if (max < loceol - scan)
4532 loceol = scan + max;
4537 while (scan < loceol && hardcount < max && *scan != '\n') {
4538 scan += UTF8SKIP(scan);
4542 while (scan < loceol && *scan != '\n')
4549 while (scan < loceol && hardcount < max) {
4550 scan += UTF8SKIP(scan);
4560 case EXACT: /* length of string is 1 */
4562 while (scan < loceol && UCHARAT(scan) == c)
4565 case EXACTF: /* length of string is 1 */
4567 while (scan < loceol &&
4568 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4571 case EXACTFL: /* length of string is 1 */
4572 PL_reg_flags |= RF_tainted;
4574 while (scan < loceol &&
4575 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4581 while (hardcount < max && scan < loceol &&
4582 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4583 scan += UTF8SKIP(scan);
4587 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4594 LOAD_UTF8_CHARCLASS_ALNUM();
4595 while (hardcount < max && scan < loceol &&
4596 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4597 scan += UTF8SKIP(scan);
4601 while (scan < loceol && isALNUM(*scan))
4606 PL_reg_flags |= RF_tainted;
4609 while (hardcount < max && scan < loceol &&
4610 isALNUM_LC_utf8((U8*)scan)) {
4611 scan += UTF8SKIP(scan);
4615 while (scan < loceol && isALNUM_LC(*scan))
4622 LOAD_UTF8_CHARCLASS_ALNUM();
4623 while (hardcount < max && scan < loceol &&
4624 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4625 scan += UTF8SKIP(scan);
4629 while (scan < loceol && !isALNUM(*scan))
4634 PL_reg_flags |= RF_tainted;
4637 while (hardcount < max && scan < loceol &&
4638 !isALNUM_LC_utf8((U8*)scan)) {
4639 scan += UTF8SKIP(scan);
4643 while (scan < loceol && !isALNUM_LC(*scan))
4650 LOAD_UTF8_CHARCLASS_SPACE();
4651 while (hardcount < max && scan < loceol &&
4653 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4654 scan += UTF8SKIP(scan);
4658 while (scan < loceol && isSPACE(*scan))
4663 PL_reg_flags |= RF_tainted;
4666 while (hardcount < max && scan < loceol &&
4667 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4668 scan += UTF8SKIP(scan);
4672 while (scan < loceol && isSPACE_LC(*scan))
4679 LOAD_UTF8_CHARCLASS_SPACE();
4680 while (hardcount < max && scan < loceol &&
4682 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4683 scan += UTF8SKIP(scan);
4687 while (scan < loceol && !isSPACE(*scan))
4692 PL_reg_flags |= RF_tainted;
4695 while (hardcount < max && scan < loceol &&
4696 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4697 scan += UTF8SKIP(scan);
4701 while (scan < loceol && !isSPACE_LC(*scan))
4708 LOAD_UTF8_CHARCLASS_DIGIT();
4709 while (hardcount < max && scan < loceol &&
4710 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4711 scan += UTF8SKIP(scan);
4715 while (scan < loceol && isDIGIT(*scan))
4722 LOAD_UTF8_CHARCLASS_DIGIT();
4723 while (hardcount < max && scan < loceol &&
4724 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4725 scan += UTF8SKIP(scan);
4729 while (scan < loceol && !isDIGIT(*scan))
4733 default: /* Called on something of 0 width. */
4734 break; /* So match right here or not at all. */
4740 c = scan - PL_reginput;
4744 GET_RE_DEBUG_FLAGS_DECL;
4746 SV * const prop = sv_newmortal();
4747 regprop(prog, prop, p);
4748 PerlIO_printf(Perl_debug_log,
4749 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4750 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4758 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
4760 - regclass_swash - prepare the utf8 swash
4764 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4770 const struct reg_data * const data = prog ? prog->data : NULL;
4772 if (data && data->count) {
4773 const U32 n = ARG(node);
4775 if (data->what[n] == 's') {
4776 SV * const rv = (SV*)data->data[n];
4777 AV * const av = (AV*)SvRV((SV*)rv);
4778 SV **const ary = AvARRAY(av);
4781 /* See the end of regcomp.c:S_regclass() for
4782 * documentation of these array elements. */
4785 a = SvROK(ary[1]) ? &ary[1] : 0;
4786 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4790 else if (si && doinit) {
4791 sw = swash_init("utf8", "", si, 1, 0);
4792 (void)av_store(av, 1, sw);
4809 - reginclass - determine if a character falls into a character class
4811 The n is the ANYOF regnode, the p is the target string, lenp
4812 is pointer to the maximum length of how far to go in the p
4813 (if the lenp is zero, UTF8SKIP(p) is used),
4814 do_utf8 tells whether the target string is in UTF-8.
4819 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4822 const char flags = ANYOF_FLAGS(n);
4828 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4829 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4830 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4831 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
4832 if (len == (STRLEN)-1)
4833 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4836 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4837 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4840 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4841 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4844 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4848 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
4851 if (swash_fetch(sw, p, do_utf8))
4853 else if (flags & ANYOF_FOLD) {
4854 if (!match && lenp && av) {
4856 for (i = 0; i <= av_len(av); i++) {
4857 SV* const sv = *av_fetch(av, i, FALSE);
4859 const char * const s = SvPV_const(sv, len);
4861 if (len <= plen && memEQ(s, (char*)p, len)) {
4869 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4872 to_utf8_fold(p, tmpbuf, &tmplen);
4873 if (swash_fetch(sw, tmpbuf, do_utf8))
4879 if (match && lenp && *lenp == 0)
4880 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4882 if (!match && c < 256) {
4883 if (ANYOF_BITMAP_TEST(n, c))
4885 else if (flags & ANYOF_FOLD) {
4888 if (flags & ANYOF_LOCALE) {
4889 PL_reg_flags |= RF_tainted;
4890 f = PL_fold_locale[c];
4894 if (f != c && ANYOF_BITMAP_TEST(n, f))
4898 if (!match && (flags & ANYOF_CLASS)) {
4899 PL_reg_flags |= RF_tainted;
4901 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4902 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4903 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4904 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4905 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4906 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4907 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4908 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4909 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4910 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4911 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4912 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4913 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4914 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4915 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4916 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4917 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4918 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4919 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4920 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4921 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4922 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4923 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4924 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4925 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4926 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4927 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4928 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4929 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4930 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4931 ) /* How's that for a conditional? */
4938 return (flags & ANYOF_INVERT) ? !match : match;
4942 S_reghop3(U8 *s, I32 off, const U8* lim)
4946 while (off-- && s < lim) {
4947 /* XXX could check well-formedness here */
4955 if (UTF8_IS_CONTINUED(*s)) {
4956 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4959 /* XXX could check well-formedness here */
4967 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
4971 while (off-- && s < lim) {
4972 /* XXX could check well-formedness here */
4982 if (UTF8_IS_CONTINUED(*s)) {
4983 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4986 /* XXX could check well-formedness here */
4998 restore_pos(pTHX_ void *arg)
5001 regexp * const rex = (regexp *)arg;
5002 if (PL_reg_eval_set) {
5003 if (PL_reg_oldsaved) {
5004 rex->subbeg = PL_reg_oldsaved;
5005 rex->sublen = PL_reg_oldsavedlen;
5006 #ifdef PERL_OLD_COPY_ON_WRITE
5007 rex->saved_copy = PL_nrs;
5009 RX_MATCH_COPIED_on(rex);
5011 PL_reg_magic->mg_len = PL_reg_oldpos;
5012 PL_reg_eval_set = 0;
5013 PL_curpm = PL_reg_oldcurpm;
5018 S_to_utf8_substr(pTHX_ register regexp *prog)
5020 if (prog->float_substr && !prog->float_utf8) {
5021 SV* const sv = newSVsv(prog->float_substr);
5022 prog->float_utf8 = sv;
5023 sv_utf8_upgrade(sv);
5024 if (SvTAIL(prog->float_substr))
5026 if (prog->float_substr == prog->check_substr)
5027 prog->check_utf8 = sv;
5029 if (prog->anchored_substr && !prog->anchored_utf8) {
5030 SV* const sv = newSVsv(prog->anchored_substr);
5031 prog->anchored_utf8 = sv;
5032 sv_utf8_upgrade(sv);
5033 if (SvTAIL(prog->anchored_substr))
5035 if (prog->anchored_substr == prog->check_substr)
5036 prog->check_utf8 = sv;
5041 S_to_byte_substr(pTHX_ register regexp *prog)
5044 if (prog->float_utf8 && !prog->float_substr) {
5045 SV* sv = newSVsv(prog->float_utf8);
5046 prog->float_substr = sv;
5047 if (sv_utf8_downgrade(sv, TRUE)) {
5048 if (SvTAIL(prog->float_utf8))
5052 prog->float_substr = sv = &PL_sv_undef;
5054 if (prog->float_utf8 == prog->check_utf8)
5055 prog->check_substr = sv;
5057 if (prog->anchored_utf8 && !prog->anchored_substr) {
5058 SV* sv = newSVsv(prog->anchored_utf8);
5059 prog->anchored_substr = sv;
5060 if (sv_utf8_downgrade(sv, TRUE)) {
5061 if (SvTAIL(prog->anchored_utf8))
5065 prog->anchored_substr = sv = &PL_sv_undef;
5067 if (prog->anchored_utf8 == prog->check_utf8)
5068 prog->check_substr = sv;
5074 * c-indentation-style: bsd
5076 * indent-tabs-mode: t
5079 * ex: set ts=8 sts=4 sw=4 noet: