5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
13 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
14 * confused with the original package (see point 3 below). Thanks, Henry!
17 /* Additional note: this code is very heavily munged from Henry's version
18 * in places. In some spots I've traded clarity for efficiency, so don't
19 * blame Henry for some of the lack of readability.
22 /* The names of the functions have been changed from regcomp and
23 * regexec to pregcomp and pregexec in order to avoid conflicts
24 * with the POSIX routines of the same names.
27 #ifdef PERL_EXT_RE_BUILD
28 /* need to replace pregcomp et al, so enable that */
29 # ifndef PERL_IN_XSUB_RE
30 # define PERL_IN_XSUB_RE
32 /* need access to debugger hooks */
33 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
38 #ifdef PERL_IN_XSUB_RE
39 /* We *really* need to overwrite these symbols: */
40 # define Perl_regexec_flags my_regexec
41 # define Perl_regdump my_regdump
42 # define Perl_regprop my_regprop
43 # define Perl_re_intuit_start my_re_intuit_start
44 /* *These* symbols are masked to allow static link. */
45 # define Perl_pregexec my_pregexec
46 # define Perl_reginitcolors my_reginitcolors
47 # define Perl_regclass_swash my_regclass_swash
49 # define PERL_NO_GET_CONTEXT
54 * pregcomp and pregexec -- regsub and regerror are not used in perl
56 * Copyright (c) 1986 by University of Toronto.
57 * Written by Henry Spencer. Not derived from licensed software.
59 * Permission is granted to anyone to use this software for any
60 * purpose on any computer system, and to redistribute it freely,
61 * subject to the following restrictions:
63 * 1. The author is not responsible for the consequences of use of
64 * this software, no matter how awful, even if they arise
67 * 2. The origin of this software must not be misrepresented, either
68 * by explicit claim or by omission.
70 * 3. Altered versions must be plainly marked as such, and must not
71 * be misrepresented as being the original software.
73 **** Alterations to Henry's code are...
75 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
76 **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
78 **** You may distribute under the terms of either the GNU General Public
79 **** License or the Artistic License, as specified in the README file.
81 * Beware that some of this code is subtly aware of the way operator
82 * precedence is structured in regular expressions. Serious changes in
83 * regular-expression syntax might require a total rethink.
86 #define PERL_IN_REGEXEC_C
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define RF_false 16 /* odd number of nested negatives */
97 #define UTF ((PL_reg_flags & RF_utf8) != 0)
99 #define RS_init 1 /* eval environment created */
100 #define RS_set 2 /* replsv value is set */
103 #define STATIC static
106 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
112 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
113 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
115 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
116 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
117 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
119 #define HOPc(pos,off) ((char*)HOP(pos,off))
120 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
122 #define HOPBACK(pos, off) ( \
123 (PL_reg_match_utf8) \
124 ? reghopmaybe((U8*)pos, -off) \
125 : (pos - off >= PL_bostr) \
129 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
131 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
132 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
134 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
135 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
136 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
138 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
140 /* for use after a quantifier and before an EXACT-like node -- japhy */
141 #define JUMPABLE(rn) ( \
142 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
143 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
144 OP(rn) == PLUS || OP(rn) == MINMOD || \
145 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
148 #define HAS_TEXT(rn) ( \
149 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
153 Search for mandatory following text node; for lookahead, the text must
154 follow but for lookbehind (rn->flags != 0) we skip to the next step.
156 #define FIND_NEXT_IMPT(rn) STMT_START { \
157 while (JUMPABLE(rn)) \
158 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
159 rn = NEXTOPER(NEXTOPER(rn)); \
160 else if (OP(rn) == PLUS) \
162 else if (OP(rn) == IFMATCH) \
163 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
164 else rn += NEXT_OFF(rn); \
167 static void restore_pos(pTHX_ void *arg);
170 S_regcppush(pTHX_ I32 parenfloor)
172 int retval = PL_savestack_ix;
173 #define REGCP_PAREN_ELEMS 4
174 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
177 if (paren_elems_to_push < 0)
178 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
180 #define REGCP_OTHER_ELEMS 6
181 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
182 for (p = PL_regsize; p > parenfloor; p--) {
183 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
184 SSPUSHINT(PL_regendp[p]);
185 SSPUSHINT(PL_regstartp[p]);
186 SSPUSHPTR(PL_reg_start_tmp[p]);
189 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
190 SSPUSHINT(PL_regsize);
191 SSPUSHINT(*PL_reglastparen);
192 SSPUSHINT(*PL_reglastcloseparen);
193 SSPUSHPTR(PL_reginput);
194 #define REGCP_FRAME_ELEMS 2
195 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
196 * are needed for the regexp context stack bookkeeping. */
197 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
198 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
203 /* These are needed since we do not localize EVAL nodes: */
204 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
205 " Setting an EVAL scope, savestack=%"IVdf"\n", \
206 (IV)PL_savestack_ix)); cp = PL_savestack_ix
208 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
209 PerlIO_printf(Perl_debug_log, \
210 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
211 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
221 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
223 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
224 i = SSPOPINT; /* Parentheses elements to pop. */
225 input = (char *) SSPOPPTR;
226 *PL_reglastcloseparen = SSPOPINT;
227 *PL_reglastparen = SSPOPINT;
228 PL_regsize = SSPOPINT;
230 /* Now restore the parentheses context. */
231 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
232 i > 0; i -= REGCP_PAREN_ELEMS) {
233 paren = (U32)SSPOPINT;
234 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
235 PL_regstartp[paren] = SSPOPINT;
237 if (paren <= *PL_reglastparen)
238 PL_regendp[paren] = tmps;
240 PerlIO_printf(Perl_debug_log,
241 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
242 (UV)paren, (IV)PL_regstartp[paren],
243 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
244 (IV)PL_regendp[paren],
245 (paren > *PL_reglastparen ? "(no)" : ""));
249 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
250 PerlIO_printf(Perl_debug_log,
251 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
252 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
256 /* It would seem that the similar code in regtry()
257 * already takes care of this, and in fact it is in
258 * a better location to since this code can #if 0-ed out
259 * but the code in regtry() is needed or otherwise tests
260 * requiring null fields (pat.t#187 and split.t#{13,14}
261 * (as of patchlevel 7877) will fail. Then again,
262 * this code seems to be necessary or otherwise
263 * building DynaLoader will fail:
264 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
266 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
267 if ((I32)paren > PL_regsize)
268 PL_regstartp[paren] = -1;
269 PL_regendp[paren] = -1;
276 S_regcp_set_to(pTHX_ I32 ss)
278 I32 tmp = PL_savestack_ix;
280 PL_savestack_ix = ss;
282 PL_savestack_ix = tmp;
286 typedef struct re_cc_state
290 struct re_cc_state *prev;
295 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
297 #define TRYPAREN(paren, n, input) { \
300 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
301 PL_regendp[paren] = input - PL_bostr; \
304 PL_regendp[paren] = -1; \
306 if (regmatch(next)) \
309 PL_regendp[paren] = -1; \
314 * pregexec and friends
318 - pregexec - match a regexp against a string
321 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
322 char *strbeg, I32 minend, SV *screamer, U32 nosave)
323 /* strend: pointer to null at end of string */
324 /* strbeg: real beginning of string */
325 /* minend: end of match must be >=minend after stringarg. */
326 /* nosave: For optimizations. */
329 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
330 nosave ? 0 : REXEC_COPY_STR);
334 S_cache_re(pTHX_ regexp *prog)
336 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
338 PL_regprogram = prog->program;
340 PL_regnpar = prog->nparens;
341 PL_regdata = prog->data;
346 * Need to implement the following flags for reg_anch:
348 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
350 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
351 * INTUIT_AUTORITATIVE_ML
352 * INTUIT_ONCE_NOML - Intuit can match in one location only.
355 * Another flag for this function: SECOND_TIME (so that float substrs
356 * with giant delta may be not rechecked).
359 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
361 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
362 Otherwise, only SvCUR(sv) is used to get strbeg. */
364 /* XXXX We assume that strpos is strbeg unless sv. */
366 /* XXXX Some places assume that there is a fixed substring.
367 An update may be needed if optimizer marks as "INTUITable"
368 RExen without fixed substrings. Similarly, it is assumed that
369 lengths of all the strings are no more than minlen, thus they
370 cannot come from lookahead.
371 (Or minlen should take into account lookahead.) */
373 /* A failure to find a constant substring means that there is no need to make
374 an expensive call to REx engine, thus we celebrate a failure. Similarly,
375 finding a substring too deep into the string means that less calls to
376 regtry() should be needed.
378 REx compiler's optimizer found 4 possible hints:
379 a) Anchored substring;
381 c) Whether we are anchored (beginning-of-line or \G);
382 d) First node (of those at offset 0) which may distingush positions;
383 We use a)b)d) and multiline-part of c), and try to find a position in the
384 string which does not contradict any of them.
387 /* Most of decisions we do here should have been done at compile time.
388 The nodes of the REx which we used for the search should have been
389 deleted from the finite automaton. */
392 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
393 char *strend, U32 flags, re_scream_pos_data *data)
395 register I32 start_shift = 0;
396 /* Should be nonnegative! */
397 register I32 end_shift = 0;
402 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
404 register char *other_last = Nullch; /* other substr checked before this */
405 char *check_at = Nullch; /* check substr found at this pos */
407 char *i_strpos = strpos;
408 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
410 RX_MATCH_UTF8_set(prog,do_utf8);
412 if (prog->reganch & ROPT_UTF8) {
413 DEBUG_r(PerlIO_printf(Perl_debug_log,
414 "UTF-8 regex...\n"));
415 PL_reg_flags |= RF_utf8;
419 char *s = PL_reg_match_utf8 ?
420 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
422 int len = PL_reg_match_utf8 ?
423 strlen(s) : strend - strpos;
426 if (PL_reg_match_utf8)
427 DEBUG_r(PerlIO_printf(Perl_debug_log,
428 "UTF-8 target...\n"));
429 PerlIO_printf(Perl_debug_log,
430 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
431 PL_colors[4],PL_colors[5],PL_colors[0],
434 (strlen(prog->precomp) > 60 ? "..." : ""),
436 (int)(len > 60 ? 60 : len),
438 (len > 60 ? "..." : "")
442 /* CHR_DIST() would be more correct here but it makes things slow. */
443 if (prog->minlen > strend - strpos) {
444 DEBUG_r(PerlIO_printf(Perl_debug_log,
445 "String too short... [re_intuit_start]\n"));
448 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
451 if (!prog->check_utf8 && prog->check_substr)
452 to_utf8_substr(prog);
453 check = prog->check_utf8;
455 if (!prog->check_substr && prog->check_utf8)
456 to_byte_substr(prog);
457 check = prog->check_substr;
459 if (check == &PL_sv_undef) {
460 DEBUG_r(PerlIO_printf(Perl_debug_log,
461 "Non-utf string cannot match utf check string\n"));
464 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
465 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
466 || ( (prog->reganch & ROPT_ANCH_BOL)
467 && !PL_multiline ) ); /* Check after \n? */
470 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
471 | ROPT_IMPLICIT)) /* not a real BOL */
472 /* SvCUR is not set on references: SvRV and SvPVX overlap */
474 && (strpos != strbeg)) {
475 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
478 if (prog->check_offset_min == prog->check_offset_max &&
479 !(prog->reganch & ROPT_CANY_SEEN)) {
480 /* Substring at constant offset from beg-of-str... */
483 s = HOP3c(strpos, prog->check_offset_min, strend);
485 slen = SvCUR(check); /* >= 1 */
487 if ( strend - s > slen || strend - s < slen - 1
488 || (strend - s == slen && strend[-1] != '\n')) {
489 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
492 /* Now should match s[0..slen-2] */
494 if (slen && (*SvPVX(check) != *s
496 && memNE(SvPVX(check), s, slen)))) {
498 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
502 else if (*SvPVX(check) != *s
503 || ((slen = SvCUR(check)) > 1
504 && memNE(SvPVX(check), s, slen)))
506 goto success_at_start;
509 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
511 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
512 end_shift = prog->minlen - start_shift -
513 CHR_SVLEN(check) + (SvTAIL(check) != 0);
515 I32 end = prog->check_offset_max + CHR_SVLEN(check)
516 - (SvTAIL(check) != 0);
517 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
519 if (end_shift < eshift)
523 else { /* Can match at random position */
526 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
527 /* Should be nonnegative! */
528 end_shift = prog->minlen - start_shift -
529 CHR_SVLEN(check) + (SvTAIL(check) != 0);
532 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
534 Perl_croak(aTHX_ "panic: end_shift");
538 /* Find a possible match in the region s..strend by looking for
539 the "check" substring in the region corrected by start/end_shift. */
540 if (flags & REXEC_SCREAM) {
541 I32 p = -1; /* Internal iterator of scream. */
542 I32 *pp = data ? data->scream_pos : &p;
544 if (PL_screamfirst[BmRARE(check)] >= 0
545 || ( BmRARE(check) == '\n'
546 && (BmPREVIOUS(check) == SvCUR(check) - 1)
548 s = screaminstr(sv, check,
549 start_shift + (s - strbeg), end_shift, pp, 0);
552 /* we may be pointing at the wrong string */
553 if (s && RX_MATCH_COPIED(prog))
554 s = strbeg + (s - SvPVX(sv));
556 *data->scream_olds = s;
558 else if (prog->reganch & ROPT_CANY_SEEN)
559 s = fbm_instr((U8*)(s + start_shift),
560 (U8*)(strend - end_shift),
561 check, PL_multiline ? FBMrf_MULTILINE : 0);
563 s = fbm_instr(HOP3(s, start_shift, strend),
564 HOP3(strend, -end_shift, strbeg),
565 check, PL_multiline ? FBMrf_MULTILINE : 0);
567 /* Update the count-of-usability, remove useless subpatterns,
570 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
571 (s ? "Found" : "Did not find"),
572 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
574 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
576 PL_colors[1], (SvTAIL(check) ? "$" : ""),
577 (s ? " at offset " : "...\n") ) );
584 /* Finish the diagnostic message */
585 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
587 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
588 Start with the other substr.
589 XXXX no SCREAM optimization yet - and a very coarse implementation
590 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
591 *always* match. Probably should be marked during compile...
592 Probably it is right to do no SCREAM here...
595 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
596 /* Take into account the "other" substring. */
597 /* XXXX May be hopelessly wrong for UTF... */
600 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
603 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
607 t = s - prog->check_offset_max;
608 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
610 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
615 t = HOP3c(t, prog->anchored_offset, strend);
616 if (t < other_last) /* These positions already checked */
618 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
621 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
622 /* On end-of-str: see comment below. */
623 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
624 if (must == &PL_sv_undef) {
626 DEBUG_r(must = prog->anchored_utf8); /* for debug */
631 HOP3(HOP3(last1, prog->anchored_offset, strend)
632 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
634 PL_multiline ? FBMrf_MULTILINE : 0
636 DEBUG_r(PerlIO_printf(Perl_debug_log,
637 "%s anchored substr `%s%.*s%s'%s",
638 (s ? "Found" : "Contradicts"),
641 - (SvTAIL(must)!=0)),
643 PL_colors[1], (SvTAIL(must) ? "$" : "")));
645 if (last1 >= last2) {
646 DEBUG_r(PerlIO_printf(Perl_debug_log,
647 ", giving up...\n"));
650 DEBUG_r(PerlIO_printf(Perl_debug_log,
651 ", trying floating at offset %ld...\n",
652 (long)(HOP3c(s1, 1, strend) - i_strpos)));
653 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
654 s = HOP3c(last, 1, strend);
658 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
659 (long)(s - i_strpos)));
660 t = HOP3c(s, -prog->anchored_offset, strbeg);
661 other_last = HOP3c(s, 1, strend);
669 else { /* Take into account the floating substring. */
674 t = HOP3c(s, -start_shift, strbeg);
676 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
677 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
678 last = HOP3c(t, prog->float_max_offset, strend);
679 s = HOP3c(t, prog->float_min_offset, strend);
682 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
683 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
684 /* fbm_instr() takes into account exact value of end-of-str
685 if the check is SvTAIL(ed). Since false positives are OK,
686 and end-of-str is not later than strend we are OK. */
687 if (must == &PL_sv_undef) {
689 DEBUG_r(must = prog->float_utf8); /* for debug message */
692 s = fbm_instr((unsigned char*)s,
693 (unsigned char*)last + SvCUR(must)
695 must, PL_multiline ? FBMrf_MULTILINE : 0);
696 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
697 (s ? "Found" : "Contradicts"),
699 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
701 PL_colors[1], (SvTAIL(must) ? "$" : "")));
704 DEBUG_r(PerlIO_printf(Perl_debug_log,
705 ", giving up...\n"));
708 DEBUG_r(PerlIO_printf(Perl_debug_log,
709 ", trying anchored starting at offset %ld...\n",
710 (long)(s1 + 1 - i_strpos)));
712 s = HOP3c(t, 1, strend);
716 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
717 (long)(s - i_strpos)));
718 other_last = s; /* Fix this later. --Hugo */
727 t = s - prog->check_offset_max;
728 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
730 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
732 /* Fixed substring is found far enough so that the match
733 cannot start at strpos. */
735 if (ml_anch && t[-1] != '\n') {
736 /* Eventually fbm_*() should handle this, but often
737 anchored_offset is not 0, so this check will not be wasted. */
738 /* XXXX In the code below we prefer to look for "^" even in
739 presence of anchored substrings. And we search even
740 beyond the found float position. These pessimizations
741 are historical artefacts only. */
743 while (t < strend - prog->minlen) {
745 if (t < check_at - prog->check_offset_min) {
746 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
747 /* Since we moved from the found position,
748 we definitely contradict the found anchored
749 substr. Due to the above check we do not
750 contradict "check" substr.
751 Thus we can arrive here only if check substr
752 is float. Redo checking for "other"=="fixed".
755 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
756 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
757 goto do_other_anchored;
759 /* We don't contradict the found floating substring. */
760 /* XXXX Why not check for STCLASS? */
762 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
763 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
766 /* Position contradicts check-string */
767 /* XXXX probably better to look for check-string
768 than for "\n", so one should lower the limit for t? */
769 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
770 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
771 other_last = strpos = s = t + 1;
776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
777 PL_colors[0],PL_colors[1]));
781 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
782 PL_colors[0],PL_colors[1]));
786 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
789 /* The found string does not prohibit matching at strpos,
790 - no optimization of calling REx engine can be performed,
791 unless it was an MBOL and we are not after MBOL,
792 or a future STCLASS check will fail this. */
794 /* Even in this situation we may use MBOL flag if strpos is offset
795 wrt the start of the string. */
796 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
797 && (strpos != strbeg) && strpos[-1] != '\n'
798 /* May be due to an implicit anchor of m{.*foo} */
799 && !(prog->reganch & ROPT_IMPLICIT))
804 DEBUG_r( if (ml_anch)
805 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
806 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
809 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
811 prog->check_utf8 /* Could be deleted already */
812 && --BmUSEFUL(prog->check_utf8) < 0
813 && (prog->check_utf8 == prog->float_utf8)
815 prog->check_substr /* Could be deleted already */
816 && --BmUSEFUL(prog->check_substr) < 0
817 && (prog->check_substr == prog->float_substr)
820 /* If flags & SOMETHING - do not do it many times on the same match */
821 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
822 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
823 if (do_utf8 ? prog->check_substr : prog->check_utf8)
824 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
825 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
826 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
827 check = Nullsv; /* abort */
829 /* XXXX This is a remnant of the old implementation. It
830 looks wasteful, since now INTUIT can use many
832 prog->reganch &= ~RE_USE_INTUIT;
839 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
840 if (prog->regstclass) {
841 /* minlen == 0 is possible if regstclass is \b or \B,
842 and the fixed substr is ''$.
843 Since minlen is already taken into account, s+1 is before strend;
844 accidentally, minlen >= 1 guaranties no false positives at s + 1
845 even for \b or \B. But (minlen? 1 : 0) below assumes that
846 regstclass does not come from lookahead... */
847 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
848 This leaves EXACTF only, which is dealt with in find_byclass(). */
849 U8* str = (U8*)STRING(prog->regstclass);
850 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
851 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
853 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
854 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
855 : (prog->float_substr || prog->float_utf8
856 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
859 char *startpos = strbeg;
863 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
868 if (endpos == strend) {
869 DEBUG_r( PerlIO_printf(Perl_debug_log,
870 "Could not match STCLASS...\n") );
873 DEBUG_r( PerlIO_printf(Perl_debug_log,
874 "This position contradicts STCLASS...\n") );
875 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
877 /* Contradict one of substrings */
878 if (prog->anchored_substr || prog->anchored_utf8) {
879 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
880 DEBUG_r( what = "anchored" );
882 s = HOP3c(t, 1, strend);
883 if (s + start_shift + end_shift > strend) {
884 /* XXXX Should be taken into account earlier? */
885 DEBUG_r( PerlIO_printf(Perl_debug_log,
886 "Could not match STCLASS...\n") );
891 DEBUG_r( PerlIO_printf(Perl_debug_log,
892 "Looking for %s substr starting at offset %ld...\n",
893 what, (long)(s + start_shift - i_strpos)) );
896 /* Have both, check_string is floating */
897 if (t + start_shift >= check_at) /* Contradicts floating=check */
898 goto retry_floating_check;
899 /* Recheck anchored substring, but not floating... */
903 DEBUG_r( PerlIO_printf(Perl_debug_log,
904 "Looking for anchored substr starting at offset %ld...\n",
905 (long)(other_last - i_strpos)) );
906 goto do_other_anchored;
908 /* Another way we could have checked stclass at the
909 current position only: */
914 DEBUG_r( PerlIO_printf(Perl_debug_log,
915 "Looking for /%s^%s/m starting at offset %ld...\n",
916 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
919 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
921 /* Check is floating subtring. */
922 retry_floating_check:
923 t = check_at - start_shift;
924 DEBUG_r( what = "floating" );
925 goto hop_and_restart;
928 DEBUG_r(PerlIO_printf(Perl_debug_log,
929 "By STCLASS: moving %ld --> %ld\n",
930 (long)(t - i_strpos), (long)(s - i_strpos))
934 DEBUG_r(PerlIO_printf(Perl_debug_log,
935 "Does not contradict STCLASS...\n");
940 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
941 PL_colors[4], (check ? "Guessed" : "Giving up"),
942 PL_colors[5], (long)(s - i_strpos)) );
945 fail_finish: /* Substring not found */
946 if (prog->check_substr || prog->check_utf8) /* could be removed already */
947 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
949 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
950 PL_colors[4],PL_colors[5]));
954 /* We know what class REx starts with. Try to find this position... */
956 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
958 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
962 register STRLEN uskip;
966 register I32 tmp = 1; /* Scratch variable? */
967 register bool do_utf8 = PL_reg_match_utf8;
969 /* We know what class it must start with. */
973 while (s + (uskip = UTF8SKIP(s)) <= strend) {
974 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
975 !UTF8_IS_INVARIANT((U8)s[0]) ?
976 reginclass(c, (U8*)s, 0, do_utf8) :
977 REGINCLASS(c, (U8*)s)) {
978 if (tmp && (norun || regtry(prog, s)))
992 if (REGINCLASS(c, (U8*)s) ||
993 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
994 /* The assignment of 2 is intentional:
995 * for the folded sharp s, the skip is 2. */
996 (skip = SHARP_S_SKIP))) {
997 if (tmp && (norun || regtry(prog, s)))
1009 while (s < strend) {
1010 if (tmp && (norun || regtry(prog, s)))
1019 ln = STR_LEN(c); /* length to match in octets/bytes */
1020 lnc = (I32) ln; /* length to match in characters */
1022 STRLEN ulen1, ulen2;
1024 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1025 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
1027 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1028 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1030 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1031 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1032 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1033 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1035 while (sm < ((U8 *) m + ln)) {
1050 c2 = PL_fold_locale[c1];
1052 e = HOP3c(strend, -((I32)lnc), s);
1055 e = s; /* Due to minlen logic of intuit() */
1057 /* The idea in the EXACTF* cases is to first find the
1058 * first character of the EXACTF* node and then, if
1059 * necessary, case-insensitively compare the full
1060 * text of the node. The c1 and c2 are the first
1061 * characters (though in Unicode it gets a bit
1062 * more complicated because there are more cases
1063 * than just upper and lower: one needs to use
1064 * the so-called folding case for case-insensitive
1065 * matching (called "loose matching" in Unicode).
1066 * ibcmp_utf8() will do just that. */
1070 U8 tmpbuf [UTF8_MAXLEN+1];
1071 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1072 STRLEN len, foldlen;
1075 /* Upper and lower of 1st char are equal -
1076 * probably not a "letter". */
1078 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1080 0 : UTF8_ALLOW_ANY);
1083 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1084 m, (char **)0, ln, (bool)UTF))
1085 && (norun || regtry(prog, s)) )
1088 uvchr_to_utf8(tmpbuf, c);
1089 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1091 && (f == c1 || f == c2)
1092 && (ln == foldlen ||
1093 !ibcmp_utf8((char *) foldbuf,
1094 (char **)0, foldlen, do_utf8,
1096 (char **)0, ln, (bool)UTF))
1097 && (norun || regtry(prog, s)) )
1105 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1107 0 : UTF8_ALLOW_ANY);
1109 /* Handle some of the three Greek sigmas cases.
1110 * Note that not all the possible combinations
1111 * are handled here: some of them are handled
1112 * by the standard folding rules, and some of
1113 * them (the character class or ANYOF cases)
1114 * are handled during compiletime in
1115 * regexec.c:S_regclass(). */
1116 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1117 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1118 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1120 if ( (c == c1 || c == c2)
1122 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1123 m, (char **)0, ln, (bool)UTF))
1124 && (norun || regtry(prog, s)) )
1127 uvchr_to_utf8(tmpbuf, c);
1128 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1130 && (f == c1 || f == c2)
1131 && (ln == foldlen ||
1132 !ibcmp_utf8((char *) foldbuf,
1133 (char **)0, foldlen, do_utf8,
1135 (char **)0, ln, (bool)UTF))
1136 && (norun || regtry(prog, s)) )
1147 && (ln == 1 || !(OP(c) == EXACTF
1149 : ibcmp_locale(s, m, ln)))
1150 && (norun || regtry(prog, s)) )
1156 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1157 && (ln == 1 || !(OP(c) == EXACTF
1159 : ibcmp_locale(s, m, ln)))
1160 && (norun || regtry(prog, s)) )
1167 PL_reg_flags |= RF_tainted;
1174 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1176 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1178 tmp = ((OP(c) == BOUND ?
1179 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1180 LOAD_UTF8_CHARCLASS(alnum,"a");
1181 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1182 if (tmp == !(OP(c) == BOUND ?
1183 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1184 isALNUM_LC_utf8((U8*)s)))
1187 if ((norun || regtry(prog, s)))
1194 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1195 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1196 while (s < strend) {
1198 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1200 if ((norun || regtry(prog, s)))
1206 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1210 PL_reg_flags |= RF_tainted;
1217 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1219 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1221 tmp = ((OP(c) == NBOUND ?
1222 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1223 LOAD_UTF8_CHARCLASS(alnum,"a");
1224 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1225 if (tmp == !(OP(c) == NBOUND ?
1226 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1227 isALNUM_LC_utf8((U8*)s)))
1229 else if ((norun || regtry(prog, s)))
1235 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1236 tmp = ((OP(c) == NBOUND ?
1237 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1238 while (s < strend) {
1240 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1242 else if ((norun || regtry(prog, s)))
1247 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1252 LOAD_UTF8_CHARCLASS(alnum,"a");
1253 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1254 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1255 if (tmp && (norun || regtry(prog, s)))
1266 while (s < strend) {
1268 if (tmp && (norun || regtry(prog, s)))
1280 PL_reg_flags |= RF_tainted;
1282 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1283 if (isALNUM_LC_utf8((U8*)s)) {
1284 if (tmp && (norun || regtry(prog, s)))
1295 while (s < strend) {
1296 if (isALNUM_LC(*s)) {
1297 if (tmp && (norun || regtry(prog, s)))
1310 LOAD_UTF8_CHARCLASS(alnum,"a");
1311 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1312 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1313 if (tmp && (norun || regtry(prog, s)))
1324 while (s < strend) {
1326 if (tmp && (norun || regtry(prog, s)))
1338 PL_reg_flags |= RF_tainted;
1340 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1341 if (!isALNUM_LC_utf8((U8*)s)) {
1342 if (tmp && (norun || regtry(prog, s)))
1353 while (s < strend) {
1354 if (!isALNUM_LC(*s)) {
1355 if (tmp && (norun || regtry(prog, s)))
1368 LOAD_UTF8_CHARCLASS(space," ");
1369 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1370 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1371 if (tmp && (norun || regtry(prog, s)))
1382 while (s < strend) {
1384 if (tmp && (norun || regtry(prog, s)))
1396 PL_reg_flags |= RF_tainted;
1398 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1399 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1400 if (tmp && (norun || regtry(prog, s)))
1411 while (s < strend) {
1412 if (isSPACE_LC(*s)) {
1413 if (tmp && (norun || regtry(prog, s)))
1426 LOAD_UTF8_CHARCLASS(space," ");
1427 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1428 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1429 if (tmp && (norun || regtry(prog, s)))
1440 while (s < strend) {
1442 if (tmp && (norun || regtry(prog, s)))
1454 PL_reg_flags |= RF_tainted;
1456 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1457 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1458 if (tmp && (norun || regtry(prog, s)))
1469 while (s < strend) {
1470 if (!isSPACE_LC(*s)) {
1471 if (tmp && (norun || regtry(prog, s)))
1484 LOAD_UTF8_CHARCLASS(digit,"0");
1485 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1486 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1487 if (tmp && (norun || regtry(prog, s)))
1498 while (s < strend) {
1500 if (tmp && (norun || regtry(prog, s)))
1512 PL_reg_flags |= RF_tainted;
1514 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1515 if (isDIGIT_LC_utf8((U8*)s)) {
1516 if (tmp && (norun || regtry(prog, s)))
1527 while (s < strend) {
1528 if (isDIGIT_LC(*s)) {
1529 if (tmp && (norun || regtry(prog, s)))
1542 LOAD_UTF8_CHARCLASS(digit,"0");
1543 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1544 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1545 if (tmp && (norun || regtry(prog, s)))
1556 while (s < strend) {
1558 if (tmp && (norun || regtry(prog, s)))
1570 PL_reg_flags |= RF_tainted;
1572 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1573 if (!isDIGIT_LC_utf8((U8*)s)) {
1574 if (tmp && (norun || regtry(prog, s)))
1585 while (s < strend) {
1586 if (!isDIGIT_LC(*s)) {
1587 if (tmp && (norun || regtry(prog, s)))
1599 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1608 - regexec_flags - match a regexp against a string
1611 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1612 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1613 /* strend: pointer to null at end of string */
1614 /* strbeg: real beginning of string */
1615 /* minend: end of match must be >=minend after stringarg. */
1616 /* data: May be used for some additional optimizations. */
1617 /* nosave: For optimizations. */
1620 register regnode *c;
1621 register char *startpos = stringarg;
1622 I32 minlen; /* must match at least this many chars */
1623 I32 dontbother = 0; /* how many characters not to try at end */
1624 /* I32 start_shift = 0; */ /* Offset of the start to find
1625 constant substr. */ /* CC */
1626 I32 end_shift = 0; /* Same for the end. */ /* CC */
1627 I32 scream_pos = -1; /* Internal iterator of scream. */
1629 SV* oreplsv = GvSV(PL_replgv);
1630 bool do_utf8 = DO_UTF8(sv);
1632 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1633 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1635 RX_MATCH_UTF8_set(prog,do_utf8);
1641 PL_regnarrate = DEBUG_r_TEST;
1644 /* Be paranoid... */
1645 if (prog == NULL || startpos == NULL) {
1646 Perl_croak(aTHX_ "NULL regexp parameter");
1650 minlen = prog->minlen;
1651 if (strend - startpos < minlen) {
1652 DEBUG_r(PerlIO_printf(Perl_debug_log,
1653 "String too short [regexec_flags]...\n"));
1657 /* Check validity of program. */
1658 if (UCHARAT(prog->program) != REG_MAGIC) {
1659 Perl_croak(aTHX_ "corrupted regexp program");
1663 PL_reg_eval_set = 0;
1666 if (prog->reganch & ROPT_UTF8)
1667 PL_reg_flags |= RF_utf8;
1669 /* Mark beginning of line for ^ and lookbehind. */
1670 PL_regbol = startpos;
1674 /* Mark end of line for $ (and such) */
1677 /* see how far we have to get to not match where we matched before */
1678 PL_regtill = startpos+minend;
1680 /* We start without call_cc context. */
1683 /* If there is a "must appear" string, look for it. */
1686 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1689 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1690 PL_reg_ganch = startpos;
1691 else if (sv && SvTYPE(sv) >= SVt_PVMG
1693 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1694 && mg->mg_len >= 0) {
1695 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1696 if (prog->reganch & ROPT_ANCH_GPOS) {
1697 if (s > PL_reg_ganch)
1702 else /* pos() not defined */
1703 PL_reg_ganch = strbeg;
1706 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1707 re_scream_pos_data d;
1709 d.scream_olds = &scream_olds;
1710 d.scream_pos = &scream_pos;
1711 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1713 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1714 goto phooey; /* not present */
1720 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1721 UNI_DISPLAY_REGEX) :
1723 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1724 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1725 UNI_DISPLAY_REGEX) : startpos;
1726 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1729 PerlIO_printf(Perl_debug_log,
1730 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1731 PL_colors[4],PL_colors[5],PL_colors[0],
1734 len0 > 60 ? "..." : "",
1736 (int)(len1 > 60 ? 60 : len1),
1738 (len1 > 60 ? "..." : "")
1742 /* Simplest case: anchored match need be tried only once. */
1743 /* [unless only anchor is BOL and multiline is set] */
1744 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1745 if (s == startpos && regtry(prog, startpos))
1747 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1748 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1753 dontbother = minlen - 1;
1754 end = HOP3c(strend, -dontbother, strbeg) - 1;
1755 /* for multiline we only have to try after newlines */
1756 if (prog->check_substr || prog->check_utf8) {
1760 if (regtry(prog, s))
1765 if (prog->reganch & RE_USE_INTUIT) {
1766 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1777 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1778 if (regtry(prog, s))
1785 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1786 if (regtry(prog, PL_reg_ganch))
1791 /* Messy cases: unanchored match. */
1792 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1793 /* we have /x+whatever/ */
1794 /* it must be a one character string (XXXX Except UTF?) */
1799 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1800 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1801 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1804 while (s < strend) {
1806 DEBUG_r( did_match = 1 );
1807 if (regtry(prog, s)) goto got_it;
1809 while (s < strend && *s == ch)
1816 while (s < strend) {
1818 DEBUG_r( did_match = 1 );
1819 if (regtry(prog, s)) goto got_it;
1821 while (s < strend && *s == ch)
1827 DEBUG_r(if (!did_match)
1828 PerlIO_printf(Perl_debug_log,
1829 "Did not find anchored character...\n")
1833 else if (prog->anchored_substr != Nullsv
1834 || prog->anchored_utf8 != Nullsv
1835 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1836 && prog->float_max_offset < strend - s)) {
1841 char *last1; /* Last position checked before */
1845 if (prog->anchored_substr || prog->anchored_utf8) {
1846 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1847 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1848 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1849 back_max = back_min = prog->anchored_offset;
1851 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1852 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1853 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1854 back_max = prog->float_max_offset;
1855 back_min = prog->float_min_offset;
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1861 last = HOP3c(strend, /* Cannot start after this */
1862 -(I32)(CHR_SVLEN(must)
1863 - (SvTAIL(must) != 0) + back_min), strbeg);
1866 last1 = HOPc(s, -1);
1868 last1 = s - 1; /* bogus */
1870 /* XXXX check_substr already used to find `s', can optimize if
1871 check_substr==must. */
1873 dontbother = end_shift;
1874 strend = HOPc(strend, -dontbother);
1875 while ( (s <= last) &&
1876 ((flags & REXEC_SCREAM)
1877 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1878 end_shift, &scream_pos, 0))
1879 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1880 (unsigned char*)strend, must,
1881 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1882 /* we may be pointing at the wrong string */
1883 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1884 s = strbeg + (s - SvPVX(sv));
1885 DEBUG_r( did_match = 1 );
1886 if (HOPc(s, -back_max) > last1) {
1887 last1 = HOPc(s, -back_min);
1888 s = HOPc(s, -back_max);
1891 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1893 last1 = HOPc(s, -back_min);
1897 while (s <= last1) {
1898 if (regtry(prog, s))
1904 while (s <= last1) {
1905 if (regtry(prog, s))
1911 DEBUG_r(if (!did_match)
1912 PerlIO_printf(Perl_debug_log,
1913 "Did not find %s substr `%s%.*s%s'%s...\n",
1914 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1915 ? "anchored" : "floating"),
1917 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1919 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1923 else if ((c = prog->regstclass)) {
1925 I32 op = (U8)OP(prog->regstclass);
1926 /* don't bother with what can't match */
1927 if (PL_regkind[op] != EXACT && op != CANY)
1928 strend = HOPc(strend, -(minlen - 1));
1931 SV *prop = sv_newmortal();
1939 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1940 UNI_DISPLAY_REGEX) :
1942 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1944 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1945 len1 = UTF ? SvCUR(dsv1) : strend - s;
1946 PerlIO_printf(Perl_debug_log,
1947 "Matching stclass `%*.*s' against `%*.*s'\n",
1951 if (find_byclass(prog, c, s, strend, startpos, 0))
1953 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1957 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1962 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1963 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1964 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1966 if (flags & REXEC_SCREAM) {
1967 last = screaminstr(sv, float_real, s - strbeg,
1968 end_shift, &scream_pos, 1); /* last one */
1970 last = scream_olds; /* Only one occurrence. */
1971 /* we may be pointing at the wrong string */
1972 else if (RX_MATCH_COPIED(prog))
1973 s = strbeg + (s - SvPVX(sv));
1977 char *little = SvPV(float_real, len);
1979 if (SvTAIL(float_real)) {
1980 if (memEQ(strend - len + 1, little, len - 1))
1981 last = strend - len + 1;
1982 else if (!PL_multiline)
1983 last = memEQ(strend - len, little, len)
1984 ? strend - len : Nullch;
1990 last = rninstr(s, strend, little, little + len);
1992 last = strend; /* matching `$' */
1996 DEBUG_r(PerlIO_printf(Perl_debug_log,
1997 "%sCan't trim the tail, match fails (should not happen)%s\n",
1998 PL_colors[4],PL_colors[5]));
1999 goto phooey; /* Should not happen! */
2001 dontbother = strend - last + prog->float_min_offset;
2003 if (minlen && (dontbother < minlen))
2004 dontbother = minlen - 1;
2005 strend -= dontbother; /* this one's always in bytes! */
2006 /* We don't know much -- general case. */
2009 if (regtry(prog, s))
2018 if (regtry(prog, s))
2020 } while (s++ < strend);
2028 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2030 if (PL_reg_eval_set) {
2031 /* Preserve the current value of $^R */
2032 if (oreplsv != GvSV(PL_replgv))
2033 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2034 restored, the value remains
2036 restore_pos(aTHX_ 0);
2039 /* make sure $`, $&, $', and $digit will work later */
2040 if ( !(flags & REXEC_NOT_FIRST) ) {
2041 RX_MATCH_COPY_FREE(prog);
2042 if (flags & REXEC_COPY_STR) {
2043 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2044 #ifdef PERL_COPY_ON_WRITE
2046 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2048 PerlIO_printf(Perl_debug_log,
2049 "Copy on write: regexp capture, type %d\n",
2052 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2053 prog->subbeg = SvPVX(prog->saved_copy);
2054 assert (SvPOKp(prog->saved_copy));
2058 RX_MATCH_COPIED_on(prog);
2059 s = savepvn(strbeg, i);
2065 prog->subbeg = strbeg;
2066 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2073 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2074 PL_colors[4],PL_colors[5]));
2075 if (PL_reg_eval_set)
2076 restore_pos(aTHX_ 0);
2081 - regtry - try match at specific point
2083 STATIC I32 /* 0 failure, 1 success */
2084 S_regtry(pTHX_ regexp *prog, char *startpos)
2092 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2094 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2097 PL_reg_eval_set = RS_init;
2099 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2100 (IV)(PL_stack_sp - PL_stack_base));
2102 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2103 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2104 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2106 /* Apparently this is not needed, judging by wantarray. */
2107 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2108 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2111 /* Make $_ available to executed code. */
2112 if (PL_reg_sv != DEFSV) {
2117 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2118 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2119 /* prepare for quick setting of pos */
2120 sv_magic(PL_reg_sv, (SV*)0,
2121 PERL_MAGIC_regex_global, Nullch, 0);
2122 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2126 PL_reg_oldpos = mg->mg_len;
2127 SAVEDESTRUCTOR_X(restore_pos, 0);
2129 if (!PL_reg_curpm) {
2130 Newz(22,PL_reg_curpm, 1, PMOP);
2133 SV* repointer = newSViv(0);
2134 /* so we know which PL_regex_padav element is PL_reg_curpm */
2135 SvFLAGS(repointer) |= SVf_BREAK;
2136 av_push(PL_regex_padav,repointer);
2137 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2138 PL_regex_pad = AvARRAY(PL_regex_padav);
2142 PM_SETRE(PL_reg_curpm, prog);
2143 PL_reg_oldcurpm = PL_curpm;
2144 PL_curpm = PL_reg_curpm;
2145 if (RX_MATCH_COPIED(prog)) {
2146 /* Here is a serious problem: we cannot rewrite subbeg,
2147 since it may be needed if this match fails. Thus
2148 $` inside (?{}) could fail... */
2149 PL_reg_oldsaved = prog->subbeg;
2150 PL_reg_oldsavedlen = prog->sublen;
2151 #ifdef PERL_COPY_ON_WRITE
2152 PL_nrs = prog->saved_copy;
2154 RX_MATCH_COPIED_off(prog);
2157 PL_reg_oldsaved = Nullch;
2158 prog->subbeg = PL_bostr;
2159 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2161 prog->startp[0] = startpos - PL_bostr;
2162 PL_reginput = startpos;
2163 PL_regstartp = prog->startp;
2164 PL_regendp = prog->endp;
2165 PL_reglastparen = &prog->lastparen;
2166 PL_reglastcloseparen = &prog->lastcloseparen;
2167 prog->lastparen = 0;
2168 prog->lastcloseparen = 0;
2170 DEBUG_r(PL_reg_starttry = startpos);
2171 if (PL_reg_start_tmpl <= prog->nparens) {
2172 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2173 if(PL_reg_start_tmp)
2174 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2176 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2179 /* XXXX What this code is doing here?!!! There should be no need
2180 to do this again and again, PL_reglastparen should take care of
2183 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2184 * Actually, the code in regcppop() (which Ilya may be meaning by
2185 * PL_reglastparen), is not needed at all by the test suite
2186 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2187 * enough, for building DynaLoader, or otherwise this
2188 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2189 * will happen. Meanwhile, this code *is* needed for the
2190 * above-mentioned test suite tests to succeed. The common theme
2191 * on those tests seems to be returning null fields from matches.
2196 if (prog->nparens) {
2197 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2204 if (regmatch(prog->program + 1)) {
2205 prog->endp[0] = PL_reginput - PL_bostr;
2208 REGCP_UNWIND(lastcp);
2212 #define RE_UNWIND_BRANCH 1
2213 #define RE_UNWIND_BRANCHJ 2
2217 typedef struct { /* XX: makes sense to enlarge it... */
2221 } re_unwind_generic_t;
2234 } re_unwind_branch_t;
2236 typedef union re_unwind_t {
2238 re_unwind_generic_t generic;
2239 re_unwind_branch_t branch;
2242 #define sayYES goto yes
2243 #define sayNO goto no
2244 #define sayNO_ANYOF goto no_anyof
2245 #define sayYES_FINAL goto yes_final
2246 #define sayYES_LOUD goto yes_loud
2247 #define sayNO_FINAL goto no_final
2248 #define sayNO_SILENT goto do_no
2249 #define saySAME(x) if (x) goto yes; else goto no
2251 #define REPORT_CODE_OFF 24
2254 - regmatch - main matching routine
2256 * Conceptually the strategy is simple: check to see whether the current
2257 * node matches, call self recursively to see whether the rest matches,
2258 * and then act accordingly. In practice we make some effort to avoid
2259 * recursion, in particular by going through "ordinary" nodes (that don't
2260 * need to know whether the rest of the match failed) by a loop instead of
2263 /* [lwall] I've hoisted the register declarations to the outer block in order to
2264 * maybe save a little bit of pushing and popping on the stack. It also takes
2265 * advantage of machines that use a register save mask on subroutine entry.
2267 STATIC I32 /* 0 failure, 1 success */
2268 S_regmatch(pTHX_ regnode *prog)
2270 register regnode *scan; /* Current node. */
2271 regnode *next; /* Next node. */
2272 regnode *inner; /* Next node in internal branch. */
2273 register I32 nextchr; /* renamed nextchr - nextchar colides with
2274 function of same name */
2275 register I32 n; /* no or next */
2276 register I32 ln = 0; /* len or last */
2277 register char *s = Nullch; /* operand or save */
2278 register char *locinput = PL_reginput;
2279 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2280 int minmod = 0, sw = 0, logical = 0;
2283 I32 firstcp = PL_savestack_ix;
2285 register bool do_utf8 = PL_reg_match_utf8;
2287 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2288 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2289 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2296 /* Note that nextchr is a byte even in UTF */
2297 nextchr = UCHARAT(locinput);
2299 while (scan != NULL) {
2302 SV *prop = sv_newmortal();
2303 int docolor = *PL_colors[0];
2304 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2305 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2306 /* The part of the string before starttry has one color
2307 (pref0_len chars), between starttry and current
2308 position another one (pref_len - pref0_len chars),
2309 after the current position the third one.
2310 We assume that pref0_len <= pref_len, otherwise we
2311 decrease pref0_len. */
2312 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2313 ? (5 + taill) - l : locinput - PL_bostr;
2316 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2318 pref0_len = pref_len - (locinput - PL_reg_starttry);
2319 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2320 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2321 ? (5 + taill) - pref_len : PL_regeol - locinput);
2322 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2326 if (pref0_len > pref_len)
2327 pref0_len = pref_len;
2328 regprop(prop, scan);
2331 do_utf8 && OP(scan) != CANY ?
2332 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2333 pref0_len, 60, UNI_DISPLAY_REGEX) :
2334 locinput - pref_len;
2335 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2336 char *s1 = do_utf8 && OP(scan) != CANY ?
2337 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2338 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2339 locinput - pref_len + pref0_len;
2340 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2341 char *s2 = do_utf8 && OP(scan) != CANY ?
2342 pv_uni_display(dsv2, (U8*)locinput,
2343 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2345 int len2 = do_utf8 ? strlen(s2) : l;
2346 PerlIO_printf(Perl_debug_log,
2347 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2348 (IV)(locinput - PL_bostr),
2355 (docolor ? "" : "> <"),
2359 15 - l - pref_len + 1,
2361 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2366 next = scan + NEXT_OFF(scan);
2372 if (locinput == PL_bostr || (PL_multiline &&
2373 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2375 /* regtill = regbol; */
2380 if (locinput == PL_bostr ||
2381 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2387 if (locinput == PL_bostr)
2391 if (locinput == PL_reg_ganch)
2401 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2406 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2408 if (PL_regeol - locinput > 1)
2412 if (PL_regeol != locinput)
2416 if (!nextchr && locinput >= PL_regeol)
2419 locinput += PL_utf8skip[nextchr];
2420 if (locinput > PL_regeol)
2422 nextchr = UCHARAT(locinput);
2425 nextchr = UCHARAT(++locinput);
2428 if (!nextchr && locinput >= PL_regeol)
2430 nextchr = UCHARAT(++locinput);
2433 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2436 locinput += PL_utf8skip[nextchr];
2437 if (locinput > PL_regeol)
2439 nextchr = UCHARAT(locinput);
2442 nextchr = UCHARAT(++locinput);
2447 if (do_utf8 != UTF) {
2448 /* The target and the pattern have differing utf8ness. */
2454 /* The target is utf8, the pattern is not utf8. */
2458 if (NATIVE_TO_UNI(*(U8*)s) !=
2459 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2461 0 : UTF8_ALLOW_ANY))
2468 /* The target is not utf8, the pattern is utf8. */
2472 if (NATIVE_TO_UNI(*((U8*)l)) !=
2473 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2475 0 : UTF8_ALLOW_ANY))
2482 nextchr = UCHARAT(locinput);
2485 /* The target and the pattern have the same utf8ness. */
2486 /* Inline the first character, for speed. */
2487 if (UCHARAT(s) != nextchr)
2489 if (PL_regeol - locinput < ln)
2491 if (ln > 1 && memNE(s, locinput, ln))
2494 nextchr = UCHARAT(locinput);
2497 PL_reg_flags |= RF_tainted;
2503 if (do_utf8 || UTF) {
2504 /* Either target or the pattern are utf8. */
2506 char *e = PL_regeol;
2508 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2509 l, &e, 0, do_utf8)) {
2510 /* One more case for the sharp s:
2511 * pack("U0U*", 0xDF) =~ /ss/i,
2512 * the 0xC3 0x9F are the UTF-8
2513 * byte sequence for the U+00DF. */
2515 toLOWER(s[0]) == 's' &&
2517 toLOWER(s[1]) == 's' &&
2524 nextchr = UCHARAT(locinput);
2528 /* Neither the target and the pattern are utf8. */
2530 /* Inline the first character, for speed. */
2531 if (UCHARAT(s) != nextchr &&
2532 UCHARAT(s) != ((OP(scan) == EXACTF)
2533 ? PL_fold : PL_fold_locale)[nextchr])
2535 if (PL_regeol - locinput < ln)
2537 if (ln > 1 && (OP(scan) == EXACTF
2538 ? ibcmp(s, locinput, ln)
2539 : ibcmp_locale(s, locinput, ln)))
2542 nextchr = UCHARAT(locinput);
2546 STRLEN inclasslen = PL_regeol - locinput;
2548 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2550 if (locinput >= PL_regeol)
2552 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2553 nextchr = UCHARAT(locinput);
2558 nextchr = UCHARAT(locinput);
2559 if (!REGINCLASS(scan, (U8*)locinput))
2561 if (!nextchr && locinput >= PL_regeol)
2563 nextchr = UCHARAT(++locinput);
2567 /* If we might have the case of the German sharp s
2568 * in a casefolding Unicode character class. */
2570 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2571 locinput += SHARP_S_SKIP;
2572 nextchr = UCHARAT(locinput);
2578 PL_reg_flags |= RF_tainted;
2584 LOAD_UTF8_CHARCLASS(alnum,"a");
2585 if (!(OP(scan) == ALNUM
2586 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2587 : isALNUM_LC_utf8((U8*)locinput)))
2591 locinput += PL_utf8skip[nextchr];
2592 nextchr = UCHARAT(locinput);
2595 if (!(OP(scan) == ALNUM
2596 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2598 nextchr = UCHARAT(++locinput);
2601 PL_reg_flags |= RF_tainted;
2604 if (!nextchr && locinput >= PL_regeol)
2607 LOAD_UTF8_CHARCLASS(alnum,"a");
2608 if (OP(scan) == NALNUM
2609 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2610 : isALNUM_LC_utf8((U8*)locinput))
2614 locinput += PL_utf8skip[nextchr];
2615 nextchr = UCHARAT(locinput);
2618 if (OP(scan) == NALNUM
2619 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2621 nextchr = UCHARAT(++locinput);
2625 PL_reg_flags |= RF_tainted;
2629 /* was last char in word? */
2631 if (locinput == PL_bostr)
2634 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2636 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
2638 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2639 ln = isALNUM_uni(ln);
2640 LOAD_UTF8_CHARCLASS(alnum,"a");
2641 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2644 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2645 n = isALNUM_LC_utf8((U8*)locinput);
2649 ln = (locinput != PL_bostr) ?
2650 UCHARAT(locinput - 1) : '\n';
2651 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2653 n = isALNUM(nextchr);
2656 ln = isALNUM_LC(ln);
2657 n = isALNUM_LC(nextchr);
2660 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2661 OP(scan) == BOUNDL))
2665 PL_reg_flags |= RF_tainted;
2671 if (UTF8_IS_CONTINUED(nextchr)) {
2672 LOAD_UTF8_CHARCLASS(space," ");
2673 if (!(OP(scan) == SPACE
2674 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2675 : isSPACE_LC_utf8((U8*)locinput)))
2679 locinput += PL_utf8skip[nextchr];
2680 nextchr = UCHARAT(locinput);
2683 if (!(OP(scan) == SPACE
2684 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2686 nextchr = UCHARAT(++locinput);
2689 if (!(OP(scan) == SPACE
2690 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2692 nextchr = UCHARAT(++locinput);
2696 PL_reg_flags |= RF_tainted;
2699 if (!nextchr && locinput >= PL_regeol)
2702 LOAD_UTF8_CHARCLASS(space," ");
2703 if (OP(scan) == NSPACE
2704 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2705 : isSPACE_LC_utf8((U8*)locinput))
2709 locinput += PL_utf8skip[nextchr];
2710 nextchr = UCHARAT(locinput);
2713 if (OP(scan) == NSPACE
2714 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2716 nextchr = UCHARAT(++locinput);
2719 PL_reg_flags |= RF_tainted;
2725 LOAD_UTF8_CHARCLASS(digit,"0");
2726 if (!(OP(scan) == DIGIT
2727 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2728 : isDIGIT_LC_utf8((U8*)locinput)))
2732 locinput += PL_utf8skip[nextchr];
2733 nextchr = UCHARAT(locinput);
2736 if (!(OP(scan) == DIGIT
2737 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2739 nextchr = UCHARAT(++locinput);
2742 PL_reg_flags |= RF_tainted;
2745 if (!nextchr && locinput >= PL_regeol)
2748 LOAD_UTF8_CHARCLASS(digit,"0");
2749 if (OP(scan) == NDIGIT
2750 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2751 : isDIGIT_LC_utf8((U8*)locinput))
2755 locinput += PL_utf8skip[nextchr];
2756 nextchr = UCHARAT(locinput);
2759 if (OP(scan) == NDIGIT
2760 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2762 nextchr = UCHARAT(++locinput);
2765 if (locinput >= PL_regeol)
2768 LOAD_UTF8_CHARCLASS(mark,"~");
2769 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2771 locinput += PL_utf8skip[nextchr];
2772 while (locinput < PL_regeol &&
2773 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2774 locinput += UTF8SKIP(locinput);
2775 if (locinput > PL_regeol)
2780 nextchr = UCHARAT(locinput);
2783 PL_reg_flags |= RF_tainted;
2787 n = ARG(scan); /* which paren pair */
2788 ln = PL_regstartp[n];
2789 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2790 if ((I32)*PL_reglastparen < n || ln == -1)
2791 sayNO; /* Do not match unless seen CLOSEn. */
2792 if (ln == PL_regendp[n])
2796 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2798 char *e = PL_bostr + PL_regendp[n];
2800 * Note that we can't do the "other character" lookup trick as
2801 * in the 8-bit case (no pun intended) because in Unicode we
2802 * have to map both upper and title case to lower case.
2804 if (OP(scan) == REFF) {
2805 STRLEN ulen1, ulen2;
2806 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2807 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
2811 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2812 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2813 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2820 nextchr = UCHARAT(locinput);
2824 /* Inline the first character, for speed. */
2825 if (UCHARAT(s) != nextchr &&
2827 (UCHARAT(s) != ((OP(scan) == REFF
2828 ? PL_fold : PL_fold_locale)[nextchr]))))
2830 ln = PL_regendp[n] - ln;
2831 if (locinput + ln > PL_regeol)
2833 if (ln > 1 && (OP(scan) == REF
2834 ? memNE(s, locinput, ln)
2836 ? ibcmp(s, locinput, ln)
2837 : ibcmp_locale(s, locinput, ln))))
2840 nextchr = UCHARAT(locinput);
2851 OP_4tree *oop = PL_op;
2852 COP *ocurcop = PL_curcop;
2855 struct regexp *oreg = PL_reg_re;
2858 PL_op = (OP_4tree*)PL_regdata->data[n];
2859 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2860 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
2861 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2865 CALLRUNOPS(aTHX); /* Scalar context. */
2868 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
2876 PAD_RESTORE_LOCAL(old_comppad);
2877 PL_curcop = ocurcop;
2879 if (logical == 2) { /* Postponed subexpression. */
2881 MAGIC *mg = Null(MAGIC*);
2883 CHECKPOINT cp, lastcp;
2887 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2888 mg = mg_find(sv, PERL_MAGIC_qr);
2889 else if (SvSMAGICAL(ret)) {
2890 if (SvGMAGICAL(ret))
2891 sv_unmagic(ret, PERL_MAGIC_qr);
2893 mg = mg_find(ret, PERL_MAGIC_qr);
2897 re = (regexp *)mg->mg_obj;
2898 (void)ReREFCNT_inc(re);
2902 char *t = SvPV(ret, len);
2904 char *oprecomp = PL_regprecomp;
2905 I32 osize = PL_regsize;
2906 I32 onpar = PL_regnpar;
2909 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
2910 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2912 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2914 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2916 PL_regprecomp = oprecomp;
2921 PerlIO_printf(Perl_debug_log,
2922 "Entering embedded `%s%.60s%s%s'\n",
2926 (strlen(re->precomp) > 60 ? "..." : ""))
2929 state.prev = PL_reg_call_cc;
2930 state.cc = PL_regcc;
2931 state.re = PL_reg_re;
2935 cp = regcppush(0); /* Save *all* the positions. */
2938 state.ss = PL_savestack_ix;
2939 *PL_reglastparen = 0;
2940 *PL_reglastcloseparen = 0;
2941 PL_reg_call_cc = &state;
2942 PL_reginput = locinput;
2943 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2944 ((re->reganch & ROPT_UTF8) != 0);
2945 if (toggleutf) PL_reg_flags ^= RF_utf8;
2947 /* XXXX This is too dramatic a measure... */
2950 if (regmatch(re->program + 1)) {
2951 /* Even though we succeeded, we need to restore
2952 global variables, since we may be wrapped inside
2953 SUSPEND, thus the match may be not finished yet. */
2955 /* XXXX Do this only if SUSPENDed? */
2956 PL_reg_call_cc = state.prev;
2957 PL_regcc = state.cc;
2958 PL_reg_re = state.re;
2959 cache_re(PL_reg_re);
2960 if (toggleutf) PL_reg_flags ^= RF_utf8;
2962 /* XXXX This is too dramatic a measure... */
2965 /* These are needed even if not SUSPEND. */
2971 REGCP_UNWIND(lastcp);
2973 PL_reg_call_cc = state.prev;
2974 PL_regcc = state.cc;
2975 PL_reg_re = state.re;
2976 cache_re(PL_reg_re);
2977 if (toggleutf) PL_reg_flags ^= RF_utf8;
2979 /* XXXX This is too dramatic a measure... */
2989 sv_setsv(save_scalar(PL_replgv), ret);
2995 n = ARG(scan); /* which paren pair */
2996 PL_reg_start_tmp[n] = locinput;
3001 n = ARG(scan); /* which paren pair */
3002 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3003 PL_regendp[n] = locinput - PL_bostr;
3004 if (n > (I32)*PL_reglastparen)
3005 *PL_reglastparen = n;
3006 *PL_reglastcloseparen = n;
3009 n = ARG(scan); /* which paren pair */
3010 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3013 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3015 next = NEXTOPER(NEXTOPER(scan));
3017 next = scan + ARG(scan);
3018 if (OP(next) == IFTHEN) /* Fake one. */
3019 next = NEXTOPER(NEXTOPER(next));
3023 logical = scan->flags;
3025 /*******************************************************************
3026 PL_regcc contains infoblock about the innermost (...)* loop, and
3027 a pointer to the next outer infoblock.
3029 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3031 1) After matching X, regnode for CURLYX is processed;
3033 2) This regnode creates infoblock on the stack, and calls
3034 regmatch() recursively with the starting point at WHILEM node;
3036 3) Each hit of WHILEM node tries to match A and Z (in the order
3037 depending on the current iteration, min/max of {min,max} and
3038 greediness). The information about where are nodes for "A"
3039 and "Z" is read from the infoblock, as is info on how many times "A"
3040 was already matched, and greediness.
3042 4) After A matches, the same WHILEM node is hit again.
3044 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3045 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3046 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3047 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3048 of the external loop.
3050 Currently present infoblocks form a tree with a stem formed by PL_curcc
3051 and whatever it mentions via ->next, and additional attached trees
3052 corresponding to temporarily unset infoblocks as in "5" above.
3054 In the following picture infoblocks for outer loop of
3055 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3056 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3057 infoblocks are drawn below the "reset" infoblock.
3059 In fact in the picture below we do not show failed matches for Z and T
3060 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3061 more obvious *why* one needs to *temporary* unset infoblocks.]
3063 Matched REx position InfoBlocks Comment
3067 Y A)*?Z)*?T x <- O <- I
3068 YA )*?Z)*?T x <- O <- I
3069 YA A)*?Z)*?T x <- O <- I
3070 YAA )*?Z)*?T x <- O <- I
3071 YAA Z)*?T x <- O # Temporary unset I
3074 YAAZ Y(A)*?Z)*?T x <- O
3077 YAAZY (A)*?Z)*?T x <- O
3080 YAAZY A)*?Z)*?T x <- O <- I
3083 YAAZYA )*?Z)*?T x <- O <- I
3086 YAAZYA Z)*?T x <- O # Temporary unset I
3092 YAAZYAZ T x # Temporary unset O
3099 *******************************************************************/
3102 CHECKPOINT cp = PL_savestack_ix;
3103 /* No need to save/restore up to this paren */
3104 I32 parenfloor = scan->flags;
3106 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3108 cc.oldcc = PL_regcc;
3110 /* XXXX Probably it is better to teach regpush to support
3111 parenfloor > PL_regsize... */
3112 if (parenfloor > (I32)*PL_reglastparen)
3113 parenfloor = *PL_reglastparen; /* Pessimization... */
3114 cc.parenfloor = parenfloor;
3116 cc.min = ARG1(scan);
3117 cc.max = ARG2(scan);
3118 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3122 PL_reginput = locinput;
3123 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3125 PL_regcc = cc.oldcc;
3131 * This is really hard to understand, because after we match
3132 * what we're trying to match, we must make sure the rest of
3133 * the REx is going to match for sure, and to do that we have
3134 * to go back UP the parse tree by recursing ever deeper. And
3135 * if it fails, we have to reset our parent's current state
3136 * that we can try again after backing off.
3139 CHECKPOINT cp, lastcp;
3140 CURCUR* cc = PL_regcc;
3141 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3143 n = cc->cur + 1; /* how many we know we matched */
3144 PL_reginput = locinput;
3147 PerlIO_printf(Perl_debug_log,
3148 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3149 REPORT_CODE_OFF+PL_regindent*2, "",
3150 (long)n, (long)cc->min,
3151 (long)cc->max, PTR2UV(cc))
3154 /* If degenerate scan matches "", assume scan done. */
3156 if (locinput == cc->lastloc && n >= cc->min) {
3157 PL_regcc = cc->oldcc;
3161 PerlIO_printf(Perl_debug_log,
3162 "%*s empty match detected, try continuation...\n",
3163 REPORT_CODE_OFF+PL_regindent*2, "")
3165 if (regmatch(cc->next))
3173 /* First just match a string of min scans. */
3177 cc->lastloc = locinput;
3178 if (regmatch(cc->scan))
3181 cc->lastloc = lastloc;
3186 /* Check whether we already were at this position.
3187 Postpone detection until we know the match is not
3188 *that* much linear. */
3189 if (!PL_reg_maxiter) {
3190 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3191 PL_reg_leftiter = PL_reg_maxiter;
3193 if (PL_reg_leftiter-- == 0) {
3194 I32 size = (PL_reg_maxiter + 7)/8;
3195 if (PL_reg_poscache) {
3196 if ((I32)PL_reg_poscache_size < size) {
3197 Renew(PL_reg_poscache, size, char);
3198 PL_reg_poscache_size = size;
3200 Zero(PL_reg_poscache, size, char);
3203 PL_reg_poscache_size = size;
3204 Newz(29, PL_reg_poscache, size, char);
3207 PerlIO_printf(Perl_debug_log,
3208 "%sDetected a super-linear match, switching on caching%s...\n",
3209 PL_colors[4], PL_colors[5])
3212 if (PL_reg_leftiter < 0) {
3213 I32 o = locinput - PL_bostr, b;
3215 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3218 if (PL_reg_poscache[o] & (1<<b)) {
3220 PerlIO_printf(Perl_debug_log,
3221 "%*s already tried at this position...\n",
3222 REPORT_CODE_OFF+PL_regindent*2, "")
3224 if (PL_reg_flags & RF_false)
3229 PL_reg_poscache[o] |= (1<<b);
3233 /* Prefer next over scan for minimal matching. */
3236 PL_regcc = cc->oldcc;
3239 cp = regcppush(cc->parenfloor);
3241 if (regmatch(cc->next)) {
3243 sayYES; /* All done. */
3245 REGCP_UNWIND(lastcp);
3251 if (n >= cc->max) { /* Maximum greed exceeded? */
3252 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3253 && !(PL_reg_flags & RF_warned)) {
3254 PL_reg_flags |= RF_warned;
3255 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3256 "Complex regular subexpression recursion",
3263 PerlIO_printf(Perl_debug_log,
3264 "%*s trying longer...\n",
3265 REPORT_CODE_OFF+PL_regindent*2, "")
3267 /* Try scanning more and see if it helps. */
3268 PL_reginput = locinput;
3270 cc->lastloc = locinput;
3271 cp = regcppush(cc->parenfloor);
3273 if (regmatch(cc->scan)) {
3277 REGCP_UNWIND(lastcp);
3280 cc->lastloc = lastloc;
3284 /* Prefer scan over next for maximal matching. */
3286 if (n < cc->max) { /* More greed allowed? */
3287 cp = regcppush(cc->parenfloor);
3289 cc->lastloc = locinput;
3291 if (regmatch(cc->scan)) {
3295 REGCP_UNWIND(lastcp);
3296 regcppop(); /* Restore some previous $<digit>s? */
3297 PL_reginput = locinput;
3299 PerlIO_printf(Perl_debug_log,
3300 "%*s failed, try continuation...\n",
3301 REPORT_CODE_OFF+PL_regindent*2, "")
3304 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3305 && !(PL_reg_flags & RF_warned)) {
3306 PL_reg_flags |= RF_warned;
3307 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3308 "Complex regular subexpression recursion",
3312 /* Failed deeper matches of scan, so see if this one works. */
3313 PL_regcc = cc->oldcc;
3316 if (regmatch(cc->next))
3322 cc->lastloc = lastloc;
3327 next = scan + ARG(scan);
3330 inner = NEXTOPER(NEXTOPER(scan));
3333 inner = NEXTOPER(scan);
3337 if (OP(next) != c1) /* No choice. */
3338 next = inner; /* Avoid recursion. */
3340 I32 lastparen = *PL_reglastparen;
3342 re_unwind_branch_t *uw;
3344 /* Put unwinding data on stack */
3345 unwind1 = SSNEWt(1,re_unwind_branch_t);
3346 uw = SSPTRt(unwind1,re_unwind_branch_t);
3349 uw->type = ((c1 == BRANCH)
3351 : RE_UNWIND_BRANCHJ);
3352 uw->lastparen = lastparen;
3354 uw->locinput = locinput;
3355 uw->nextchr = nextchr;
3357 uw->regindent = ++PL_regindent;
3360 REGCP_SET(uw->lastcp);
3362 /* Now go into the first branch */
3375 /* We suppose that the next guy does not need
3376 backtracking: in particular, it is of constant non-zero length,
3377 and has no parenths to influence future backrefs. */
3378 ln = ARG1(scan); /* min to match */
3379 n = ARG2(scan); /* max to match */
3380 paren = scan->flags;
3382 if (paren > PL_regsize)
3384 if (paren > (I32)*PL_reglastparen)
3385 *PL_reglastparen = paren;
3387 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3389 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3390 PL_reginput = locinput;
3393 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3395 locinput = PL_reginput;
3396 if (HAS_TEXT(next) || JUMPABLE(next)) {
3397 regnode *text_node = next;
3399 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3401 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3403 if (PL_regkind[(U8)OP(text_node)] == REF) {
3407 else { c1 = (U8)*STRING(text_node); }
3408 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3410 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3411 c2 = PL_fold_locale[c1];
3420 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3421 /* If it could work, try it. */
3423 UCHARAT(PL_reginput) == c1 ||
3424 UCHARAT(PL_reginput) == c2)
3428 PL_regstartp[paren] =
3429 HOPc(PL_reginput, -l) - PL_bostr;
3430 PL_regendp[paren] = PL_reginput - PL_bostr;
3433 PL_regendp[paren] = -1;
3437 REGCP_UNWIND(lastcp);
3439 /* Couldn't or didn't -- move forward. */
3440 PL_reginput = locinput;
3441 if (regrepeat_hard(scan, 1, &l)) {
3443 locinput = PL_reginput;
3450 n = regrepeat_hard(scan, n, &l);
3451 locinput = PL_reginput;
3453 PerlIO_printf(Perl_debug_log,
3454 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3455 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3459 if (HAS_TEXT(next) || JUMPABLE(next)) {
3460 regnode *text_node = next;
3462 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3464 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3466 if (PL_regkind[(U8)OP(text_node)] == REF) {
3470 else { c1 = (U8)*STRING(text_node); }
3472 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3474 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3475 c2 = PL_fold_locale[c1];
3486 /* If it could work, try it. */
3488 UCHARAT(PL_reginput) == c1 ||
3489 UCHARAT(PL_reginput) == c2)
3492 PerlIO_printf(Perl_debug_log,
3493 "%*s trying tail with n=%"IVdf"...\n",
3494 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3498 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3499 PL_regendp[paren] = PL_reginput - PL_bostr;
3502 PL_regendp[paren] = -1;
3506 REGCP_UNWIND(lastcp);
3508 /* Couldn't or didn't -- back up. */
3510 locinput = HOPc(locinput, -l);
3511 PL_reginput = locinput;
3518 paren = scan->flags; /* Which paren to set */
3519 if (paren > PL_regsize)
3521 if (paren > (I32)*PL_reglastparen)
3522 *PL_reglastparen = paren;
3523 ln = ARG1(scan); /* min to match */
3524 n = ARG2(scan); /* max to match */
3525 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3529 ln = ARG1(scan); /* min to match */
3530 n = ARG2(scan); /* max to match */
3531 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3536 scan = NEXTOPER(scan);
3542 scan = NEXTOPER(scan);
3546 * Lookahead to avoid useless match attempts
3547 * when we know what character comes next.
3551 * Used to only do .*x and .*?x, but now it allows
3552 * for )'s, ('s and (?{ ... })'s to be in the way
3553 * of the quantifier and the EXACT-like node. -- japhy
3556 if (HAS_TEXT(next) || JUMPABLE(next)) {
3558 regnode *text_node = next;
3560 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3562 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3564 if (PL_regkind[(U8)OP(text_node)] == REF) {
3566 goto assume_ok_easy;
3568 else { s = (U8*)STRING(text_node); }
3572 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3574 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3575 c2 = PL_fold_locale[c1];
3578 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3579 STRLEN ulen1, ulen2;
3580 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3581 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
3583 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3584 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3586 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3588 0 : UTF8_ALLOW_ANY);
3589 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3591 0 : UTF8_ALLOW_ANY);
3594 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3596 0 : UTF8_ALLOW_ANY);
3604 PL_reginput = locinput;
3608 if (ln && regrepeat(scan, ln) < ln)
3610 locinput = PL_reginput;
3613 char *e; /* Should not check after this */
3614 char *old = locinput;
3617 if (n == REG_INFTY) {
3620 while (UTF8_IS_CONTINUATION(*(U8*)e))
3626 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3630 e = locinput + n - ln;
3635 /* Find place 'next' could work */
3638 while (locinput <= e &&
3639 UCHARAT(locinput) != c1)
3642 while (locinput <= e
3643 && UCHARAT(locinput) != c1
3644 && UCHARAT(locinput) != c2)
3647 count = locinput - old;
3652 /* count initialised to
3653 * utf8_distance(old, locinput) */
3654 while (locinput <= e &&
3655 utf8n_to_uvchr((U8*)locinput,
3658 0 : UTF8_ALLOW_ANY) != (UV)c1) {
3663 /* count initialised to
3664 * utf8_distance(old, locinput) */
3665 while (locinput <= e) {
3666 UV c = utf8n_to_uvchr((U8*)locinput,
3669 0 : UTF8_ALLOW_ANY);
3670 if (c == (UV)c1 || c == (UV)c2)
3679 /* PL_reginput == old now */
3680 if (locinput != old) {
3681 ln = 1; /* Did some */
3682 if (regrepeat(scan, count) < count)
3685 /* PL_reginput == locinput now */
3686 TRYPAREN(paren, ln, locinput);
3687 PL_reginput = locinput; /* Could be reset... */
3688 REGCP_UNWIND(lastcp);
3689 /* Couldn't or didn't -- move forward. */
3692 locinput += UTF8SKIP(locinput);
3699 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3703 c = utf8n_to_uvchr((U8*)PL_reginput,
3706 0 : UTF8_ALLOW_ANY);
3708 c = UCHARAT(PL_reginput);
3709 /* If it could work, try it. */
3710 if (c == (UV)c1 || c == (UV)c2)
3712 TRYPAREN(paren, ln, PL_reginput);
3713 REGCP_UNWIND(lastcp);
3716 /* If it could work, try it. */
3717 else if (c1 == -1000)
3719 TRYPAREN(paren, ln, PL_reginput);
3720 REGCP_UNWIND(lastcp);
3722 /* Couldn't or didn't -- move forward. */
3723 PL_reginput = locinput;
3724 if (regrepeat(scan, 1)) {
3726 locinput = PL_reginput;
3734 n = regrepeat(scan, n);
3735 locinput = PL_reginput;
3736 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3737 ((!PL_multiline && OP(next) != MEOL) ||
3738 OP(next) == SEOL || OP(next) == EOS))
3740 ln = n; /* why back off? */
3741 /* ...because $ and \Z can match before *and* after
3742 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3743 We should back off by one in this case. */
3744 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3753 c = utf8n_to_uvchr((U8*)PL_reginput,
3756 0 : UTF8_ALLOW_ANY);
3758 c = UCHARAT(PL_reginput);
3760 /* If it could work, try it. */
3761 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3763 TRYPAREN(paren, n, PL_reginput);
3764 REGCP_UNWIND(lastcp);
3766 /* Couldn't or didn't -- back up. */
3768 PL_reginput = locinput = HOPc(locinput, -1);
3776 c = utf8n_to_uvchr((U8*)PL_reginput,
3779 0 : UTF8_ALLOW_ANY);
3781 c = UCHARAT(PL_reginput);
3783 /* If it could work, try it. */
3784 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3786 TRYPAREN(paren, n, PL_reginput);
3787 REGCP_UNWIND(lastcp);
3789 /* Couldn't or didn't -- back up. */
3791 PL_reginput = locinput = HOPc(locinput, -1);
3798 if (PL_reg_call_cc) {
3799 re_cc_state *cur_call_cc = PL_reg_call_cc;
3800 CURCUR *cctmp = PL_regcc;
3801 regexp *re = PL_reg_re;
3802 CHECKPOINT cp, lastcp;
3804 cp = regcppush(0); /* Save *all* the positions. */
3806 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3808 PL_reginput = locinput; /* Make position available to
3810 cache_re(PL_reg_call_cc->re);
3811 PL_regcc = PL_reg_call_cc->cc;
3812 PL_reg_call_cc = PL_reg_call_cc->prev;
3813 if (regmatch(cur_call_cc->node)) {
3814 PL_reg_call_cc = cur_call_cc;
3818 REGCP_UNWIND(lastcp);
3820 PL_reg_call_cc = cur_call_cc;
3826 PerlIO_printf(Perl_debug_log,
3827 "%*s continuation failed...\n",
3828 REPORT_CODE_OFF+PL_regindent*2, "")
3832 if (locinput < PL_regtill) {
3833 DEBUG_r(PerlIO_printf(Perl_debug_log,
3834 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3836 (long)(locinput - PL_reg_starttry),
3837 (long)(PL_regtill - PL_reg_starttry),
3839 sayNO_FINAL; /* Cannot match: too short. */
3841 PL_reginput = locinput; /* put where regtry can find it */
3842 sayYES_FINAL; /* Success! */
3844 PL_reginput = locinput; /* put where regtry can find it */
3845 sayYES_LOUD; /* Success! */
3848 PL_reginput = locinput;
3853 s = HOPBACKc(locinput, scan->flags);
3859 PL_reginput = locinput;
3860 PL_reg_flags ^= RF_false;
3865 s = HOPBACKc(locinput, scan->flags);
3871 PL_reginput = locinput;
3874 inner = NEXTOPER(NEXTOPER(scan));
3875 if (regmatch(inner) != n) {
3877 PL_reg_flags ^= RF_false;
3888 PL_reg_flags ^= RF_false;
3894 if (OP(scan) == SUSPEND) {
3895 locinput = PL_reginput;
3896 nextchr = UCHARAT(locinput);
3901 next = scan + ARG(scan);
3906 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3907 PTR2UV(scan), OP(scan));
3908 Perl_croak(aTHX_ "regexp memory corruption");
3915 * We get here only if there's trouble -- normally "case END" is
3916 * the terminating point.
3918 Perl_croak(aTHX_ "corrupted regexp pointers");
3924 PerlIO_printf(Perl_debug_log,
3925 "%*s %scould match...%s\n",
3926 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3930 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3931 PL_colors[4],PL_colors[5]));
3937 #if 0 /* Breaks $^R */
3945 PerlIO_printf(Perl_debug_log,
3946 "%*s %sfailed...%s\n",
3947 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3953 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3956 case RE_UNWIND_BRANCH:
3957 case RE_UNWIND_BRANCHJ:
3959 re_unwind_branch_t *uwb = &(uw->branch);
3960 I32 lastparen = uwb->lastparen;
3962 REGCP_UNWIND(uwb->lastcp);
3963 for (n = *PL_reglastparen; n > lastparen; n--)
3965 *PL_reglastparen = n;
3966 scan = next = uwb->next;
3968 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3969 ? BRANCH : BRANCHJ) ) { /* Failure */
3976 /* Have more choice yet. Reuse the same uwb. */
3978 if ((n = (uwb->type == RE_UNWIND_BRANCH
3979 ? NEXT_OFF(next) : ARG(next))))
3982 next = NULL; /* XXXX Needn't unwinding in this case... */
3984 next = NEXTOPER(scan);
3985 if (uwb->type == RE_UNWIND_BRANCHJ)
3986 next = NEXTOPER(next);
3987 locinput = uwb->locinput;
3988 nextchr = uwb->nextchr;
3990 PL_regindent = uwb->regindent;
3997 Perl_croak(aTHX_ "regexp unwind memory corruption");
4008 - regrepeat - repeatedly match something simple, report how many
4011 * [This routine now assumes that it will only match on things of length 1.
4012 * That was true before, but now we assume scan - reginput is the count,
4013 * rather than incrementing count on every character. [Er, except utf8.]]
4016 S_regrepeat(pTHX_ regnode *p, I32 max)
4018 register char *scan;
4020 register char *loceol = PL_regeol;
4021 register I32 hardcount = 0;
4022 register bool do_utf8 = PL_reg_match_utf8;
4025 if (max == REG_INFTY)
4027 else if (max < loceol - scan)
4028 loceol = scan + max;
4033 while (scan < loceol && hardcount < max && *scan != '\n') {
4034 scan += UTF8SKIP(scan);
4038 while (scan < loceol && *scan != '\n')
4045 while (scan < loceol && hardcount < max) {
4046 scan += UTF8SKIP(scan);
4056 case EXACT: /* length of string is 1 */
4058 while (scan < loceol && UCHARAT(scan) == c)
4061 case EXACTF: /* length of string is 1 */
4063 while (scan < loceol &&
4064 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4067 case EXACTFL: /* length of string is 1 */
4068 PL_reg_flags |= RF_tainted;
4070 while (scan < loceol &&
4071 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4077 while (hardcount < max && scan < loceol &&
4078 reginclass(p, (U8*)scan, 0, do_utf8)) {
4079 scan += UTF8SKIP(scan);
4083 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4090 LOAD_UTF8_CHARCLASS(alnum,"a");
4091 while (hardcount < max && scan < loceol &&
4092 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4093 scan += UTF8SKIP(scan);
4097 while (scan < loceol && isALNUM(*scan))
4102 PL_reg_flags |= RF_tainted;
4105 while (hardcount < max && scan < loceol &&
4106 isALNUM_LC_utf8((U8*)scan)) {
4107 scan += UTF8SKIP(scan);
4111 while (scan < loceol && isALNUM_LC(*scan))
4118 LOAD_UTF8_CHARCLASS(alnum,"a");
4119 while (hardcount < max && scan < loceol &&
4120 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4121 scan += UTF8SKIP(scan);
4125 while (scan < loceol && !isALNUM(*scan))
4130 PL_reg_flags |= RF_tainted;
4133 while (hardcount < max && scan < loceol &&
4134 !isALNUM_LC_utf8((U8*)scan)) {
4135 scan += UTF8SKIP(scan);
4139 while (scan < loceol && !isALNUM_LC(*scan))
4146 LOAD_UTF8_CHARCLASS(space," ");
4147 while (hardcount < max && scan < loceol &&
4149 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4150 scan += UTF8SKIP(scan);
4154 while (scan < loceol && isSPACE(*scan))
4159 PL_reg_flags |= RF_tainted;
4162 while (hardcount < max && scan < loceol &&
4163 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4164 scan += UTF8SKIP(scan);
4168 while (scan < loceol && isSPACE_LC(*scan))
4175 LOAD_UTF8_CHARCLASS(space," ");
4176 while (hardcount < max && scan < loceol &&
4178 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4179 scan += UTF8SKIP(scan);
4183 while (scan < loceol && !isSPACE(*scan))
4188 PL_reg_flags |= RF_tainted;
4191 while (hardcount < max && scan < loceol &&
4192 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4193 scan += UTF8SKIP(scan);
4197 while (scan < loceol && !isSPACE_LC(*scan))
4204 LOAD_UTF8_CHARCLASS(digit,"0");
4205 while (hardcount < max && scan < loceol &&
4206 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4207 scan += UTF8SKIP(scan);
4211 while (scan < loceol && isDIGIT(*scan))
4218 LOAD_UTF8_CHARCLASS(digit,"0");
4219 while (hardcount < max && scan < loceol &&
4220 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4221 scan += UTF8SKIP(scan);
4225 while (scan < loceol && !isDIGIT(*scan))
4229 default: /* Called on something of 0 width. */
4230 break; /* So match right here or not at all. */
4236 c = scan - PL_reginput;
4241 SV *prop = sv_newmortal();
4244 PerlIO_printf(Perl_debug_log,
4245 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4246 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
4253 - regrepeat_hard - repeatedly match something, report total lenth and length
4255 * The repeater is supposed to have constant non-zero length.
4259 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4261 register char *scan = Nullch;
4262 register char *start;
4263 register char *loceol = PL_regeol;
4265 I32 count = 0, res = 1;
4270 start = PL_reginput;
4271 if (PL_reg_match_utf8) {
4272 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4275 while (start < PL_reginput) {
4277 start += UTF8SKIP(start);
4288 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4290 *lp = l = PL_reginput - start;
4291 if (max != REG_INFTY && l*max < loceol - scan)
4292 loceol = scan + l*max;
4305 - regclass_swash - prepare the utf8 swash
4309 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4315 if (PL_regdata && PL_regdata->count) {
4318 if (PL_regdata->what[n] == 's') {
4319 SV *rv = (SV*)PL_regdata->data[n];
4320 AV *av = (AV*)SvRV((SV*)rv);
4321 SV **ary = AvARRAY(av);
4324 /* See the end of regcomp.c:S_reglass() for
4325 * documentation of these array elements. */
4328 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4329 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4333 else if (si && doinit) {
4334 sw = swash_init("utf8", "", si, 1, 0);
4335 (void)av_store(av, 1, sw);
4351 - reginclass - determine if a character falls into a character class
4353 The n is the ANYOF regnode, the p is the target string, lenp
4354 is pointer to the maximum length of how far to go in the p
4355 (if the lenp is zero, UTF8SKIP(p) is used),
4356 do_utf8 tells whether the target string is in UTF-8.
4361 S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
4363 char flags = ANYOF_FLAGS(n);
4369 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4370 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4371 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
4373 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4374 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4377 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4378 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4381 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4385 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4388 if (swash_fetch(sw, p, do_utf8))
4390 else if (flags & ANYOF_FOLD) {
4391 if (!match && lenp && av) {
4394 for (i = 0; i <= av_len(av); i++) {
4395 SV* sv = *av_fetch(av, i, FALSE);
4397 char *s = SvPV(sv, len);
4399 if (len <= plen && memEQ(s, (char*)p, len)) {
4407 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4410 to_utf8_fold(p, tmpbuf, &tmplen);
4411 if (swash_fetch(sw, tmpbuf, do_utf8))
4417 if (match && lenp && *lenp == 0)
4418 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4420 if (!match && c < 256) {
4421 if (ANYOF_BITMAP_TEST(n, c))
4423 else if (flags & ANYOF_FOLD) {
4426 if (flags & ANYOF_LOCALE) {
4427 PL_reg_flags |= RF_tainted;
4428 f = PL_fold_locale[c];
4432 if (f != c && ANYOF_BITMAP_TEST(n, f))
4436 if (!match && (flags & ANYOF_CLASS)) {
4437 PL_reg_flags |= RF_tainted;
4439 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4440 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4441 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4442 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4443 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4444 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4445 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4446 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4447 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4448 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4449 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4450 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4451 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4452 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4453 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4454 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4455 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4456 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4457 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4458 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4459 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4460 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4469 ) /* How's that for a conditional? */
4476 return (flags & ANYOF_INVERT) ? !match : match;
4480 S_reghop(pTHX_ U8 *s, I32 off)
4482 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4486 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4489 while (off-- && s < lim) {
4490 /* XXX could check well-formedness here */
4498 if (UTF8_IS_CONTINUED(*s)) {
4499 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4502 /* XXX could check well-formedness here */
4510 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4512 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4516 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4519 while (off-- && s < lim) {
4520 /* XXX could check well-formedness here */
4530 if (UTF8_IS_CONTINUED(*s)) {
4531 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4534 /* XXX could check well-formedness here */
4546 restore_pos(pTHX_ void *arg)
4548 if (PL_reg_eval_set) {
4549 if (PL_reg_oldsaved) {
4550 PL_reg_re->subbeg = PL_reg_oldsaved;
4551 PL_reg_re->sublen = PL_reg_oldsavedlen;
4552 #ifdef PERL_COPY_ON_WRITE
4553 PL_reg_re->saved_copy = PL_nrs;
4555 RX_MATCH_COPIED_on(PL_reg_re);
4557 PL_reg_magic->mg_len = PL_reg_oldpos;
4558 PL_reg_eval_set = 0;
4559 PL_curpm = PL_reg_oldcurpm;
4564 S_to_utf8_substr(pTHX_ register regexp *prog)
4567 if (prog->float_substr && !prog->float_utf8) {
4568 prog->float_utf8 = sv = NEWSV(117, 0);
4569 SvSetSV(sv, prog->float_substr);
4570 sv_utf8_upgrade(sv);
4571 if (SvTAIL(prog->float_substr))
4573 if (prog->float_substr == prog->check_substr)
4574 prog->check_utf8 = sv;
4576 if (prog->anchored_substr && !prog->anchored_utf8) {
4577 prog->anchored_utf8 = sv = NEWSV(118, 0);
4578 SvSetSV(sv, prog->anchored_substr);
4579 sv_utf8_upgrade(sv);
4580 if (SvTAIL(prog->anchored_substr))
4582 if (prog->anchored_substr == prog->check_substr)
4583 prog->check_utf8 = sv;
4588 S_to_byte_substr(pTHX_ register regexp *prog)
4591 if (prog->float_utf8 && !prog->float_substr) {
4592 prog->float_substr = sv = NEWSV(117, 0);
4593 SvSetSV(sv, prog->float_utf8);
4594 if (sv_utf8_downgrade(sv, TRUE)) {
4595 if (SvTAIL(prog->float_utf8))
4599 prog->float_substr = sv = &PL_sv_undef;
4601 if (prog->float_utf8 == prog->check_utf8)
4602 prog->check_substr = sv;
4604 if (prog->anchored_utf8 && !prog->anchored_substr) {
4605 prog->anchored_substr = sv = NEWSV(118, 0);
4606 SvSetSV(sv, prog->anchored_utf8);
4607 if (sv_utf8_downgrade(sv, TRUE)) {
4608 if (SvTAIL(prog->anchored_utf8))
4612 prog->anchored_substr = sv = &PL_sv_undef;
4614 if (prog->anchored_utf8 == prog->check_utf8)
4615 prog->check_substr = sv;