5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2002, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
85 #define RF_tainted 1 /* tainted information used? */
86 #define RF_warned 2 /* warned about big count? */
87 #define RF_evaled 4 /* Did an EVAL with setting? */
88 #define RF_utf8 8 /* String contains multibyte chars? */
90 #define UTF (PL_reg_flags & RF_utf8)
92 #define RS_init 1 /* eval environment created */
93 #define RS_set 2 /* replsv value is set */
99 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
105 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
106 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
108 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
109 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
110 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
111 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
112 #define HOPc(pos,off) ((char*)HOP(pos,off))
113 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
115 #define HOPBACK(pos, off) ( \
116 (PL_reg_match_utf8) \
117 ? reghopmaybe((U8*)pos, -off) \
118 : (pos - off >= PL_bostr) \
122 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
124 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
125 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
126 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
127 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
131 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
133 /* for use after a quantifier and before an EXACT-like node -- japhy */
134 #define JUMPABLE(rn) ( \
135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
137 OP(rn) == PLUS || OP(rn) == MINMOD || \
138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
141 #define HAS_TEXT(rn) ( \
142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
146 Search for mandatory following text node; for lookahead, the text must
147 follow but for lookbehind (rn->flags != 0) we skip to the next step.
149 #define FIND_NEXT_IMPT(rn) STMT_START { \
150 while (JUMPABLE(rn)) \
151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
152 rn = NEXTOPER(NEXTOPER(rn)); \
153 else if (OP(rn) == PLUS) \
155 else if (OP(rn) == IFMATCH) \
156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
157 else rn += NEXT_OFF(rn); \
160 static void restore_pos(pTHX_ void *arg);
163 S_regcppush(pTHX_ I32 parenfloor)
165 int retval = PL_savestack_ix;
166 #define REGCP_PAREN_ELEMS 4
167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
170 if (paren_elems_to_push < 0)
171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
173 #define REGCP_OTHER_ELEMS 6
174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
175 for (p = PL_regsize; p > parenfloor; p--) {
176 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
177 SSPUSHINT(PL_regendp[p]);
178 SSPUSHINT(PL_regstartp[p]);
179 SSPUSHPTR(PL_reg_start_tmp[p]);
182 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
183 SSPUSHINT(PL_regsize);
184 SSPUSHINT(*PL_reglastparen);
185 SSPUSHINT(*PL_reglastcloseparen);
186 SSPUSHPTR(PL_reginput);
187 #define REGCP_FRAME_ELEMS 2
188 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
189 * are needed for the regexp context stack bookkeeping. */
190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
196 /* These are needed since we do not localize EVAL nodes: */
197 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
198 " Setting an EVAL scope, savestack=%"IVdf"\n", \
199 (IV)PL_savestack_ix)); cp = PL_savestack_ix
201 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
202 PerlIO_printf(Perl_debug_log, \
203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
217 i = SSPOPINT; /* Parentheses elements to pop. */
218 input = (char *) SSPOPPTR;
219 *PL_reglastcloseparen = SSPOPINT;
220 *PL_reglastparen = SSPOPINT;
221 PL_regsize = SSPOPINT;
223 /* Now restore the parentheses context. */
224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
225 i > 0; i -= REGCP_PAREN_ELEMS) {
226 paren = (U32)SSPOPINT;
227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
228 PL_regstartp[paren] = SSPOPINT;
230 if (paren <= *PL_reglastparen)
231 PL_regendp[paren] = tmps;
233 PerlIO_printf(Perl_debug_log,
234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
235 (UV)paren, (IV)PL_regstartp[paren],
236 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
237 (IV)PL_regendp[paren],
238 (paren > *PL_reglastparen ? "(no)" : ""));
242 if (*PL_reglastparen + 1 <= PL_regnpar) {
243 PerlIO_printf(Perl_debug_log,
244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
249 /* It would seem that the similar code in regtry()
250 * already takes care of this, and in fact it is in
251 * a better location to since this code can #if 0-ed out
252 * but the code in regtry() is needed or otherwise tests
253 * requiring null fields (pat.t#187 and split.t#{13,14}
254 * (as of patchlevel 7877) will fail. Then again,
255 * this code seems to be necessary or otherwise
256 * building DynaLoader will fail:
257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
259 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
260 if (paren > PL_regsize)
261 PL_regstartp[paren] = -1;
262 PL_regendp[paren] = -1;
269 S_regcp_set_to(pTHX_ I32 ss)
271 I32 tmp = PL_savestack_ix;
273 PL_savestack_ix = ss;
275 PL_savestack_ix = tmp;
279 typedef struct re_cc_state
283 struct re_cc_state *prev;
288 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
290 #define TRYPAREN(paren, n, input) { \
293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
294 PL_regendp[paren] = input - PL_bostr; \
297 PL_regendp[paren] = -1; \
299 if (regmatch(next)) \
302 PL_regendp[paren] = -1; \
307 * pregexec and friends
311 - pregexec - match a regexp against a string
314 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
315 char *strbeg, I32 minend, SV *screamer, U32 nosave)
316 /* strend: pointer to null at end of string */
317 /* strbeg: real beginning of string */
318 /* minend: end of match must be >=minend after stringarg. */
319 /* nosave: For optimizations. */
322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
323 nosave ? 0 : REXEC_COPY_STR);
327 S_cache_re(pTHX_ regexp *prog)
329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
331 PL_regprogram = prog->program;
333 PL_regnpar = prog->nparens;
334 PL_regdata = prog->data;
339 * Need to implement the following flags for reg_anch:
341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
344 * INTUIT_AUTORITATIVE_ML
345 * INTUIT_ONCE_NOML - Intuit can match in one location only.
348 * Another flag for this function: SECOND_TIME (so that float substrs
349 * with giant delta may be not rechecked).
352 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
354 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
355 Otherwise, only SvCUR(sv) is used to get strbeg. */
357 /* XXXX We assume that strpos is strbeg unless sv. */
359 /* XXXX Some places assume that there is a fixed substring.
360 An update may be needed if optimizer marks as "INTUITable"
361 RExen without fixed substrings. Similarly, it is assumed that
362 lengths of all the strings are no more than minlen, thus they
363 cannot come from lookahead.
364 (Or minlen should take into account lookahead.) */
366 /* A failure to find a constant substring means that there is no need to make
367 an expensive call to REx engine, thus we celebrate a failure. Similarly,
368 finding a substring too deep into the string means that less calls to
369 regtry() should be needed.
371 REx compiler's optimizer found 4 possible hints:
372 a) Anchored substring;
374 c) Whether we are anchored (beginning-of-line or \G);
375 d) First node (of those at offset 0) which may distingush positions;
376 We use a)b)d) and multiline-part of c), and try to find a position in the
377 string which does not contradict any of them.
380 /* Most of decisions we do here should have been done at compile time.
381 The nodes of the REx which we used for the search should have been
382 deleted from the finite automaton. */
385 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
386 char *strend, U32 flags, re_scream_pos_data *data)
388 register I32 start_shift = 0;
389 /* Should be nonnegative! */
390 register I32 end_shift = 0;
395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
397 register char *other_last = Nullch; /* other substr checked before this */
398 char *check_at = Nullch; /* check substr found at this pos */
400 char *i_strpos = strpos;
401 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
404 if (prog->reganch & ROPT_UTF8) {
405 DEBUG_r(PerlIO_printf(Perl_debug_log,
406 "UTF-8 regex...\n"));
407 PL_reg_flags |= RF_utf8;
411 char *s = PL_reg_match_utf8 ?
412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
414 int len = PL_reg_match_utf8 ?
415 strlen(s) : strend - strpos;
418 if (PL_reg_match_utf8)
419 DEBUG_r(PerlIO_printf(Perl_debug_log,
420 "UTF-8 target...\n"));
421 PerlIO_printf(Perl_debug_log,
422 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
423 PL_colors[4],PL_colors[5],PL_colors[0],
426 (strlen(prog->precomp) > 60 ? "..." : ""),
428 (int)(len > 60 ? 60 : len),
430 (len > 60 ? "..." : "")
434 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
435 DEBUG_r(PerlIO_printf(Perl_debug_log,
436 "String too short... [re_intuit_start]\n"));
439 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
442 if (!prog->check_utf8 && prog->check_substr)
443 to_utf8_substr(prog);
444 check = prog->check_utf8;
446 if (!prog->check_substr && prog->check_utf8)
447 to_byte_substr(prog);
448 check = prog->check_substr;
450 if (check == &PL_sv_undef) {
451 DEBUG_r(PerlIO_printf(Perl_debug_log,
452 "Non-utf string cannot match utf check string\n"));
455 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
456 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
457 || ( (prog->reganch & ROPT_ANCH_BOL)
458 && !PL_multiline ) ); /* Check after \n? */
461 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
462 | ROPT_IMPLICIT)) /* not a real BOL */
463 /* SvCUR is not set on references: SvRV and SvPVX overlap */
465 && (strpos != strbeg)) {
466 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
469 if (prog->check_offset_min == prog->check_offset_max &&
470 !(prog->reganch & ROPT_CANY_SEEN)) {
471 /* Substring at constant offset from beg-of-str... */
474 s = HOP3c(strpos, prog->check_offset_min, strend);
476 slen = SvCUR(check); /* >= 1 */
478 if ( strend - s > slen || strend - s < slen - 1
479 || (strend - s == slen && strend[-1] != '\n')) {
480 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
483 /* Now should match s[0..slen-2] */
485 if (slen && (*SvPVX(check) != *s
487 && memNE(SvPVX(check), s, slen)))) {
489 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
493 else if (*SvPVX(check) != *s
494 || ((slen = SvCUR(check)) > 1
495 && memNE(SvPVX(check), s, slen)))
497 goto success_at_start;
500 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
502 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
503 end_shift = prog->minlen - start_shift -
504 CHR_SVLEN(check) + (SvTAIL(check) != 0);
506 I32 end = prog->check_offset_max + CHR_SVLEN(check)
507 - (SvTAIL(check) != 0);
508 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
510 if (end_shift < eshift)
514 else { /* Can match at random position */
517 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
518 /* Should be nonnegative! */
519 end_shift = prog->minlen - start_shift -
520 CHR_SVLEN(check) + (SvTAIL(check) != 0);
523 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
525 Perl_croak(aTHX_ "panic: end_shift");
529 /* Find a possible match in the region s..strend by looking for
530 the "check" substring in the region corrected by start/end_shift. */
531 if (flags & REXEC_SCREAM) {
532 I32 p = -1; /* Internal iterator of scream. */
533 I32 *pp = data ? data->scream_pos : &p;
535 if (PL_screamfirst[BmRARE(check)] >= 0
536 || ( BmRARE(check) == '\n'
537 && (BmPREVIOUS(check) == SvCUR(check) - 1)
539 s = screaminstr(sv, check,
540 start_shift + (s - strbeg), end_shift, pp, 0);
544 *data->scream_olds = s;
546 else if (prog->reganch & ROPT_CANY_SEEN)
547 s = fbm_instr((U8*)(s + start_shift),
548 (U8*)(strend - end_shift),
549 check, PL_multiline ? FBMrf_MULTILINE : 0);
551 s = fbm_instr(HOP3(s, start_shift, strend),
552 HOP3(strend, -end_shift, strbeg),
553 check, PL_multiline ? FBMrf_MULTILINE : 0);
555 /* Update the count-of-usability, remove useless subpatterns,
558 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
559 (s ? "Found" : "Did not find"),
560 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
562 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
564 PL_colors[1], (SvTAIL(check) ? "$" : ""),
565 (s ? " at offset " : "...\n") ) );
572 /* Finish the diagnostic message */
573 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
578 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
584 /* Take into account the "other" substring. */
585 /* XXXX May be hopelessly wrong for UTF... */
588 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
591 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
595 t = s - prog->check_offset_max;
596 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
597 && (!(prog->reganch & ROPT_UTF8)
598 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
603 t = HOP3c(t, prog->anchored_offset, strend);
604 if (t < other_last) /* These positions already checked */
606 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
609 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
610 /* On end-of-str: see comment below. */
611 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
612 if (must == &PL_sv_undef) {
614 DEBUG_r(must = prog->anchored_utf8); /* for debug */
619 HOP3(HOP3(last1, prog->anchored_offset, strend)
620 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
622 PL_multiline ? FBMrf_MULTILINE : 0
624 DEBUG_r(PerlIO_printf(Perl_debug_log,
625 "%s anchored substr `%s%.*s%s'%s",
626 (s ? "Found" : "Contradicts"),
629 - (SvTAIL(must)!=0)),
631 PL_colors[1], (SvTAIL(must) ? "$" : "")));
633 if (last1 >= last2) {
634 DEBUG_r(PerlIO_printf(Perl_debug_log,
635 ", giving up...\n"));
638 DEBUG_r(PerlIO_printf(Perl_debug_log,
639 ", trying floating at offset %ld...\n",
640 (long)(HOP3c(s1, 1, strend) - i_strpos)));
641 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
642 s = HOP3c(last, 1, strend);
646 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
647 (long)(s - i_strpos)));
648 t = HOP3c(s, -prog->anchored_offset, strbeg);
649 other_last = HOP3c(s, 1, strend);
657 else { /* Take into account the floating substring. */
662 t = HOP3c(s, -start_shift, strbeg);
664 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
665 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
666 last = HOP3c(t, prog->float_max_offset, strend);
667 s = HOP3c(t, prog->float_min_offset, strend);
670 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
671 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
672 /* fbm_instr() takes into account exact value of end-of-str
673 if the check is SvTAIL(ed). Since false positives are OK,
674 and end-of-str is not later than strend we are OK. */
675 if (must == &PL_sv_undef) {
677 DEBUG_r(must = prog->float_utf8); /* for debug message */
680 s = fbm_instr((unsigned char*)s,
681 (unsigned char*)last + SvCUR(must)
683 must, PL_multiline ? FBMrf_MULTILINE : 0);
684 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
685 (s ? "Found" : "Contradicts"),
687 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
689 PL_colors[1], (SvTAIL(must) ? "$" : "")));
692 DEBUG_r(PerlIO_printf(Perl_debug_log,
693 ", giving up...\n"));
696 DEBUG_r(PerlIO_printf(Perl_debug_log,
697 ", trying anchored starting at offset %ld...\n",
698 (long)(s1 + 1 - i_strpos)));
700 s = HOP3c(t, 1, strend);
704 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
705 (long)(s - i_strpos)));
706 other_last = s; /* Fix this later. --Hugo */
715 t = s - prog->check_offset_max;
716 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
717 && (!(prog->reganch & ROPT_UTF8)
718 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
720 /* Fixed substring is found far enough so that the match
721 cannot start at strpos. */
723 if (ml_anch && t[-1] != '\n') {
724 /* Eventually fbm_*() should handle this, but often
725 anchored_offset is not 0, so this check will not be wasted. */
726 /* XXXX In the code below we prefer to look for "^" even in
727 presence of anchored substrings. And we search even
728 beyond the found float position. These pessimizations
729 are historical artefacts only. */
731 while (t < strend - prog->minlen) {
733 if (t < check_at - prog->check_offset_min) {
734 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
735 /* Since we moved from the found position,
736 we definitely contradict the found anchored
737 substr. Due to the above check we do not
738 contradict "check" substr.
739 Thus we can arrive here only if check substr
740 is float. Redo checking for "other"=="fixed".
743 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
744 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
745 goto do_other_anchored;
747 /* We don't contradict the found floating substring. */
748 /* XXXX Why not check for STCLASS? */
750 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
751 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
754 /* Position contradicts check-string */
755 /* XXXX probably better to look for check-string
756 than for "\n", so one should lower the limit for t? */
757 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
758 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
759 other_last = strpos = s = t + 1;
764 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
765 PL_colors[0],PL_colors[1]));
769 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
770 PL_colors[0],PL_colors[1]));
774 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
777 /* The found string does not prohibit matching at strpos,
778 - no optimization of calling REx engine can be performed,
779 unless it was an MBOL and we are not after MBOL,
780 or a future STCLASS check will fail this. */
782 /* Even in this situation we may use MBOL flag if strpos is offset
783 wrt the start of the string. */
784 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
785 && (strpos != strbeg) && strpos[-1] != '\n'
786 /* May be due to an implicit anchor of m{.*foo} */
787 && !(prog->reganch & ROPT_IMPLICIT))
792 DEBUG_r( if (ml_anch)
793 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
794 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
797 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
799 prog->check_utf8 /* Could be deleted already */
800 && --BmUSEFUL(prog->check_utf8) < 0
801 && (prog->check_utf8 == prog->float_utf8)
803 prog->check_substr /* Could be deleted already */
804 && --BmUSEFUL(prog->check_substr) < 0
805 && (prog->check_substr == prog->float_substr)
808 /* If flags & SOMETHING - do not do it many times on the same match */
809 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
810 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
811 if (do_utf8 ? prog->check_substr : prog->check_utf8)
812 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
813 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
814 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
815 check = Nullsv; /* abort */
817 /* XXXX This is a remnant of the old implementation. It
818 looks wasteful, since now INTUIT can use many
820 prog->reganch &= ~RE_USE_INTUIT;
827 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
828 if (prog->regstclass) {
829 /* minlen == 0 is possible if regstclass is \b or \B,
830 and the fixed substr is ''$.
831 Since minlen is already taken into account, s+1 is before strend;
832 accidentally, minlen >= 1 guaranties no false positives at s + 1
833 even for \b or \B. But (minlen? 1 : 0) below assumes that
834 regstclass does not come from lookahead... */
835 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
836 This leaves EXACTF only, which is dealt with in find_byclass(). */
837 U8* str = (U8*)STRING(prog->regstclass);
838 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
839 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
841 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
842 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
843 : (prog->float_substr || prog->float_utf8
844 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
847 char *startpos = strbeg;
850 if (prog->reganch & ROPT_UTF8) {
851 PL_regdata = prog->data;
854 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
859 if (endpos == strend) {
860 DEBUG_r( PerlIO_printf(Perl_debug_log,
861 "Could not match STCLASS...\n") );
864 DEBUG_r( PerlIO_printf(Perl_debug_log,
865 "This position contradicts STCLASS...\n") );
866 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
868 /* Contradict one of substrings */
869 if (prog->anchored_substr || prog->anchored_utf8) {
870 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
871 DEBUG_r( what = "anchored" );
873 s = HOP3c(t, 1, strend);
874 if (s + start_shift + end_shift > strend) {
875 /* XXXX Should be taken into account earlier? */
876 DEBUG_r( PerlIO_printf(Perl_debug_log,
877 "Could not match STCLASS...\n") );
882 DEBUG_r( PerlIO_printf(Perl_debug_log,
883 "Looking for %s substr starting at offset %ld...\n",
884 what, (long)(s + start_shift - i_strpos)) );
887 /* Have both, check_string is floating */
888 if (t + start_shift >= check_at) /* Contradicts floating=check */
889 goto retry_floating_check;
890 /* Recheck anchored substring, but not floating... */
894 DEBUG_r( PerlIO_printf(Perl_debug_log,
895 "Looking for anchored substr starting at offset %ld...\n",
896 (long)(other_last - i_strpos)) );
897 goto do_other_anchored;
899 /* Another way we could have checked stclass at the
900 current position only: */
905 DEBUG_r( PerlIO_printf(Perl_debug_log,
906 "Looking for /%s^%s/m starting at offset %ld...\n",
907 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
910 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
912 /* Check is floating subtring. */
913 retry_floating_check:
914 t = check_at - start_shift;
915 DEBUG_r( what = "floating" );
916 goto hop_and_restart;
919 DEBUG_r(PerlIO_printf(Perl_debug_log,
920 "By STCLASS: moving %ld --> %ld\n",
921 (long)(t - i_strpos), (long)(s - i_strpos))
925 DEBUG_r(PerlIO_printf(Perl_debug_log,
926 "Does not contradict STCLASS...\n");
931 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
932 PL_colors[4], (check ? "Guessed" : "Giving up"),
933 PL_colors[5], (long)(s - i_strpos)) );
936 fail_finish: /* Substring not found */
937 if (prog->check_substr || prog->check_utf8) /* could be removed already */
938 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
940 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
941 PL_colors[4],PL_colors[5]));
945 /* We know what class REx starts with. Try to find this position... */
947 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
949 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
955 register I32 tmp = 1; /* Scratch variable? */
956 register bool do_utf8 = PL_reg_match_utf8;
958 /* We know what class it must start with. */
962 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
965 reginclass(c, (U8*)s, 0, do_utf8) :
966 REGINCLASS(c, (U8*)s) ||
967 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
968 /* The assignment of 2 is intentional:
969 * for the sharp s, the skip is 2. */
970 (skip = SHARP_S_SKIP)
972 if (tmp && (norun || regtry(prog, s)))
984 if (tmp && (norun || regtry(prog, s)))
996 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
997 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
999 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1000 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1002 c1 = utf8_to_uvchr(tmpbuf1, 0);
1003 c2 = utf8_to_uvchr(tmpbuf2, 0);
1014 c2 = PL_fold_locale[c1];
1016 e = do_utf8 ? s + ln : strend - ln;
1019 e = s; /* Due to minlen logic of intuit() */
1021 /* The idea in the EXACTF* cases is to first find the
1022 * first character of the EXACTF* node and then, if
1023 * necessary, case-insensitively compare the full
1024 * text of the node. The c1 and c2 are the first
1025 * characters (though in Unicode it gets a bit
1026 * more complicated because there are more cases
1027 * than just upper and lower: one needs to use
1028 * the so-called folding case for case-insensitive
1029 * matching (called "loose matching" in Unicode).
1030 * ibcmp_utf8() will do just that. */
1034 U8 tmpbuf [UTF8_MAXLEN+1];
1035 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1036 STRLEN len, foldlen;
1040 c = utf8_to_uvchr((U8*)s, &len);
1043 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1044 m, (char **)0, ln, UTF))
1045 && (norun || regtry(prog, s)) )
1048 uvchr_to_utf8(tmpbuf, c);
1049 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1051 && (f == c1 || f == c2)
1052 && (ln == foldlen ||
1053 !ibcmp_utf8((char *) foldbuf,
1054 (char **)0, foldlen, do_utf8,
1056 (char **)0, ln, UTF))
1057 && (norun || regtry(prog, s)) )
1065 c = utf8_to_uvchr((U8*)s, &len);
1067 /* Handle some of the three Greek sigmas cases.
1068 * Note that not all the possible combinations
1069 * are handled here: some of them are handled
1070 * by the standard folding rules, and some of
1071 * them (the character class or ANYOF cases)
1072 * are handled during compiletime in
1073 * regexec.c:S_regclass(). */
1074 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1075 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1076 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1078 if ( (c == c1 || c == c2)
1080 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1081 m, (char **)0, ln, UTF))
1082 && (norun || regtry(prog, s)) )
1085 uvchr_to_utf8(tmpbuf, c);
1086 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1088 && (f == c1 || f == c2)
1089 && (ln == foldlen ||
1090 !ibcmp_utf8((char *) foldbuf,
1091 (char **)0, foldlen, do_utf8,
1093 (char **)0, ln, UTF))
1094 && (norun || regtry(prog, s)) )
1105 && (ln == 1 || !(OP(c) == EXACTF
1107 : ibcmp_locale(s, m, ln)))
1108 && (norun || regtry(prog, s)) )
1114 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1115 && (ln == 1 || !(OP(c) == EXACTF
1117 : ibcmp_locale(s, m, ln)))
1118 && (norun || regtry(prog, s)) )
1125 PL_reg_flags |= RF_tainted;
1132 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1135 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1137 tmp = ((OP(c) == BOUND ?
1138 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1139 LOAD_UTF8_CHARCLASS(alnum,"a");
1140 while (s < strend) {
1141 if (tmp == !(OP(c) == BOUND ?
1142 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1143 isALNUM_LC_utf8((U8*)s)))
1146 if ((norun || regtry(prog, s)))
1153 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1154 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1155 while (s < strend) {
1157 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1159 if ((norun || regtry(prog, s)))
1165 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1169 PL_reg_flags |= RF_tainted;
1176 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1179 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1181 tmp = ((OP(c) == NBOUND ?
1182 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1183 LOAD_UTF8_CHARCLASS(alnum,"a");
1184 while (s < strend) {
1185 if (tmp == !(OP(c) == NBOUND ?
1186 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1187 isALNUM_LC_utf8((U8*)s)))
1189 else if ((norun || regtry(prog, s)))
1195 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1196 tmp = ((OP(c) == NBOUND ?
1197 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1198 while (s < strend) {
1200 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1202 else if ((norun || regtry(prog, s)))
1207 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1212 LOAD_UTF8_CHARCLASS(alnum,"a");
1213 while (s < strend) {
1214 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1215 if (tmp && (norun || regtry(prog, s)))
1226 while (s < strend) {
1228 if (tmp && (norun || regtry(prog, s)))
1240 PL_reg_flags |= RF_tainted;
1242 while (s < strend) {
1243 if (isALNUM_LC_utf8((U8*)s)) {
1244 if (tmp && (norun || regtry(prog, s)))
1255 while (s < strend) {
1256 if (isALNUM_LC(*s)) {
1257 if (tmp && (norun || regtry(prog, s)))
1270 LOAD_UTF8_CHARCLASS(alnum,"a");
1271 while (s < strend) {
1272 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1273 if (tmp && (norun || regtry(prog, s)))
1284 while (s < strend) {
1286 if (tmp && (norun || regtry(prog, s)))
1298 PL_reg_flags |= RF_tainted;
1300 while (s < strend) {
1301 if (!isALNUM_LC_utf8((U8*)s)) {
1302 if (tmp && (norun || regtry(prog, s)))
1313 while (s < strend) {
1314 if (!isALNUM_LC(*s)) {
1315 if (tmp && (norun || regtry(prog, s)))
1328 LOAD_UTF8_CHARCLASS(space," ");
1329 while (s < strend) {
1330 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1331 if (tmp && (norun || regtry(prog, s)))
1342 while (s < strend) {
1344 if (tmp && (norun || regtry(prog, s)))
1356 PL_reg_flags |= RF_tainted;
1358 while (s < strend) {
1359 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1360 if (tmp && (norun || regtry(prog, s)))
1371 while (s < strend) {
1372 if (isSPACE_LC(*s)) {
1373 if (tmp && (norun || regtry(prog, s)))
1386 LOAD_UTF8_CHARCLASS(space," ");
1387 while (s < strend) {
1388 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1389 if (tmp && (norun || regtry(prog, s)))
1400 while (s < strend) {
1402 if (tmp && (norun || regtry(prog, s)))
1414 PL_reg_flags |= RF_tainted;
1416 while (s < strend) {
1417 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1418 if (tmp && (norun || regtry(prog, s)))
1429 while (s < strend) {
1430 if (!isSPACE_LC(*s)) {
1431 if (tmp && (norun || regtry(prog, s)))
1444 LOAD_UTF8_CHARCLASS(digit,"0");
1445 while (s < strend) {
1446 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1447 if (tmp && (norun || regtry(prog, s)))
1458 while (s < strend) {
1460 if (tmp && (norun || regtry(prog, s)))
1472 PL_reg_flags |= RF_tainted;
1474 while (s < strend) {
1475 if (isDIGIT_LC_utf8((U8*)s)) {
1476 if (tmp && (norun || regtry(prog, s)))
1487 while (s < strend) {
1488 if (isDIGIT_LC(*s)) {
1489 if (tmp && (norun || regtry(prog, s)))
1502 LOAD_UTF8_CHARCLASS(digit,"0");
1503 while (s < strend) {
1504 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1505 if (tmp && (norun || regtry(prog, s)))
1516 while (s < strend) {
1518 if (tmp && (norun || regtry(prog, s)))
1530 PL_reg_flags |= RF_tainted;
1532 while (s < strend) {
1533 if (!isDIGIT_LC_utf8((U8*)s)) {
1534 if (tmp && (norun || regtry(prog, s)))
1545 while (s < strend) {
1546 if (!isDIGIT_LC(*s)) {
1547 if (tmp && (norun || regtry(prog, s)))
1559 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1568 - regexec_flags - match a regexp against a string
1571 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1572 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1573 /* strend: pointer to null at end of string */
1574 /* strbeg: real beginning of string */
1575 /* minend: end of match must be >=minend after stringarg. */
1576 /* data: May be used for some additional optimizations. */
1577 /* nosave: For optimizations. */
1580 register regnode *c;
1581 register char *startpos = stringarg;
1582 I32 minlen; /* must match at least this many chars */
1583 I32 dontbother = 0; /* how many characters not to try at end */
1584 /* I32 start_shift = 0; */ /* Offset of the start to find
1585 constant substr. */ /* CC */
1586 I32 end_shift = 0; /* Same for the end. */ /* CC */
1587 I32 scream_pos = -1; /* Internal iterator of scream. */
1589 SV* oreplsv = GvSV(PL_replgv);
1590 bool do_utf8 = DO_UTF8(sv);
1592 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1593 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1600 PL_regnarrate = DEBUG_r_TEST;
1603 /* Be paranoid... */
1604 if (prog == NULL || startpos == NULL) {
1605 Perl_croak(aTHX_ "NULL regexp parameter");
1609 minlen = prog->minlen;
1610 if (strend - startpos < minlen) {
1611 DEBUG_r(PerlIO_printf(Perl_debug_log,
1612 "String too short [regexec_flags]...\n"));
1616 /* Check validity of program. */
1617 if (UCHARAT(prog->program) != REG_MAGIC) {
1618 Perl_croak(aTHX_ "corrupted regexp program");
1622 PL_reg_eval_set = 0;
1625 if (prog->reganch & ROPT_UTF8)
1626 PL_reg_flags |= RF_utf8;
1628 /* Mark beginning of line for ^ and lookbehind. */
1629 PL_regbol = startpos;
1633 /* Mark end of line for $ (and such) */
1636 /* see how far we have to get to not match where we matched before */
1637 PL_regtill = startpos+minend;
1639 /* We start without call_cc context. */
1642 /* If there is a "must appear" string, look for it. */
1645 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1648 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1649 PL_reg_ganch = startpos;
1650 else if (sv && SvTYPE(sv) >= SVt_PVMG
1652 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1653 && mg->mg_len >= 0) {
1654 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1655 if (prog->reganch & ROPT_ANCH_GPOS) {
1656 if (s > PL_reg_ganch)
1661 else /* pos() not defined */
1662 PL_reg_ganch = strbeg;
1665 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1666 re_scream_pos_data d;
1668 d.scream_olds = &scream_olds;
1669 d.scream_pos = &scream_pos;
1670 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1672 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1673 goto phooey; /* not present */
1679 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1680 UNI_DISPLAY_REGEX) :
1682 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1683 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1684 UNI_DISPLAY_REGEX) : startpos;
1685 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1688 PerlIO_printf(Perl_debug_log,
1689 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1690 PL_colors[4],PL_colors[5],PL_colors[0],
1693 len0 > 60 ? "..." : "",
1695 (int)(len1 > 60 ? 60 : len1),
1697 (len1 > 60 ? "..." : "")
1701 /* Simplest case: anchored match need be tried only once. */
1702 /* [unless only anchor is BOL and multiline is set] */
1703 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1704 if (s == startpos && regtry(prog, startpos))
1706 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1707 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1712 dontbother = minlen - 1;
1713 end = HOP3c(strend, -dontbother, strbeg) - 1;
1714 /* for multiline we only have to try after newlines */
1715 if (prog->check_substr || prog->check_utf8) {
1719 if (regtry(prog, s))
1724 if (prog->reganch & RE_USE_INTUIT) {
1725 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1736 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1737 if (regtry(prog, s))
1744 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1745 if (regtry(prog, PL_reg_ganch))
1750 /* Messy cases: unanchored match. */
1751 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1752 /* we have /x+whatever/ */
1753 /* it must be a one character string (XXXX Except UTF?) */
1758 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1759 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1760 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1763 while (s < strend) {
1765 DEBUG_r( did_match = 1 );
1766 if (regtry(prog, s)) goto got_it;
1768 while (s < strend && *s == ch)
1775 while (s < strend) {
1777 DEBUG_r( did_match = 1 );
1778 if (regtry(prog, s)) goto got_it;
1780 while (s < strend && *s == ch)
1786 DEBUG_r(if (!did_match)
1787 PerlIO_printf(Perl_debug_log,
1788 "Did not find anchored character...\n")
1792 else if (prog->anchored_substr != Nullsv
1793 || prog->anchored_utf8 != Nullsv
1794 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1795 && prog->float_max_offset < strend - s)) {
1800 char *last1; /* Last position checked before */
1804 if (prog->anchored_substr || prog->anchored_utf8) {
1805 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1806 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1807 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1808 back_max = back_min = prog->anchored_offset;
1810 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1811 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1812 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1813 back_max = prog->float_max_offset;
1814 back_min = prog->float_min_offset;
1816 if (must == &PL_sv_undef)
1817 /* could not downgrade utf8 check substring, so must fail */
1820 last = HOP3c(strend, /* Cannot start after this */
1821 -(I32)(CHR_SVLEN(must)
1822 - (SvTAIL(must) != 0) + back_min), strbeg);
1825 last1 = HOPc(s, -1);
1827 last1 = s - 1; /* bogus */
1829 /* XXXX check_substr already used to find `s', can optimize if
1830 check_substr==must. */
1832 dontbother = end_shift;
1833 strend = HOPc(strend, -dontbother);
1834 while ( (s <= last) &&
1835 ((flags & REXEC_SCREAM)
1836 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1837 end_shift, &scream_pos, 0))
1838 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1839 (unsigned char*)strend, must,
1840 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1841 DEBUG_r( did_match = 1 );
1842 if (HOPc(s, -back_max) > last1) {
1843 last1 = HOPc(s, -back_min);
1844 s = HOPc(s, -back_max);
1847 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1849 last1 = HOPc(s, -back_min);
1853 while (s <= last1) {
1854 if (regtry(prog, s))
1860 while (s <= last1) {
1861 if (regtry(prog, s))
1867 DEBUG_r(if (!did_match)
1868 PerlIO_printf(Perl_debug_log,
1869 "Did not find %s substr `%s%.*s%s'%s...\n",
1870 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1871 ? "anchored" : "floating"),
1873 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1875 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1879 else if ((c = prog->regstclass)) {
1880 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1881 /* don't bother with what can't match */
1882 strend = HOPc(strend, -(minlen - 1));
1884 SV *prop = sv_newmortal();
1892 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1893 UNI_DISPLAY_REGEX) :
1895 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1897 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1898 len1 = UTF ? SvCUR(dsv1) : strend - s;
1899 PerlIO_printf(Perl_debug_log,
1900 "Matching stclass `%*.*s' against `%*.*s'\n",
1904 if (find_byclass(prog, c, s, strend, startpos, 0))
1906 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1910 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1915 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1916 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1917 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1919 if (flags & REXEC_SCREAM) {
1920 last = screaminstr(sv, float_real, s - strbeg,
1921 end_shift, &scream_pos, 1); /* last one */
1923 last = scream_olds; /* Only one occurrence. */
1927 char *little = SvPV(float_real, len);
1929 if (SvTAIL(float_real)) {
1930 if (memEQ(strend - len + 1, little, len - 1))
1931 last = strend - len + 1;
1932 else if (!PL_multiline)
1933 last = memEQ(strend - len, little, len)
1934 ? strend - len : Nullch;
1940 last = rninstr(s, strend, little, little + len);
1942 last = strend; /* matching `$' */
1946 DEBUG_r(PerlIO_printf(Perl_debug_log,
1947 "%sCan't trim the tail, match fails (should not happen)%s\n",
1948 PL_colors[4],PL_colors[5]));
1949 goto phooey; /* Should not happen! */
1951 dontbother = strend - last + prog->float_min_offset;
1953 if (minlen && (dontbother < minlen))
1954 dontbother = minlen - 1;
1955 strend -= dontbother; /* this one's always in bytes! */
1956 /* We don't know much -- general case. */
1959 if (regtry(prog, s))
1968 if (regtry(prog, s))
1970 } while (s++ < strend);
1978 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1980 if (PL_reg_eval_set) {
1981 /* Preserve the current value of $^R */
1982 if (oreplsv != GvSV(PL_replgv))
1983 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1984 restored, the value remains
1986 restore_pos(aTHX_ 0);
1989 /* make sure $`, $&, $', and $digit will work later */
1990 if ( !(flags & REXEC_NOT_FIRST) ) {
1991 if (RX_MATCH_COPIED(prog)) {
1992 Safefree(prog->subbeg);
1993 RX_MATCH_COPIED_off(prog);
1995 if (flags & REXEC_COPY_STR) {
1996 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1998 s = savepvn(strbeg, i);
2001 RX_MATCH_COPIED_on(prog);
2004 prog->subbeg = strbeg;
2005 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2012 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2013 PL_colors[4],PL_colors[5]));
2014 if (PL_reg_eval_set)
2015 restore_pos(aTHX_ 0);
2020 - regtry - try match at specific point
2022 STATIC I32 /* 0 failure, 1 success */
2023 S_regtry(pTHX_ regexp *prog, char *startpos)
2031 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2033 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2036 PL_reg_eval_set = RS_init;
2038 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2039 (IV)(PL_stack_sp - PL_stack_base));
2041 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2042 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2043 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2045 /* Apparently this is not needed, judging by wantarray. */
2046 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2047 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2050 /* Make $_ available to executed code. */
2051 if (PL_reg_sv != DEFSV) {
2052 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
2057 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2058 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2059 /* prepare for quick setting of pos */
2060 sv_magic(PL_reg_sv, (SV*)0,
2061 PERL_MAGIC_regex_global, Nullch, 0);
2062 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2066 PL_reg_oldpos = mg->mg_len;
2067 SAVEDESTRUCTOR_X(restore_pos, 0);
2069 if (!PL_reg_curpm) {
2070 Newz(22,PL_reg_curpm, 1, PMOP);
2073 SV* repointer = newSViv(0);
2074 /* so we know which PL_regex_padav element is PL_reg_curpm */
2075 SvFLAGS(repointer) |= SVf_BREAK;
2076 av_push(PL_regex_padav,repointer);
2077 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2078 PL_regex_pad = AvARRAY(PL_regex_padav);
2082 PM_SETRE(PL_reg_curpm, prog);
2083 PL_reg_oldcurpm = PL_curpm;
2084 PL_curpm = PL_reg_curpm;
2085 if (RX_MATCH_COPIED(prog)) {
2086 /* Here is a serious problem: we cannot rewrite subbeg,
2087 since it may be needed if this match fails. Thus
2088 $` inside (?{}) could fail... */
2089 PL_reg_oldsaved = prog->subbeg;
2090 PL_reg_oldsavedlen = prog->sublen;
2091 RX_MATCH_COPIED_off(prog);
2094 PL_reg_oldsaved = Nullch;
2095 prog->subbeg = PL_bostr;
2096 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2098 prog->startp[0] = startpos - PL_bostr;
2099 PL_reginput = startpos;
2100 PL_regstartp = prog->startp;
2101 PL_regendp = prog->endp;
2102 PL_reglastparen = &prog->lastparen;
2103 PL_reglastcloseparen = &prog->lastcloseparen;
2104 prog->lastparen = 0;
2106 DEBUG_r(PL_reg_starttry = startpos);
2107 if (PL_reg_start_tmpl <= prog->nparens) {
2108 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2109 if(PL_reg_start_tmp)
2110 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2112 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2115 /* XXXX What this code is doing here?!!! There should be no need
2116 to do this again and again, PL_reglastparen should take care of
2119 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2120 * Actually, the code in regcppop() (which Ilya may be meaning by
2121 * PL_reglastparen), is not needed at all by the test suite
2122 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2123 * enough, for building DynaLoader, or otherwise this
2124 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2125 * will happen. Meanwhile, this code *is* needed for the
2126 * above-mentioned test suite tests to succeed. The common theme
2127 * on those tests seems to be returning null fields from matches.
2132 if (prog->nparens) {
2133 for (i = prog->nparens; i > *PL_reglastparen; i--) {
2140 if (regmatch(prog->program + 1)) {
2141 prog->endp[0] = PL_reginput - PL_bostr;
2144 REGCP_UNWIND(lastcp);
2148 #define RE_UNWIND_BRANCH 1
2149 #define RE_UNWIND_BRANCHJ 2
2153 typedef struct { /* XX: makes sense to enlarge it... */
2157 } re_unwind_generic_t;
2170 } re_unwind_branch_t;
2172 typedef union re_unwind_t {
2174 re_unwind_generic_t generic;
2175 re_unwind_branch_t branch;
2178 #define sayYES goto yes
2179 #define sayNO goto no
2180 #define sayNO_ANYOF goto no_anyof
2181 #define sayYES_FINAL goto yes_final
2182 #define sayYES_LOUD goto yes_loud
2183 #define sayNO_FINAL goto no_final
2184 #define sayNO_SILENT goto do_no
2185 #define saySAME(x) if (x) goto yes; else goto no
2187 #define REPORT_CODE_OFF 24
2190 - regmatch - main matching routine
2192 * Conceptually the strategy is simple: check to see whether the current
2193 * node matches, call self recursively to see whether the rest matches,
2194 * and then act accordingly. In practice we make some effort to avoid
2195 * recursion, in particular by going through "ordinary" nodes (that don't
2196 * need to know whether the rest of the match failed) by a loop instead of
2199 /* [lwall] I've hoisted the register declarations to the outer block in order to
2200 * maybe save a little bit of pushing and popping on the stack. It also takes
2201 * advantage of machines that use a register save mask on subroutine entry.
2203 STATIC I32 /* 0 failure, 1 success */
2204 S_regmatch(pTHX_ regnode *prog)
2206 register regnode *scan; /* Current node. */
2207 regnode *next; /* Next node. */
2208 regnode *inner; /* Next node in internal branch. */
2209 register I32 nextchr; /* renamed nextchr - nextchar colides with
2210 function of same name */
2211 register I32 n; /* no or next */
2212 register I32 ln = 0; /* len or last */
2213 register char *s = Nullch; /* operand or save */
2214 register char *locinput = PL_reginput;
2215 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2216 int minmod = 0, sw = 0, logical = 0;
2219 I32 firstcp = PL_savestack_ix;
2221 register bool do_utf8 = PL_reg_match_utf8;
2223 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2224 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2225 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2232 /* Note that nextchr is a byte even in UTF */
2233 nextchr = UCHARAT(locinput);
2235 while (scan != NULL) {
2238 SV *prop = sv_newmortal();
2239 int docolor = *PL_colors[0];
2240 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2241 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2242 /* The part of the string before starttry has one color
2243 (pref0_len chars), between starttry and current
2244 position another one (pref_len - pref0_len chars),
2245 after the current position the third one.
2246 We assume that pref0_len <= pref_len, otherwise we
2247 decrease pref0_len. */
2248 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2249 ? (5 + taill) - l : locinput - PL_bostr;
2252 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2254 pref0_len = pref_len - (locinput - PL_reg_starttry);
2255 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2256 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2257 ? (5 + taill) - pref_len : PL_regeol - locinput);
2258 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2262 if (pref0_len > pref_len)
2263 pref0_len = pref_len;
2264 regprop(prop, scan);
2268 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2269 pref0_len, 60, UNI_DISPLAY_REGEX) :
2270 locinput - pref_len;
2271 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2272 char *s1 = do_utf8 ?
2273 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2274 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2275 locinput - pref_len + pref0_len;
2276 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2277 char *s2 = do_utf8 ?
2278 pv_uni_display(dsv2, (U8*)locinput,
2279 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2281 int len2 = do_utf8 ? strlen(s2) : l;
2282 PerlIO_printf(Perl_debug_log,
2283 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2284 (IV)(locinput - PL_bostr),
2291 (docolor ? "" : "> <"),
2295 15 - l - pref_len + 1,
2297 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2302 next = scan + NEXT_OFF(scan);
2308 if (locinput == PL_bostr || (PL_multiline &&
2309 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2311 /* regtill = regbol; */
2316 if (locinput == PL_bostr ||
2317 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2323 if (locinput == PL_bostr)
2327 if (locinput == PL_reg_ganch)
2337 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2342 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2344 if (PL_regeol - locinput > 1)
2348 if (PL_regeol != locinput)
2352 if (!nextchr && locinput >= PL_regeol)
2355 locinput += PL_utf8skip[nextchr];
2356 if (locinput > PL_regeol)
2358 nextchr = UCHARAT(locinput);
2361 nextchr = UCHARAT(++locinput);
2364 if (!nextchr && locinput >= PL_regeol)
2366 nextchr = UCHARAT(++locinput);
2369 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2372 locinput += PL_utf8skip[nextchr];
2373 if (locinput > PL_regeol)
2375 nextchr = UCHARAT(locinput);
2378 nextchr = UCHARAT(++locinput);
2383 if (do_utf8 != (UTF!=0)) {
2384 /* The target and the pattern have differing utf8ness. */
2390 /* The target is utf8, the pattern is not utf8. */
2394 if (NATIVE_TO_UNI(*(U8*)s) !=
2395 utf8_to_uvuni((U8*)l, &ulen))
2402 /* The target is not utf8, the pattern is utf8. */
2406 if (NATIVE_TO_UNI(*((U8*)l)) !=
2407 utf8_to_uvuni((U8*)s, &ulen))
2414 nextchr = UCHARAT(locinput);
2417 /* The target and the pattern have the same utf8ness. */
2418 /* Inline the first character, for speed. */
2419 if (UCHARAT(s) != nextchr)
2421 if (PL_regeol - locinput < ln)
2423 if (ln > 1 && memNE(s, locinput, ln))
2426 nextchr = UCHARAT(locinput);
2429 PL_reg_flags |= RF_tainted;
2435 if (do_utf8 || UTF) {
2436 /* Either target or the pattern are utf8. */
2438 char *e = PL_regeol;
2440 if (ibcmp_utf8(s, 0, ln, UTF,
2441 l, &e, 0, do_utf8)) {
2442 /* One more case for the sharp s:
2443 * pack("U0U*", 0xDF) =~ /ss/i,
2444 * the 0xC3 0x9F are the UTF-8
2445 * byte sequence for the U+00DF. */
2447 toLOWER(s[0]) == 's' &&
2449 toLOWER(s[1]) == 's' &&
2456 nextchr = UCHARAT(locinput);
2460 /* Neither the target and the pattern are utf8. */
2462 /* Inline the first character, for speed. */
2463 if (UCHARAT(s) != nextchr &&
2464 UCHARAT(s) != ((OP(scan) == EXACTF)
2465 ? PL_fold : PL_fold_locale)[nextchr])
2467 if (PL_regeol - locinput < ln)
2469 if (ln > 1 && (OP(scan) == EXACTF
2470 ? ibcmp(s, locinput, ln)
2471 : ibcmp_locale(s, locinput, ln)))
2474 nextchr = UCHARAT(locinput);
2478 STRLEN inclasslen = PL_regeol - locinput;
2480 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2482 if (locinput >= PL_regeol)
2484 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2485 nextchr = UCHARAT(locinput);
2490 nextchr = UCHARAT(locinput);
2491 if (!REGINCLASS(scan, (U8*)locinput))
2493 if (!nextchr && locinput >= PL_regeol)
2495 nextchr = UCHARAT(++locinput);
2499 /* If we might have the case of the German sharp s
2500 * in a casefolding Unicode character class. */
2502 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2503 locinput += SHARP_S_SKIP;
2504 nextchr = UCHARAT(locinput);
2510 PL_reg_flags |= RF_tainted;
2516 LOAD_UTF8_CHARCLASS(alnum,"a");
2517 if (!(OP(scan) == ALNUM
2518 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2519 : isALNUM_LC_utf8((U8*)locinput)))
2523 locinput += PL_utf8skip[nextchr];
2524 nextchr = UCHARAT(locinput);
2527 if (!(OP(scan) == ALNUM
2528 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2530 nextchr = UCHARAT(++locinput);
2533 PL_reg_flags |= RF_tainted;
2536 if (!nextchr && locinput >= PL_regeol)
2539 LOAD_UTF8_CHARCLASS(alnum,"a");
2540 if (OP(scan) == NALNUM
2541 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2542 : isALNUM_LC_utf8((U8*)locinput))
2546 locinput += PL_utf8skip[nextchr];
2547 nextchr = UCHARAT(locinput);
2550 if (OP(scan) == NALNUM
2551 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2553 nextchr = UCHARAT(++locinput);
2557 PL_reg_flags |= RF_tainted;
2561 /* was last char in word? */
2563 if (locinput == PL_bostr)
2566 U8 *r = reghop((U8*)locinput, -1);
2568 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2570 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2571 ln = isALNUM_uni(ln);
2572 LOAD_UTF8_CHARCLASS(alnum,"a");
2573 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2576 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2577 n = isALNUM_LC_utf8((U8*)locinput);
2581 ln = (locinput != PL_bostr) ?
2582 UCHARAT(locinput - 1) : '\n';
2583 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2585 n = isALNUM(nextchr);
2588 ln = isALNUM_LC(ln);
2589 n = isALNUM_LC(nextchr);
2592 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2593 OP(scan) == BOUNDL))
2597 PL_reg_flags |= RF_tainted;
2603 if (UTF8_IS_CONTINUED(nextchr)) {
2604 LOAD_UTF8_CHARCLASS(space," ");
2605 if (!(OP(scan) == SPACE
2606 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2607 : isSPACE_LC_utf8((U8*)locinput)))
2611 locinput += PL_utf8skip[nextchr];
2612 nextchr = UCHARAT(locinput);
2615 if (!(OP(scan) == SPACE
2616 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2618 nextchr = UCHARAT(++locinput);
2621 if (!(OP(scan) == SPACE
2622 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2624 nextchr = UCHARAT(++locinput);
2628 PL_reg_flags |= RF_tainted;
2631 if (!nextchr && locinput >= PL_regeol)
2634 LOAD_UTF8_CHARCLASS(space," ");
2635 if (OP(scan) == NSPACE
2636 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2637 : isSPACE_LC_utf8((U8*)locinput))
2641 locinput += PL_utf8skip[nextchr];
2642 nextchr = UCHARAT(locinput);
2645 if (OP(scan) == NSPACE
2646 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2648 nextchr = UCHARAT(++locinput);
2651 PL_reg_flags |= RF_tainted;
2657 LOAD_UTF8_CHARCLASS(digit,"0");
2658 if (!(OP(scan) == DIGIT
2659 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2660 : isDIGIT_LC_utf8((U8*)locinput)))
2664 locinput += PL_utf8skip[nextchr];
2665 nextchr = UCHARAT(locinput);
2668 if (!(OP(scan) == DIGIT
2669 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2671 nextchr = UCHARAT(++locinput);
2674 PL_reg_flags |= RF_tainted;
2677 if (!nextchr && locinput >= PL_regeol)
2680 LOAD_UTF8_CHARCLASS(digit,"0");
2681 if (OP(scan) == NDIGIT
2682 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2683 : isDIGIT_LC_utf8((U8*)locinput))
2687 locinput += PL_utf8skip[nextchr];
2688 nextchr = UCHARAT(locinput);
2691 if (OP(scan) == NDIGIT
2692 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2694 nextchr = UCHARAT(++locinput);
2697 if (locinput >= PL_regeol)
2700 LOAD_UTF8_CHARCLASS(mark,"~");
2701 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2703 locinput += PL_utf8skip[nextchr];
2704 while (locinput < PL_regeol &&
2705 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2706 locinput += UTF8SKIP(locinput);
2707 if (locinput > PL_regeol)
2712 nextchr = UCHARAT(locinput);
2715 PL_reg_flags |= RF_tainted;
2719 n = ARG(scan); /* which paren pair */
2720 ln = PL_regstartp[n];
2721 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2722 if (*PL_reglastparen < n || ln == -1)
2723 sayNO; /* Do not match unless seen CLOSEn. */
2724 if (ln == PL_regendp[n])
2728 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2730 char *e = PL_bostr + PL_regendp[n];
2732 * Note that we can't do the "other character" lookup trick as
2733 * in the 8-bit case (no pun intended) because in Unicode we
2734 * have to map both upper and title case to lower case.
2736 if (OP(scan) == REFF) {
2737 STRLEN ulen1, ulen2;
2738 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2739 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
2743 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2744 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2745 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2752 nextchr = UCHARAT(locinput);
2756 /* Inline the first character, for speed. */
2757 if (UCHARAT(s) != nextchr &&
2759 (UCHARAT(s) != ((OP(scan) == REFF
2760 ? PL_fold : PL_fold_locale)[nextchr]))))
2762 ln = PL_regendp[n] - ln;
2763 if (locinput + ln > PL_regeol)
2765 if (ln > 1 && (OP(scan) == REF
2766 ? memNE(s, locinput, ln)
2768 ? ibcmp(s, locinput, ln)
2769 : ibcmp_locale(s, locinput, ln))))
2772 nextchr = UCHARAT(locinput);
2783 OP_4tree *oop = PL_op;
2784 COP *ocurcop = PL_curcop;
2785 SV **ocurpad = PL_curpad;
2789 PL_op = (OP_4tree*)PL_regdata->data[n];
2790 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2791 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2792 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2796 CALLRUNOPS(aTHX); /* Scalar context. */
2799 ret = Nullsv; /* protect against empty (?{}) blocks. */
2807 PL_curpad = ocurpad;
2808 PL_curcop = ocurcop;
2810 if (logical == 2) { /* Postponed subexpression. */
2812 MAGIC *mg = Null(MAGIC*);
2814 CHECKPOINT cp, lastcp;
2816 if(SvROK(ret) || SvRMAGICAL(ret)) {
2817 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2820 mg = mg_find(sv, PERL_MAGIC_qr);
2823 re = (regexp *)mg->mg_obj;
2824 (void)ReREFCNT_inc(re);
2828 char *t = SvPV(ret, len);
2830 char *oprecomp = PL_regprecomp;
2831 I32 osize = PL_regsize;
2832 I32 onpar = PL_regnpar;
2835 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2837 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2838 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2840 PL_regprecomp = oprecomp;
2845 PerlIO_printf(Perl_debug_log,
2846 "Entering embedded `%s%.60s%s%s'\n",
2850 (strlen(re->precomp) > 60 ? "..." : ""))
2853 state.prev = PL_reg_call_cc;
2854 state.cc = PL_regcc;
2855 state.re = PL_reg_re;
2859 cp = regcppush(0); /* Save *all* the positions. */
2862 state.ss = PL_savestack_ix;
2863 *PL_reglastparen = 0;
2864 *PL_reglastcloseparen = 0;
2865 PL_reg_call_cc = &state;
2866 PL_reginput = locinput;
2868 /* XXXX This is too dramatic a measure... */
2871 if (regmatch(re->program + 1)) {
2872 /* Even though we succeeded, we need to restore
2873 global variables, since we may be wrapped inside
2874 SUSPEND, thus the match may be not finished yet. */
2876 /* XXXX Do this only if SUSPENDed? */
2877 PL_reg_call_cc = state.prev;
2878 PL_regcc = state.cc;
2879 PL_reg_re = state.re;
2880 cache_re(PL_reg_re);
2882 /* XXXX This is too dramatic a measure... */
2885 /* These are needed even if not SUSPEND. */
2891 REGCP_UNWIND(lastcp);
2893 PL_reg_call_cc = state.prev;
2894 PL_regcc = state.cc;
2895 PL_reg_re = state.re;
2896 cache_re(PL_reg_re);
2898 /* XXXX This is too dramatic a measure... */
2908 sv_setsv(save_scalar(PL_replgv), ret);
2912 n = ARG(scan); /* which paren pair */
2913 PL_reg_start_tmp[n] = locinput;
2918 n = ARG(scan); /* which paren pair */
2919 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2920 PL_regendp[n] = locinput - PL_bostr;
2921 if (n > *PL_reglastparen)
2922 *PL_reglastparen = n;
2923 *PL_reglastcloseparen = n;
2926 n = ARG(scan); /* which paren pair */
2927 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2930 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2932 next = NEXTOPER(NEXTOPER(scan));
2934 next = scan + ARG(scan);
2935 if (OP(next) == IFTHEN) /* Fake one. */
2936 next = NEXTOPER(NEXTOPER(next));
2940 logical = scan->flags;
2942 /*******************************************************************
2943 PL_regcc contains infoblock about the innermost (...)* loop, and
2944 a pointer to the next outer infoblock.
2946 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2948 1) After matching X, regnode for CURLYX is processed;
2950 2) This regnode creates infoblock on the stack, and calls
2951 regmatch() recursively with the starting point at WHILEM node;
2953 3) Each hit of WHILEM node tries to match A and Z (in the order
2954 depending on the current iteration, min/max of {min,max} and
2955 greediness). The information about where are nodes for "A"
2956 and "Z" is read from the infoblock, as is info on how many times "A"
2957 was already matched, and greediness.
2959 4) After A matches, the same WHILEM node is hit again.
2961 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2962 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2963 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2964 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2965 of the external loop.
2967 Currently present infoblocks form a tree with a stem formed by PL_curcc
2968 and whatever it mentions via ->next, and additional attached trees
2969 corresponding to temporarily unset infoblocks as in "5" above.
2971 In the following picture infoblocks for outer loop of
2972 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2973 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2974 infoblocks are drawn below the "reset" infoblock.
2976 In fact in the picture below we do not show failed matches for Z and T
2977 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2978 more obvious *why* one needs to *temporary* unset infoblocks.]
2980 Matched REx position InfoBlocks Comment
2984 Y A)*?Z)*?T x <- O <- I
2985 YA )*?Z)*?T x <- O <- I
2986 YA A)*?Z)*?T x <- O <- I
2987 YAA )*?Z)*?T x <- O <- I
2988 YAA Z)*?T x <- O # Temporary unset I
2991 YAAZ Y(A)*?Z)*?T x <- O
2994 YAAZY (A)*?Z)*?T x <- O
2997 YAAZY A)*?Z)*?T x <- O <- I
3000 YAAZYA )*?Z)*?T x <- O <- I
3003 YAAZYA Z)*?T x <- O # Temporary unset I
3009 YAAZYAZ T x # Temporary unset O
3016 *******************************************************************/
3019 CHECKPOINT cp = PL_savestack_ix;
3020 /* No need to save/restore up to this paren */
3021 I32 parenfloor = scan->flags;
3023 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3025 cc.oldcc = PL_regcc;
3027 /* XXXX Probably it is better to teach regpush to support
3028 parenfloor > PL_regsize... */
3029 if (parenfloor > *PL_reglastparen)
3030 parenfloor = *PL_reglastparen; /* Pessimization... */
3031 cc.parenfloor = parenfloor;
3033 cc.min = ARG1(scan);
3034 cc.max = ARG2(scan);
3035 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3039 PL_reginput = locinput;
3040 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3042 PL_regcc = cc.oldcc;
3048 * This is really hard to understand, because after we match
3049 * what we're trying to match, we must make sure the rest of
3050 * the REx is going to match for sure, and to do that we have
3051 * to go back UP the parse tree by recursing ever deeper. And
3052 * if it fails, we have to reset our parent's current state
3053 * that we can try again after backing off.
3056 CHECKPOINT cp, lastcp;
3057 CURCUR* cc = PL_regcc;
3058 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3060 n = cc->cur + 1; /* how many we know we matched */
3061 PL_reginput = locinput;
3064 PerlIO_printf(Perl_debug_log,
3065 "%*s %ld out of %ld..%ld cc=%lx\n",
3066 REPORT_CODE_OFF+PL_regindent*2, "",
3067 (long)n, (long)cc->min,
3068 (long)cc->max, (long)cc)
3071 /* If degenerate scan matches "", assume scan done. */
3073 if (locinput == cc->lastloc && n >= cc->min) {
3074 PL_regcc = cc->oldcc;
3078 PerlIO_printf(Perl_debug_log,
3079 "%*s empty match detected, try continuation...\n",
3080 REPORT_CODE_OFF+PL_regindent*2, "")
3082 if (regmatch(cc->next))
3090 /* First just match a string of min scans. */
3094 cc->lastloc = locinput;
3095 if (regmatch(cc->scan))
3098 cc->lastloc = lastloc;
3103 /* Check whether we already were at this position.
3104 Postpone detection until we know the match is not
3105 *that* much linear. */
3106 if (!PL_reg_maxiter) {
3107 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3108 PL_reg_leftiter = PL_reg_maxiter;
3110 if (PL_reg_leftiter-- == 0) {
3111 I32 size = (PL_reg_maxiter + 7)/8;
3112 if (PL_reg_poscache) {
3113 if (PL_reg_poscache_size < size) {
3114 Renew(PL_reg_poscache, size, char);
3115 PL_reg_poscache_size = size;
3117 Zero(PL_reg_poscache, size, char);
3120 PL_reg_poscache_size = size;
3121 Newz(29, PL_reg_poscache, size, char);
3124 PerlIO_printf(Perl_debug_log,
3125 "%sDetected a super-linear match, switching on caching%s...\n",
3126 PL_colors[4], PL_colors[5])
3129 if (PL_reg_leftiter < 0) {
3130 I32 o = locinput - PL_bostr, b;
3132 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3135 if (PL_reg_poscache[o] & (1<<b)) {
3137 PerlIO_printf(Perl_debug_log,
3138 "%*s already tried at this position...\n",
3139 REPORT_CODE_OFF+PL_regindent*2, "")
3143 PL_reg_poscache[o] |= (1<<b);
3147 /* Prefer next over scan for minimal matching. */
3150 PL_regcc = cc->oldcc;
3153 cp = regcppush(cc->parenfloor);
3155 if (regmatch(cc->next)) {
3157 sayYES; /* All done. */
3159 REGCP_UNWIND(lastcp);
3165 if (n >= cc->max) { /* Maximum greed exceeded? */
3166 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3167 && !(PL_reg_flags & RF_warned)) {
3168 PL_reg_flags |= RF_warned;
3169 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3170 "Complex regular subexpression recursion",
3177 PerlIO_printf(Perl_debug_log,
3178 "%*s trying longer...\n",
3179 REPORT_CODE_OFF+PL_regindent*2, "")
3181 /* Try scanning more and see if it helps. */
3182 PL_reginput = locinput;
3184 cc->lastloc = locinput;
3185 cp = regcppush(cc->parenfloor);
3187 if (regmatch(cc->scan)) {
3191 REGCP_UNWIND(lastcp);
3194 cc->lastloc = lastloc;
3198 /* Prefer scan over next for maximal matching. */
3200 if (n < cc->max) { /* More greed allowed? */
3201 cp = regcppush(cc->parenfloor);
3203 cc->lastloc = locinput;
3205 if (regmatch(cc->scan)) {
3209 REGCP_UNWIND(lastcp);
3210 regcppop(); /* Restore some previous $<digit>s? */
3211 PL_reginput = locinput;
3213 PerlIO_printf(Perl_debug_log,
3214 "%*s failed, try continuation...\n",
3215 REPORT_CODE_OFF+PL_regindent*2, "")
3218 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3219 && !(PL_reg_flags & RF_warned)) {
3220 PL_reg_flags |= RF_warned;
3221 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3222 "Complex regular subexpression recursion",
3226 /* Failed deeper matches of scan, so see if this one works. */
3227 PL_regcc = cc->oldcc;
3230 if (regmatch(cc->next))
3236 cc->lastloc = lastloc;
3241 next = scan + ARG(scan);
3244 inner = NEXTOPER(NEXTOPER(scan));
3247 inner = NEXTOPER(scan);
3251 if (OP(next) != c1) /* No choice. */
3252 next = inner; /* Avoid recursion. */
3254 I32 lastparen = *PL_reglastparen;
3256 re_unwind_branch_t *uw;
3258 /* Put unwinding data on stack */
3259 unwind1 = SSNEWt(1,re_unwind_branch_t);
3260 uw = SSPTRt(unwind1,re_unwind_branch_t);
3263 uw->type = ((c1 == BRANCH)
3265 : RE_UNWIND_BRANCHJ);
3266 uw->lastparen = lastparen;
3268 uw->locinput = locinput;
3269 uw->nextchr = nextchr;
3271 uw->regindent = ++PL_regindent;
3274 REGCP_SET(uw->lastcp);
3276 /* Now go into the first branch */
3289 /* We suppose that the next guy does not need
3290 backtracking: in particular, it is of constant length,
3291 and has no parenths to influence future backrefs. */
3292 ln = ARG1(scan); /* min to match */
3293 n = ARG2(scan); /* max to match */
3294 paren = scan->flags;
3296 if (paren > PL_regsize)
3298 if (paren > *PL_reglastparen)
3299 *PL_reglastparen = paren;
3301 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3303 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3304 PL_reginput = locinput;
3307 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3309 /* if we matched something zero-length we don't need to
3310 backtrack - capturing parens are already defined, so
3311 the caveat in the maximal case doesn't apply
3313 XXXX if ln == 0, we can redo this check first time
3314 through the following loop
3317 n = ln; /* don't backtrack */
3318 locinput = PL_reginput;
3319 if (HAS_TEXT(next) || JUMPABLE(next)) {
3320 regnode *text_node = next;
3322 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3324 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3326 if (PL_regkind[(U8)OP(text_node)] == REF) {
3328 n = ARG(text_node); /* which paren pair */
3329 ln = PL_regstartp[n];
3330 /* assume yes if we haven't seen CLOSEn */
3332 *PL_reglastparen < n ||
3339 c1 = *(PL_bostr + ln);
3341 else { c1 = (U8)*STRING(text_node); }
3342 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3344 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3345 c2 = PL_fold_locale[c1];
3354 /* This may be improved if l == 0. */
3355 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3356 /* If it could work, try it. */
3358 UCHARAT(PL_reginput) == c1 ||
3359 UCHARAT(PL_reginput) == c2)
3363 PL_regstartp[paren] =
3364 HOPc(PL_reginput, -l) - PL_bostr;
3365 PL_regendp[paren] = PL_reginput - PL_bostr;
3368 PL_regendp[paren] = -1;
3372 REGCP_UNWIND(lastcp);
3374 /* Couldn't or didn't -- move forward. */
3375 PL_reginput = locinput;
3376 if (regrepeat_hard(scan, 1, &l)) {
3378 locinput = PL_reginput;
3385 n = regrepeat_hard(scan, n, &l);
3386 /* if we matched something zero-length we don't need to
3387 backtrack, unless the minimum count is zero and we
3388 are capturing the result - in that case the capture
3389 being defined or not may affect later execution
3391 if (n != 0 && l == 0 && !(paren && ln == 0))
3392 ln = n; /* don't backtrack */
3393 locinput = PL_reginput;
3395 PerlIO_printf(Perl_debug_log,
3396 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3397 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3401 if (HAS_TEXT(next) || JUMPABLE(next)) {
3402 regnode *text_node = next;
3404 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3406 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3408 if (PL_regkind[(U8)OP(text_node)] == REF) {
3410 n = ARG(text_node); /* which paren pair */
3411 ln = PL_regstartp[n];
3412 /* assume yes if we haven't seen CLOSEn */
3414 *PL_reglastparen < n ||
3421 c1 = *(PL_bostr + ln);
3423 else { c1 = (U8)*STRING(text_node); }
3425 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3427 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3428 c2 = PL_fold_locale[c1];
3439 /* If it could work, try it. */
3441 UCHARAT(PL_reginput) == c1 ||
3442 UCHARAT(PL_reginput) == c2)
3445 PerlIO_printf(Perl_debug_log,
3446 "%*s trying tail with n=%"IVdf"...\n",
3447 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3451 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3452 PL_regendp[paren] = PL_reginput - PL_bostr;
3455 PL_regendp[paren] = -1;
3459 REGCP_UNWIND(lastcp);
3461 /* Couldn't or didn't -- back up. */
3463 locinput = HOPc(locinput, -l);
3464 PL_reginput = locinput;
3471 paren = scan->flags; /* Which paren to set */
3472 if (paren > PL_regsize)
3474 if (paren > *PL_reglastparen)
3475 *PL_reglastparen = paren;
3476 ln = ARG1(scan); /* min to match */
3477 n = ARG2(scan); /* max to match */
3478 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3482 ln = ARG1(scan); /* min to match */
3483 n = ARG2(scan); /* max to match */
3484 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3489 scan = NEXTOPER(scan);
3495 scan = NEXTOPER(scan);
3499 * Lookahead to avoid useless match attempts
3500 * when we know what character comes next.
3504 * Used to only do .*x and .*?x, but now it allows
3505 * for )'s, ('s and (?{ ... })'s to be in the way
3506 * of the quantifier and the EXACT-like node. -- japhy
3509 if (HAS_TEXT(next) || JUMPABLE(next)) {
3511 regnode *text_node = next;
3513 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3515 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3517 if (PL_regkind[(U8)OP(text_node)] == REF) {
3519 n = ARG(text_node); /* which paren pair */
3520 ln = PL_regstartp[n];
3521 /* assume yes if we haven't seen CLOSEn */
3523 *PL_reglastparen < n ||
3528 goto assume_ok_easy;
3530 s = (U8*)PL_bostr + ln;
3532 else { s = (U8*)STRING(text_node); }
3536 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3538 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3539 c2 = PL_fold_locale[c1];
3542 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3543 STRLEN ulen1, ulen2;
3544 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3545 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
3547 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3548 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3550 c1 = utf8_to_uvuni(tmpbuf1, 0);
3551 c2 = utf8_to_uvuni(tmpbuf2, 0);
3554 c2 = c1 = utf8_to_uvchr(s, NULL);
3562 PL_reginput = locinput;
3566 if (ln && regrepeat(scan, ln) < ln)
3568 locinput = PL_reginput;
3571 char *e; /* Should not check after this */
3572 char *old = locinput;
3574 if (n == REG_INFTY) {
3577 while (UTF8_IS_CONTINUATION(*(U8*)e))
3583 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3587 e = locinput + n - ln;
3593 /* Find place 'next' could work */
3596 while (locinput <= e &&
3597 UCHARAT(locinput) != c1)
3600 while (locinput <= e
3601 && UCHARAT(locinput) != c1
3602 && UCHARAT(locinput) != c2)
3605 count = locinput - old;
3612 utf8_to_uvchr((U8*)locinput, &len) != c1;
3617 for (count = 0; locinput <= e; count++) {
3618 UV c = utf8_to_uvchr((U8*)locinput, &len);
3619 if (c == c1 || c == c2)
3627 /* PL_reginput == old now */
3628 if (locinput != old) {
3629 ln = 1; /* Did some */
3630 if (regrepeat(scan, count) < count)
3633 /* PL_reginput == locinput now */
3634 TRYPAREN(paren, ln, locinput);
3635 PL_reginput = locinput; /* Could be reset... */
3636 REGCP_UNWIND(lastcp);
3637 /* Couldn't or didn't -- move forward. */
3640 locinput += UTF8SKIP(locinput);
3646 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3650 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3652 c = UCHARAT(PL_reginput);
3653 /* If it could work, try it. */
3654 if (c == c1 || c == c2)
3656 TRYPAREN(paren, n, PL_reginput);
3657 REGCP_UNWIND(lastcp);
3660 /* If it could work, try it. */
3661 else if (c1 == -1000)
3663 TRYPAREN(paren, n, PL_reginput);
3664 REGCP_UNWIND(lastcp);
3666 /* Couldn't or didn't -- move forward. */
3667 PL_reginput = locinput;
3668 if (regrepeat(scan, 1)) {
3670 locinput = PL_reginput;
3678 n = regrepeat(scan, n);
3679 locinput = PL_reginput;
3680 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3681 ((!PL_multiline && OP(next) != MEOL) ||
3682 OP(next) == SEOL || OP(next) == EOS))
3684 ln = n; /* why back off? */
3685 /* ...because $ and \Z can match before *and* after
3686 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3687 We should back off by one in this case. */
3688 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3697 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3699 c = UCHARAT(PL_reginput);
3701 /* If it could work, try it. */
3702 if (c1 == -1000 || c == c1 || c == c2)
3704 TRYPAREN(paren, n, PL_reginput);
3705 REGCP_UNWIND(lastcp);
3707 /* Couldn't or didn't -- back up. */
3709 PL_reginput = locinput = HOPc(locinput, -1);
3717 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3719 c = UCHARAT(PL_reginput);
3721 /* If it could work, try it. */
3722 if (c1 == -1000 || c == c1 || c == c2)
3724 TRYPAREN(paren, n, PL_reginput);
3725 REGCP_UNWIND(lastcp);
3727 /* Couldn't or didn't -- back up. */
3729 PL_reginput = locinput = HOPc(locinput, -1);
3736 if (PL_reg_call_cc) {
3737 re_cc_state *cur_call_cc = PL_reg_call_cc;
3738 CURCUR *cctmp = PL_regcc;
3739 regexp *re = PL_reg_re;
3740 CHECKPOINT cp, lastcp;
3742 cp = regcppush(0); /* Save *all* the positions. */
3744 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3746 PL_reginput = locinput; /* Make position available to
3748 cache_re(PL_reg_call_cc->re);
3749 PL_regcc = PL_reg_call_cc->cc;
3750 PL_reg_call_cc = PL_reg_call_cc->prev;
3751 if (regmatch(cur_call_cc->node)) {
3752 PL_reg_call_cc = cur_call_cc;
3756 REGCP_UNWIND(lastcp);
3758 PL_reg_call_cc = cur_call_cc;
3764 PerlIO_printf(Perl_debug_log,
3765 "%*s continuation failed...\n",
3766 REPORT_CODE_OFF+PL_regindent*2, "")
3770 if (locinput < PL_regtill) {
3771 DEBUG_r(PerlIO_printf(Perl_debug_log,
3772 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3774 (long)(locinput - PL_reg_starttry),
3775 (long)(PL_regtill - PL_reg_starttry),
3777 sayNO_FINAL; /* Cannot match: too short. */
3779 PL_reginput = locinput; /* put where regtry can find it */
3780 sayYES_FINAL; /* Success! */
3782 PL_reginput = locinput; /* put where regtry can find it */
3783 sayYES_LOUD; /* Success! */
3786 PL_reginput = locinput;
3791 s = HOPBACKc(locinput, scan->flags);
3797 PL_reginput = locinput;
3802 s = HOPBACKc(locinput, scan->flags);
3808 PL_reginput = locinput;
3811 inner = NEXTOPER(NEXTOPER(scan));
3812 if (regmatch(inner) != n) {
3827 if (OP(scan) == SUSPEND) {
3828 locinput = PL_reginput;
3829 nextchr = UCHARAT(locinput);
3834 next = scan + ARG(scan);
3839 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3840 PTR2UV(scan), OP(scan));
3841 Perl_croak(aTHX_ "regexp memory corruption");
3848 * We get here only if there's trouble -- normally "case END" is
3849 * the terminating point.
3851 Perl_croak(aTHX_ "corrupted regexp pointers");
3857 PerlIO_printf(Perl_debug_log,
3858 "%*s %scould match...%s\n",
3859 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3863 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3864 PL_colors[4],PL_colors[5]));
3870 #if 0 /* Breaks $^R */
3878 PerlIO_printf(Perl_debug_log,
3879 "%*s %sfailed...%s\n",
3880 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3886 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3889 case RE_UNWIND_BRANCH:
3890 case RE_UNWIND_BRANCHJ:
3892 re_unwind_branch_t *uwb = &(uw->branch);
3893 I32 lastparen = uwb->lastparen;
3895 REGCP_UNWIND(uwb->lastcp);
3896 for (n = *PL_reglastparen; n > lastparen; n--)
3898 *PL_reglastparen = n;
3899 scan = next = uwb->next;
3901 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3902 ? BRANCH : BRANCHJ) ) { /* Failure */
3909 /* Have more choice yet. Reuse the same uwb. */
3911 if ((n = (uwb->type == RE_UNWIND_BRANCH
3912 ? NEXT_OFF(next) : ARG(next))))
3915 next = NULL; /* XXXX Needn't unwinding in this case... */
3917 next = NEXTOPER(scan);
3918 if (uwb->type == RE_UNWIND_BRANCHJ)
3919 next = NEXTOPER(next);
3920 locinput = uwb->locinput;
3921 nextchr = uwb->nextchr;
3923 PL_regindent = uwb->regindent;
3930 Perl_croak(aTHX_ "regexp unwind memory corruption");
3941 - regrepeat - repeatedly match something simple, report how many
3944 * [This routine now assumes that it will only match on things of length 1.
3945 * That was true before, but now we assume scan - reginput is the count,
3946 * rather than incrementing count on every character. [Er, except utf8.]]
3949 S_regrepeat(pTHX_ regnode *p, I32 max)
3951 register char *scan;
3953 register char *loceol = PL_regeol;
3954 register I32 hardcount = 0;
3955 register bool do_utf8 = PL_reg_match_utf8;
3958 if (max != REG_INFTY && max < loceol - scan)
3959 loceol = scan + max;
3964 while (scan < loceol && hardcount < max && *scan != '\n') {
3965 scan += UTF8SKIP(scan);
3969 while (scan < loceol && *scan != '\n')
3976 while (scan < loceol && hardcount < max) {
3977 scan += UTF8SKIP(scan);
3987 case EXACT: /* length of string is 1 */
3989 while (scan < loceol && UCHARAT(scan) == c)
3992 case EXACTF: /* length of string is 1 */
3994 while (scan < loceol &&
3995 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3998 case EXACTFL: /* length of string is 1 */
3999 PL_reg_flags |= RF_tainted;
4001 while (scan < loceol &&
4002 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4008 while (hardcount < max && scan < loceol &&
4009 reginclass(p, (U8*)scan, 0, do_utf8)) {
4010 scan += UTF8SKIP(scan);
4014 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4021 LOAD_UTF8_CHARCLASS(alnum,"a");
4022 while (hardcount < max && scan < loceol &&
4023 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4024 scan += UTF8SKIP(scan);
4028 while (scan < loceol && isALNUM(*scan))
4033 PL_reg_flags |= RF_tainted;
4036 while (hardcount < max && scan < loceol &&
4037 isALNUM_LC_utf8((U8*)scan)) {
4038 scan += UTF8SKIP(scan);
4042 while (scan < loceol && isALNUM_LC(*scan))
4049 LOAD_UTF8_CHARCLASS(alnum,"a");
4050 while (hardcount < max && scan < loceol &&
4051 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4052 scan += UTF8SKIP(scan);
4056 while (scan < loceol && !isALNUM(*scan))
4061 PL_reg_flags |= RF_tainted;
4064 while (hardcount < max && scan < loceol &&
4065 !isALNUM_LC_utf8((U8*)scan)) {
4066 scan += UTF8SKIP(scan);
4070 while (scan < loceol && !isALNUM_LC(*scan))
4077 LOAD_UTF8_CHARCLASS(space," ");
4078 while (hardcount < max && scan < loceol &&
4080 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4081 scan += UTF8SKIP(scan);
4085 while (scan < loceol && isSPACE(*scan))
4090 PL_reg_flags |= RF_tainted;
4093 while (hardcount < max && scan < loceol &&
4094 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4095 scan += UTF8SKIP(scan);
4099 while (scan < loceol && isSPACE_LC(*scan))
4106 LOAD_UTF8_CHARCLASS(space," ");
4107 while (hardcount < max && scan < loceol &&
4109 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4110 scan += UTF8SKIP(scan);
4114 while (scan < loceol && !isSPACE(*scan))
4119 PL_reg_flags |= RF_tainted;
4122 while (hardcount < max && scan < loceol &&
4123 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4124 scan += UTF8SKIP(scan);
4128 while (scan < loceol && !isSPACE_LC(*scan))
4135 LOAD_UTF8_CHARCLASS(digit,"0");
4136 while (hardcount < max && scan < loceol &&
4137 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4138 scan += UTF8SKIP(scan);
4142 while (scan < loceol && isDIGIT(*scan))
4149 LOAD_UTF8_CHARCLASS(digit,"0");
4150 while (hardcount < max && scan < loceol &&
4151 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4152 scan += UTF8SKIP(scan);
4156 while (scan < loceol && !isDIGIT(*scan))
4160 default: /* Called on something of 0 width. */
4161 break; /* So match right here or not at all. */
4167 c = scan - PL_reginput;
4172 SV *prop = sv_newmortal();
4175 PerlIO_printf(Perl_debug_log,
4176 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4177 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
4184 - regrepeat_hard - repeatedly match something, report total lenth and length
4186 * The repeater is supposed to have constant length.
4190 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4192 register char *scan = Nullch;
4193 register char *start;
4194 register char *loceol = PL_regeol;
4196 I32 count = 0, res = 1;
4201 start = PL_reginput;
4202 if (PL_reg_match_utf8) {
4203 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4206 while (start < PL_reginput) {
4208 start += UTF8SKIP(start);
4219 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4221 *lp = l = PL_reginput - start;
4222 if (max != REG_INFTY && l*max < loceol - scan)
4223 loceol = scan + l*max;
4236 - regclass_swash - prepare the utf8 swash
4240 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4246 if (PL_regdata && PL_regdata->count) {
4249 if (PL_regdata->what[n] == 's') {
4250 SV *rv = (SV*)PL_regdata->data[n];
4251 AV *av = (AV*)SvRV((SV*)rv);
4254 /* See the end of regcomp.c:S_reglass() for
4255 * documentation of these array elements. */
4257 si = *av_fetch(av, 0, FALSE);
4258 a = av_fetch(av, 1, FALSE);
4259 b = av_fetch(av, 2, FALSE);
4263 else if (si && doinit) {
4264 sw = swash_init("utf8", "", si, 1, 0);
4265 (void)av_store(av, 1, sw);
4281 - reginclass - determine if a character falls into a character class
4283 The n is the ANYOF regnode, the p is the target string, lenp
4284 is pointer to the maximum length of how far to go in the p
4285 (if the lenp is zero, UTF8SKIP(p) is used),
4286 do_utf8 tells whether the target string is in UTF-8.
4291 S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
4293 char flags = ANYOF_FLAGS(n);
4299 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
4301 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4302 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4305 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4306 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4309 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4313 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4316 if (swash_fetch(sw, p, do_utf8))
4318 else if (flags & ANYOF_FOLD) {
4319 if (!match && lenp && av) {
4322 for (i = 0; i <= av_len(av); i++) {
4323 SV* sv = *av_fetch(av, i, FALSE);
4325 char *s = SvPV(sv, len);
4327 if (len <= plen && memEQ(s, (char*)p, len)) {
4335 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4338 to_utf8_fold(p, tmpbuf, &tmplen);
4339 if (swash_fetch(sw, tmpbuf, do_utf8))
4345 if (match && lenp && *lenp == 0)
4346 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4348 if (!match && c < 256) {
4349 if (ANYOF_BITMAP_TEST(n, c))
4351 else if (flags & ANYOF_FOLD) {
4354 if (flags & ANYOF_LOCALE) {
4355 PL_reg_flags |= RF_tainted;
4356 f = PL_fold_locale[c];
4360 if (f != c && ANYOF_BITMAP_TEST(n, f))
4364 if (!match && (flags & ANYOF_CLASS)) {
4365 PL_reg_flags |= RF_tainted;
4367 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4368 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4369 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4370 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4371 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4372 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4373 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4374 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4375 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4376 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4377 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4378 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4379 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4380 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4381 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4382 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4383 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4384 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4385 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4386 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4387 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4388 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4389 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4390 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4391 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4392 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4393 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4394 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4395 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4396 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4397 ) /* How's that for a conditional? */
4404 return (flags & ANYOF_INVERT) ? !match : match;
4408 S_reghop(pTHX_ U8 *s, I32 off)
4410 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4414 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4417 while (off-- && s < lim) {
4418 /* XXX could check well-formedness here */
4426 if (UTF8_IS_CONTINUED(*s)) {
4427 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4430 /* XXX could check well-formedness here */
4438 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4440 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4444 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4447 while (off-- && s < lim) {
4448 /* XXX could check well-formedness here */
4458 if (UTF8_IS_CONTINUED(*s)) {
4459 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4462 /* XXX could check well-formedness here */
4474 restore_pos(pTHX_ void *arg)
4476 if (PL_reg_eval_set) {
4477 if (PL_reg_oldsaved) {
4478 PL_reg_re->subbeg = PL_reg_oldsaved;
4479 PL_reg_re->sublen = PL_reg_oldsavedlen;
4480 RX_MATCH_COPIED_on(PL_reg_re);
4482 PL_reg_magic->mg_len = PL_reg_oldpos;
4483 PL_reg_eval_set = 0;
4484 PL_curpm = PL_reg_oldcurpm;
4489 S_to_utf8_substr(pTHX_ register regexp *prog)
4492 if (prog->float_substr && !prog->float_utf8) {
4493 prog->float_utf8 = sv = NEWSV(117, 0);
4494 SvSetMagicSV(sv, prog->float_substr);
4495 sv_utf8_upgrade(sv);
4496 if (SvTAIL(prog->float_substr))
4498 if (prog->float_substr == prog->check_substr)
4499 prog->check_utf8 = sv;
4501 if (prog->anchored_substr && !prog->anchored_utf8) {
4502 prog->anchored_utf8 = sv = NEWSV(118, 0);
4503 SvSetMagicSV(sv, prog->anchored_substr);
4504 sv_utf8_upgrade(sv);
4505 if (SvTAIL(prog->anchored_substr))
4507 if (prog->anchored_substr == prog->check_substr)
4508 prog->check_utf8 = sv;
4513 S_to_byte_substr(pTHX_ register regexp *prog)
4516 if (prog->float_utf8 && !prog->float_substr) {
4517 prog->float_substr = sv = NEWSV(117, 0);
4518 SvSetMagicSV(sv, prog->float_utf8);
4519 if (sv_utf8_downgrade(sv, TRUE)) {
4520 if (SvTAIL(prog->float_utf8))
4524 prog->float_substr = sv = &PL_sv_undef;
4526 if (prog->float_utf8 == prog->check_utf8)
4527 prog->check_substr = sv;
4529 if (prog->anchored_utf8 && !prog->anchored_substr) {
4530 prog->anchored_substr = sv = NEWSV(118, 0);
4531 SvSetMagicSV(sv, prog->anchored_utf8);
4532 if (sv_utf8_downgrade(sv, TRUE)) {
4533 if (SvTAIL(prog->anchored_utf8))
4537 prog->anchored_substr = sv = &PL_sv_undef;
4539 if (prog->anchored_utf8 == prog->check_utf8)
4540 prog->check_substr = sv;