5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2002, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
85 #define RF_tainted 1 /* tainted information used? */
86 #define RF_warned 2 /* warned about big count? */
87 #define RF_evaled 4 /* Did an EVAL with setting? */
88 #define RF_utf8 8 /* String contains multibyte chars? */
90 #define UTF (PL_reg_flags & RF_utf8)
92 #define RS_init 1 /* eval environment created */
93 #define RS_set 2 /* replsv value is set */
103 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
106 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
107 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
108 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
109 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
110 #define HOPc(pos,off) ((char*)HOP(pos,off))
111 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
113 #define HOPBACK(pos, off) ( \
114 (PL_reg_match_utf8) \
115 ? reghopmaybe((U8*)pos, -off) \
116 : (pos - off >= PL_bostr) \
120 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
122 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
123 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
124 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
125 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
126 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
127 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
129 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
131 /* for use after a quantifier and before an EXACT-like node -- japhy */
132 #define JUMPABLE(rn) ( \
133 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
134 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
135 OP(rn) == PLUS || OP(rn) == MINMOD || \
136 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
139 #define HAS_TEXT(rn) ( \
140 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
144 Search for mandatory following text node; for lookahead, the text must
145 follow but for lookbehind (rn->flags != 0) we skip to the next step.
147 #define FIND_NEXT_IMPT(rn) STMT_START { \
148 while (JUMPABLE(rn)) \
149 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
150 rn = NEXTOPER(NEXTOPER(rn)); \
151 else if (OP(rn) == PLUS) \
153 else if (OP(rn) == IFMATCH) \
154 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
155 else rn += NEXT_OFF(rn); \
158 static void restore_pos(pTHX_ void *arg);
161 S_regcppush(pTHX_ I32 parenfloor)
163 int retval = PL_savestack_ix;
164 #define REGCP_PAREN_ELEMS 4
165 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
168 if (paren_elems_to_push < 0)
169 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
171 #define REGCP_OTHER_ELEMS 6
172 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
173 for (p = PL_regsize; p > parenfloor; p--) {
174 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
175 SSPUSHINT(PL_regendp[p]);
176 SSPUSHINT(PL_regstartp[p]);
177 SSPUSHPTR(PL_reg_start_tmp[p]);
180 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
181 SSPUSHINT(PL_regsize);
182 SSPUSHINT(*PL_reglastparen);
183 SSPUSHINT(*PL_reglastcloseparen);
184 SSPUSHPTR(PL_reginput);
185 #define REGCP_FRAME_ELEMS 2
186 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
187 * are needed for the regexp context stack bookkeeping. */
188 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
189 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
194 /* These are needed since we do not localize EVAL nodes: */
195 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
196 " Setting an EVAL scope, savestack=%"IVdf"\n", \
197 (IV)PL_savestack_ix)); cp = PL_savestack_ix
199 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
200 PerlIO_printf(Perl_debug_log, \
201 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
202 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
212 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
214 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
215 i = SSPOPINT; /* Parentheses elements to pop. */
216 input = (char *) SSPOPPTR;
217 *PL_reglastcloseparen = SSPOPINT;
218 *PL_reglastparen = SSPOPINT;
219 PL_regsize = SSPOPINT;
221 /* Now restore the parentheses context. */
222 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
223 i > 0; i -= REGCP_PAREN_ELEMS) {
224 paren = (U32)SSPOPINT;
225 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
226 PL_regstartp[paren] = SSPOPINT;
228 if (paren <= *PL_reglastparen)
229 PL_regendp[paren] = tmps;
231 PerlIO_printf(Perl_debug_log,
232 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
233 (UV)paren, (IV)PL_regstartp[paren],
234 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
235 (IV)PL_regendp[paren],
236 (paren > *PL_reglastparen ? "(no)" : ""));
240 if (*PL_reglastparen + 1 <= PL_regnpar) {
241 PerlIO_printf(Perl_debug_log,
242 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
243 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
247 /* It would seem that the similar code in regtry()
248 * already takes care of this, and in fact it is in
249 * a better location to since this code can #if 0-ed out
250 * but the code in regtry() is needed or otherwise tests
251 * requiring null fields (pat.t#187 and split.t#{13,14}
252 * (as of patchlevel 7877) will fail. Then again,
253 * this code seems to be necessary or otherwise
254 * building DynaLoader will fail:
255 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
257 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
258 if (paren > PL_regsize)
259 PL_regstartp[paren] = -1;
260 PL_regendp[paren] = -1;
267 S_regcp_set_to(pTHX_ I32 ss)
269 I32 tmp = PL_savestack_ix;
271 PL_savestack_ix = ss;
273 PL_savestack_ix = tmp;
277 typedef struct re_cc_state
281 struct re_cc_state *prev;
286 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
288 #define TRYPAREN(paren, n, input) { \
291 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
292 PL_regendp[paren] = input - PL_bostr; \
295 PL_regendp[paren] = -1; \
297 if (regmatch(next)) \
300 PL_regendp[paren] = -1; \
305 * pregexec and friends
309 - pregexec - match a regexp against a string
312 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
313 char *strbeg, I32 minend, SV *screamer, U32 nosave)
314 /* strend: pointer to null at end of string */
315 /* strbeg: real beginning of string */
316 /* minend: end of match must be >=minend after stringarg. */
317 /* nosave: For optimizations. */
320 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
321 nosave ? 0 : REXEC_COPY_STR);
325 S_cache_re(pTHX_ regexp *prog)
327 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
329 PL_regprogram = prog->program;
331 PL_regnpar = prog->nparens;
332 PL_regdata = prog->data;
337 * Need to implement the following flags for reg_anch:
339 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
341 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
342 * INTUIT_AUTORITATIVE_ML
343 * INTUIT_ONCE_NOML - Intuit can match in one location only.
346 * Another flag for this function: SECOND_TIME (so that float substrs
347 * with giant delta may be not rechecked).
350 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
352 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
353 Otherwise, only SvCUR(sv) is used to get strbeg. */
355 /* XXXX We assume that strpos is strbeg unless sv. */
357 /* XXXX Some places assume that there is a fixed substring.
358 An update may be needed if optimizer marks as "INTUITable"
359 RExen without fixed substrings. Similarly, it is assumed that
360 lengths of all the strings are no more than minlen, thus they
361 cannot come from lookahead.
362 (Or minlen should take into account lookahead.) */
364 /* A failure to find a constant substring means that there is no need to make
365 an expensive call to REx engine, thus we celebrate a failure. Similarly,
366 finding a substring too deep into the string means that less calls to
367 regtry() should be needed.
369 REx compiler's optimizer found 4 possible hints:
370 a) Anchored substring;
372 c) Whether we are anchored (beginning-of-line or \G);
373 d) First node (of those at offset 0) which may distingush positions;
374 We use a)b)d) and multiline-part of c), and try to find a position in the
375 string which does not contradict any of them.
378 /* Most of decisions we do here should have been done at compile time.
379 The nodes of the REx which we used for the search should have been
380 deleted from the finite automaton. */
383 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
384 char *strend, U32 flags, re_scream_pos_data *data)
386 register I32 start_shift = 0;
387 /* Should be nonnegative! */
388 register I32 end_shift = 0;
394 register char *other_last = Nullch; /* other substr checked before this */
395 char *check_at = Nullch; /* check substr found at this pos */
397 char *i_strpos = strpos;
398 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
401 if (prog->reganch & ROPT_UTF8) {
402 DEBUG_r(PerlIO_printf(Perl_debug_log,
403 "UTF-8 regex...\n"));
404 PL_reg_flags |= RF_utf8;
408 char *s = PL_reg_match_utf8 ?
409 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
411 int len = PL_reg_match_utf8 ?
412 strlen(s) : strend - strpos;
415 if (PL_reg_match_utf8)
416 DEBUG_r(PerlIO_printf(Perl_debug_log,
417 "UTF-8 target...\n"));
418 PerlIO_printf(Perl_debug_log,
419 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
420 PL_colors[4],PL_colors[5],PL_colors[0],
423 (strlen(prog->precomp) > 60 ? "..." : ""),
425 (int)(len > 60 ? 60 : len),
427 (len > 60 ? "..." : "")
431 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
432 DEBUG_r(PerlIO_printf(Perl_debug_log,
433 "String too short... [re_intuit_start]\n"));
436 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
438 check = prog->check_substr;
439 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
440 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
441 || ( (prog->reganch & ROPT_ANCH_BOL)
442 && !PL_multiline ) ); /* Check after \n? */
445 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
446 | ROPT_IMPLICIT)) /* not a real BOL */
447 /* SvCUR is not set on references: SvRV and SvPVX overlap */
449 && (strpos != strbeg)) {
450 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
453 if (prog->check_offset_min == prog->check_offset_max &&
454 !(prog->reganch & ROPT_CANY_SEEN)) {
455 /* Substring at constant offset from beg-of-str... */
458 s = HOP3c(strpos, prog->check_offset_min, strend);
460 slen = SvCUR(check); /* >= 1 */
462 if ( strend - s > slen || strend - s < slen - 1
463 || (strend - s == slen && strend[-1] != '\n')) {
464 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
467 /* Now should match s[0..slen-2] */
469 if (slen && (*SvPVX(check) != *s
471 && memNE(SvPVX(check), s, slen)))) {
473 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
477 else if (*SvPVX(check) != *s
478 || ((slen = SvCUR(check)) > 1
479 && memNE(SvPVX(check), s, slen)))
481 goto success_at_start;
484 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
486 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
487 end_shift = prog->minlen - start_shift -
488 CHR_SVLEN(check) + (SvTAIL(check) != 0);
490 I32 end = prog->check_offset_max + CHR_SVLEN(check)
491 - (SvTAIL(check) != 0);
492 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
494 if (end_shift < eshift)
498 else { /* Can match at random position */
501 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
502 /* Should be nonnegative! */
503 end_shift = prog->minlen - start_shift -
504 CHR_SVLEN(check) + (SvTAIL(check) != 0);
507 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
509 Perl_croak(aTHX_ "panic: end_shift");
513 /* Find a possible match in the region s..strend by looking for
514 the "check" substring in the region corrected by start/end_shift. */
515 if (flags & REXEC_SCREAM) {
516 I32 p = -1; /* Internal iterator of scream. */
517 I32 *pp = data ? data->scream_pos : &p;
519 if (PL_screamfirst[BmRARE(check)] >= 0
520 || ( BmRARE(check) == '\n'
521 && (BmPREVIOUS(check) == SvCUR(check) - 1)
523 s = screaminstr(sv, check,
524 start_shift + (s - strbeg), end_shift, pp, 0);
528 *data->scream_olds = s;
530 else if (prog->reganch & ROPT_CANY_SEEN)
531 s = fbm_instr((U8*)(s + start_shift),
532 (U8*)(strend - end_shift),
533 check, PL_multiline ? FBMrf_MULTILINE : 0);
535 s = fbm_instr(HOP3(s, start_shift, strend),
536 HOP3(strend, -end_shift, strbeg),
537 check, PL_multiline ? FBMrf_MULTILINE : 0);
539 /* Update the count-of-usability, remove useless subpatterns,
542 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
543 (s ? "Found" : "Did not find"),
544 ((check == prog->anchored_substr) ? "anchored" : "floating"),
546 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
548 PL_colors[1], (SvTAIL(check) ? "$" : ""),
549 (s ? " at offset " : "...\n") ) );
556 /* Finish the diagnostic message */
557 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
559 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
560 Start with the other substr.
561 XXXX no SCREAM optimization yet - and a very coarse implementation
562 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
563 *always* match. Probably should be marked during compile...
564 Probably it is right to do no SCREAM here...
567 if (prog->float_substr && prog->anchored_substr) {
568 /* Take into account the "other" substring. */
569 /* XXXX May be hopelessly wrong for UTF... */
572 if (check == prog->float_substr) {
575 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
578 t = s - prog->check_offset_max;
579 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
580 && (!(prog->reganch & ROPT_UTF8)
581 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
586 t = HOP3c(t, prog->anchored_offset, strend);
587 if (t < other_last) /* These positions already checked */
589 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
592 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
593 /* On end-of-str: see comment below. */
594 s = fbm_instr((unsigned char*)t,
595 HOP3(HOP3(last1, prog->anchored_offset, strend)
596 + SvCUR(prog->anchored_substr),
597 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
598 prog->anchored_substr,
599 PL_multiline ? FBMrf_MULTILINE : 0);
600 DEBUG_r(PerlIO_printf(Perl_debug_log,
601 "%s anchored substr `%s%.*s%s'%s",
602 (s ? "Found" : "Contradicts"),
604 (int)(SvCUR(prog->anchored_substr)
605 - (SvTAIL(prog->anchored_substr)!=0)),
606 SvPVX(prog->anchored_substr),
607 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
609 if (last1 >= last2) {
610 DEBUG_r(PerlIO_printf(Perl_debug_log,
611 ", giving up...\n"));
614 DEBUG_r(PerlIO_printf(Perl_debug_log,
615 ", trying floating at offset %ld...\n",
616 (long)(HOP3c(s1, 1, strend) - i_strpos)));
617 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
618 s = HOP3c(last, 1, strend);
622 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
623 (long)(s - i_strpos)));
624 t = HOP3c(s, -prog->anchored_offset, strbeg);
625 other_last = HOP3c(s, 1, strend);
633 else { /* Take into account the floating substring. */
637 t = HOP3c(s, -start_shift, strbeg);
639 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
640 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
641 last = HOP3c(t, prog->float_max_offset, strend);
642 s = HOP3c(t, prog->float_min_offset, strend);
645 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
646 /* fbm_instr() takes into account exact value of end-of-str
647 if the check is SvTAIL(ed). Since false positives are OK,
648 and end-of-str is not later than strend we are OK. */
649 s = fbm_instr((unsigned char*)s,
650 (unsigned char*)last + SvCUR(prog->float_substr)
651 - (SvTAIL(prog->float_substr)!=0),
652 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
653 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
654 (s ? "Found" : "Contradicts"),
656 (int)(SvCUR(prog->float_substr)
657 - (SvTAIL(prog->float_substr)!=0)),
658 SvPVX(prog->float_substr),
659 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
662 DEBUG_r(PerlIO_printf(Perl_debug_log,
663 ", giving up...\n"));
666 DEBUG_r(PerlIO_printf(Perl_debug_log,
667 ", trying anchored starting at offset %ld...\n",
668 (long)(s1 + 1 - i_strpos)));
670 s = HOP3c(t, 1, strend);
674 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
675 (long)(s - i_strpos)));
676 other_last = s; /* Fix this later. --Hugo */
685 t = s - prog->check_offset_max;
686 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
687 && (!(prog->reganch & ROPT_UTF8)
688 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
690 /* Fixed substring is found far enough so that the match
691 cannot start at strpos. */
693 if (ml_anch && t[-1] != '\n') {
694 /* Eventually fbm_*() should handle this, but often
695 anchored_offset is not 0, so this check will not be wasted. */
696 /* XXXX In the code below we prefer to look for "^" even in
697 presence of anchored substrings. And we search even
698 beyond the found float position. These pessimizations
699 are historical artefacts only. */
701 while (t < strend - prog->minlen) {
703 if (t < check_at - prog->check_offset_min) {
704 if (prog->anchored_substr) {
705 /* Since we moved from the found position,
706 we definitely contradict the found anchored
707 substr. Due to the above check we do not
708 contradict "check" substr.
709 Thus we can arrive here only if check substr
710 is float. Redo checking for "other"=="fixed".
713 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
714 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
715 goto do_other_anchored;
717 /* We don't contradict the found floating substring. */
718 /* XXXX Why not check for STCLASS? */
720 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
721 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
724 /* Position contradicts check-string */
725 /* XXXX probably better to look for check-string
726 than for "\n", so one should lower the limit for t? */
727 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
728 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
729 other_last = strpos = s = t + 1;
734 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
735 PL_colors[0],PL_colors[1]));
739 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
740 PL_colors[0],PL_colors[1]));
744 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
747 /* The found string does not prohibit matching at strpos,
748 - no optimization of calling REx engine can be performed,
749 unless it was an MBOL and we are not after MBOL,
750 or a future STCLASS check will fail this. */
752 /* Even in this situation we may use MBOL flag if strpos is offset
753 wrt the start of the string. */
754 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
755 && (strpos != strbeg) && strpos[-1] != '\n'
756 /* May be due to an implicit anchor of m{.*foo} */
757 && !(prog->reganch & ROPT_IMPLICIT))
762 DEBUG_r( if (ml_anch)
763 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
764 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
767 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
768 && prog->check_substr /* Could be deleted already */
769 && --BmUSEFUL(prog->check_substr) < 0
770 && prog->check_substr == prog->float_substr)
772 /* If flags & SOMETHING - do not do it many times on the same match */
773 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
774 SvREFCNT_dec(prog->check_substr);
775 prog->check_substr = Nullsv; /* disable */
776 prog->float_substr = Nullsv; /* clear */
777 check = Nullsv; /* abort */
779 /* XXXX This is a remnant of the old implementation. It
780 looks wasteful, since now INTUIT can use many
782 prog->reganch &= ~RE_USE_INTUIT;
789 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
790 if (prog->regstclass) {
791 /* minlen == 0 is possible if regstclass is \b or \B,
792 and the fixed substr is ''$.
793 Since minlen is already taken into account, s+1 is before strend;
794 accidentally, minlen >= 1 guaranties no false positives at s + 1
795 even for \b or \B. But (minlen? 1 : 0) below assumes that
796 regstclass does not come from lookahead... */
797 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
798 This leaves EXACTF only, which is dealt with in find_byclass(). */
799 U8* str = (U8*)STRING(prog->regstclass);
800 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
801 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
803 char *endpos = (prog->anchored_substr || ml_anch)
804 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
805 : (prog->float_substr
806 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
809 char *startpos = strbeg;
812 if (prog->reganch & ROPT_UTF8) {
813 PL_regdata = prog->data;
816 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
821 if (endpos == strend) {
822 DEBUG_r( PerlIO_printf(Perl_debug_log,
823 "Could not match STCLASS...\n") );
826 DEBUG_r( PerlIO_printf(Perl_debug_log,
827 "This position contradicts STCLASS...\n") );
828 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
830 /* Contradict one of substrings */
831 if (prog->anchored_substr) {
832 if (prog->anchored_substr == check) {
833 DEBUG_r( what = "anchored" );
835 s = HOP3c(t, 1, strend);
836 if (s + start_shift + end_shift > strend) {
837 /* XXXX Should be taken into account earlier? */
838 DEBUG_r( PerlIO_printf(Perl_debug_log,
839 "Could not match STCLASS...\n") );
844 DEBUG_r( PerlIO_printf(Perl_debug_log,
845 "Looking for %s substr starting at offset %ld...\n",
846 what, (long)(s + start_shift - i_strpos)) );
849 /* Have both, check_string is floating */
850 if (t + start_shift >= check_at) /* Contradicts floating=check */
851 goto retry_floating_check;
852 /* Recheck anchored substring, but not floating... */
856 DEBUG_r( PerlIO_printf(Perl_debug_log,
857 "Looking for anchored substr starting at offset %ld...\n",
858 (long)(other_last - i_strpos)) );
859 goto do_other_anchored;
861 /* Another way we could have checked stclass at the
862 current position only: */
867 DEBUG_r( PerlIO_printf(Perl_debug_log,
868 "Looking for /%s^%s/m starting at offset %ld...\n",
869 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
872 if (!prog->float_substr) /* Could have been deleted */
874 /* Check is floating subtring. */
875 retry_floating_check:
876 t = check_at - start_shift;
877 DEBUG_r( what = "floating" );
878 goto hop_and_restart;
881 DEBUG_r(PerlIO_printf(Perl_debug_log,
882 "By STCLASS: moving %ld --> %ld\n",
883 (long)(t - i_strpos), (long)(s - i_strpos))
887 DEBUG_r(PerlIO_printf(Perl_debug_log,
888 "Does not contradict STCLASS...\n");
893 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
894 PL_colors[4], (check ? "Guessed" : "Giving up"),
895 PL_colors[5], (long)(s - i_strpos)) );
898 fail_finish: /* Substring not found */
899 if (prog->check_substr) /* could be removed already */
900 BmUSEFUL(prog->check_substr) += 5; /* hooray */
902 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
903 PL_colors[4],PL_colors[5]));
907 /* We know what class REx starts with. Try to find this position... */
909 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
911 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
917 register I32 tmp = 1; /* Scratch variable? */
918 register bool do_utf8 = PL_reg_match_utf8;
920 /* We know what class it must start with. */
924 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
926 if (reginclass(c, (U8*)s, do_utf8) ||
927 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
928 /* The assignment of 2 is intentional:
929 * for the sharp s, the skip is 2. */
930 (skip = SHARP_S_SKIP)
932 if (tmp && (norun || regtry(prog, s)))
944 if (tmp && (norun || regtry(prog, s)))
956 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
957 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
959 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
960 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
962 c1 = utf8_to_uvuni(tmpbuf1, 0);
963 c2 = utf8_to_uvuni(tmpbuf2, 0);
974 c2 = PL_fold_locale[c1];
976 e = do_utf8 ? s + ln : strend - ln;
979 e = s; /* Due to minlen logic of intuit() */
981 /* The idea in the EXACTF* cases is to first find the
982 * first character of the EXACTF* node and then, if
983 * necessary, case-insensitively compare the full
984 * text of the node. The c1 and c2 are the first
985 * characters (though in Unicode it gets a bit
986 * more complicated because there are more cases
987 * than just upper and lower: one is really supposed
988 * to use the so-called folding case for case-insensitive
989 * matching (called "loose matching" in Unicode). */
993 U8 tmpbuf [UTF8_MAXLEN+1];
994 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
999 c = utf8_to_uvchr((U8*)s, &len);
1002 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1003 m, (char **)0, ln, UTF))
1004 && (norun || regtry(prog, s)) )
1007 uvchr_to_utf8(tmpbuf, c);
1008 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1010 && (f == c1 || f == c2)
1011 && (ln == foldlen ||
1012 !ibcmp_utf8((char *) foldbuf,
1013 (char **)0, foldlen, do_utf8,
1015 (char **)0, ln, UTF))
1016 && (norun || regtry(prog, s)) )
1024 c = utf8_to_uvchr((U8*)s, &len);
1026 /* Handle some of the three Greek sigmas cases.
1027 * Note that not all the possible combinations
1028 * are handled here: some of them are handled
1029 * by the standard folding rules, and some of
1030 * them (the character class or ANYOF cases)
1031 * are handled during compiletime in
1032 * regexec.c:S_regclass(). */
1033 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1034 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1035 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1037 if ( (c == c1 || c == c2)
1039 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1040 m, (char **)0, ln, UTF))
1041 && (norun || regtry(prog, s)) )
1044 uvchr_to_utf8(tmpbuf, c);
1045 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1047 && (f == c1 || f == c2)
1048 && (ln == foldlen ||
1049 !ibcmp_utf8((char *) foldbuf,
1050 (char **)0, foldlen, do_utf8,
1052 (char **)0, ln, UTF))
1053 && (norun || regtry(prog, s)) )
1064 && (ln == 1 || !(OP(c) == EXACTF
1066 : ibcmp_locale(s, m, ln)))
1067 && (norun || regtry(prog, s)) )
1073 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1074 && (ln == 1 || !(OP(c) == EXACTF
1076 : ibcmp_locale(s, m, ln)))
1077 && (norun || regtry(prog, s)) )
1084 PL_reg_flags |= RF_tainted;
1091 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1094 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1096 tmp = ((OP(c) == BOUND ?
1097 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1098 LOAD_UTF8_CHARCLASS(alnum,"a");
1099 while (s < strend) {
1100 if (tmp == !(OP(c) == BOUND ?
1101 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1102 isALNUM_LC_utf8((U8*)s)))
1105 if ((norun || regtry(prog, s)))
1112 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1113 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1114 while (s < strend) {
1116 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1118 if ((norun || regtry(prog, s)))
1124 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1128 PL_reg_flags |= RF_tainted;
1135 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1138 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1140 tmp = ((OP(c) == NBOUND ?
1141 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1142 LOAD_UTF8_CHARCLASS(alnum,"a");
1143 while (s < strend) {
1144 if (tmp == !(OP(c) == NBOUND ?
1145 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1146 isALNUM_LC_utf8((U8*)s)))
1148 else if ((norun || regtry(prog, s)))
1154 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1155 tmp = ((OP(c) == NBOUND ?
1156 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1157 while (s < strend) {
1159 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1161 else if ((norun || regtry(prog, s)))
1166 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1171 LOAD_UTF8_CHARCLASS(alnum,"a");
1172 while (s < strend) {
1173 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1174 if (tmp && (norun || regtry(prog, s)))
1185 while (s < strend) {
1187 if (tmp && (norun || regtry(prog, s)))
1199 PL_reg_flags |= RF_tainted;
1201 while (s < strend) {
1202 if (isALNUM_LC_utf8((U8*)s)) {
1203 if (tmp && (norun || regtry(prog, s)))
1214 while (s < strend) {
1215 if (isALNUM_LC(*s)) {
1216 if (tmp && (norun || regtry(prog, s)))
1229 LOAD_UTF8_CHARCLASS(alnum,"a");
1230 while (s < strend) {
1231 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1232 if (tmp && (norun || regtry(prog, s)))
1243 while (s < strend) {
1245 if (tmp && (norun || regtry(prog, s)))
1257 PL_reg_flags |= RF_tainted;
1259 while (s < strend) {
1260 if (!isALNUM_LC_utf8((U8*)s)) {
1261 if (tmp && (norun || regtry(prog, s)))
1272 while (s < strend) {
1273 if (!isALNUM_LC(*s)) {
1274 if (tmp && (norun || regtry(prog, s)))
1287 LOAD_UTF8_CHARCLASS(space," ");
1288 while (s < strend) {
1289 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1290 if (tmp && (norun || regtry(prog, s)))
1301 while (s < strend) {
1303 if (tmp && (norun || regtry(prog, s)))
1315 PL_reg_flags |= RF_tainted;
1317 while (s < strend) {
1318 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1319 if (tmp && (norun || regtry(prog, s)))
1330 while (s < strend) {
1331 if (isSPACE_LC(*s)) {
1332 if (tmp && (norun || regtry(prog, s)))
1345 LOAD_UTF8_CHARCLASS(space," ");
1346 while (s < strend) {
1347 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1348 if (tmp && (norun || regtry(prog, s)))
1359 while (s < strend) {
1361 if (tmp && (norun || regtry(prog, s)))
1373 PL_reg_flags |= RF_tainted;
1375 while (s < strend) {
1376 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1377 if (tmp && (norun || regtry(prog, s)))
1388 while (s < strend) {
1389 if (!isSPACE_LC(*s)) {
1390 if (tmp && (norun || regtry(prog, s)))
1403 LOAD_UTF8_CHARCLASS(digit,"0");
1404 while (s < strend) {
1405 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1406 if (tmp && (norun || regtry(prog, s)))
1417 while (s < strend) {
1419 if (tmp && (norun || regtry(prog, s)))
1431 PL_reg_flags |= RF_tainted;
1433 while (s < strend) {
1434 if (isDIGIT_LC_utf8((U8*)s)) {
1435 if (tmp && (norun || regtry(prog, s)))
1446 while (s < strend) {
1447 if (isDIGIT_LC(*s)) {
1448 if (tmp && (norun || regtry(prog, s)))
1461 LOAD_UTF8_CHARCLASS(digit,"0");
1462 while (s < strend) {
1463 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1464 if (tmp && (norun || regtry(prog, s)))
1475 while (s < strend) {
1477 if (tmp && (norun || regtry(prog, s)))
1489 PL_reg_flags |= RF_tainted;
1491 while (s < strend) {
1492 if (!isDIGIT_LC_utf8((U8*)s)) {
1493 if (tmp && (norun || regtry(prog, s)))
1504 while (s < strend) {
1505 if (!isDIGIT_LC(*s)) {
1506 if (tmp && (norun || regtry(prog, s)))
1518 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1527 - regexec_flags - match a regexp against a string
1530 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1531 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1532 /* strend: pointer to null at end of string */
1533 /* strbeg: real beginning of string */
1534 /* minend: end of match must be >=minend after stringarg. */
1535 /* data: May be used for some additional optimizations. */
1536 /* nosave: For optimizations. */
1539 register regnode *c;
1540 register char *startpos = stringarg;
1541 I32 minlen; /* must match at least this many chars */
1542 I32 dontbother = 0; /* how many characters not to try at end */
1543 /* I32 start_shift = 0; */ /* Offset of the start to find
1544 constant substr. */ /* CC */
1545 I32 end_shift = 0; /* Same for the end. */ /* CC */
1546 I32 scream_pos = -1; /* Internal iterator of scream. */
1548 SV* oreplsv = GvSV(PL_replgv);
1549 bool do_utf8 = DO_UTF8(sv);
1551 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1552 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1559 PL_regnarrate = DEBUG_r_TEST;
1562 /* Be paranoid... */
1563 if (prog == NULL || startpos == NULL) {
1564 Perl_croak(aTHX_ "NULL regexp parameter");
1568 minlen = prog->minlen;
1569 if (strend - startpos < minlen) {
1570 DEBUG_r(PerlIO_printf(Perl_debug_log,
1571 "String too short [regexec_flags]...\n"));
1575 /* Check validity of program. */
1576 if (UCHARAT(prog->program) != REG_MAGIC) {
1577 Perl_croak(aTHX_ "corrupted regexp program");
1581 PL_reg_eval_set = 0;
1584 if (prog->reganch & ROPT_UTF8)
1585 PL_reg_flags |= RF_utf8;
1587 /* Mark beginning of line for ^ and lookbehind. */
1588 PL_regbol = startpos;
1592 /* Mark end of line for $ (and such) */
1595 /* see how far we have to get to not match where we matched before */
1596 PL_regtill = startpos+minend;
1598 /* We start without call_cc context. */
1601 /* If there is a "must appear" string, look for it. */
1604 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1607 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1608 PL_reg_ganch = startpos;
1609 else if (sv && SvTYPE(sv) >= SVt_PVMG
1611 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1612 && mg->mg_len >= 0) {
1613 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1614 if (prog->reganch & ROPT_ANCH_GPOS) {
1615 if (s > PL_reg_ganch)
1620 else /* pos() not defined */
1621 PL_reg_ganch = strbeg;
1624 if (do_utf8 == (UTF!=0) &&
1625 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1626 re_scream_pos_data d;
1628 d.scream_olds = &scream_olds;
1629 d.scream_pos = &scream_pos;
1630 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1632 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1633 goto phooey; /* not present */
1639 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1640 UNI_DISPLAY_REGEX) :
1642 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1643 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1644 UNI_DISPLAY_REGEX) : startpos;
1645 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1648 PerlIO_printf(Perl_debug_log,
1649 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1650 PL_colors[4],PL_colors[5],PL_colors[0],
1653 len0 > 60 ? "..." : "",
1655 (int)(len1 > 60 ? 60 : len1),
1657 (len1 > 60 ? "..." : "")
1661 /* Simplest case: anchored match need be tried only once. */
1662 /* [unless only anchor is BOL and multiline is set] */
1663 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1664 if (s == startpos && regtry(prog, startpos))
1666 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1667 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1672 dontbother = minlen - 1;
1673 end = HOP3c(strend, -dontbother, strbeg) - 1;
1674 /* for multiline we only have to try after newlines */
1675 if (prog->check_substr) {
1679 if (regtry(prog, s))
1684 if (prog->reganch & RE_USE_INTUIT) {
1685 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1696 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1697 if (regtry(prog, s))
1704 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1705 if (regtry(prog, PL_reg_ganch))
1710 /* Messy cases: unanchored match. */
1711 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1712 /* we have /x+whatever/ */
1713 /* it must be a one character string (XXXX Except UTF?) */
1714 char ch = SvPVX(prog->anchored_substr)[0];
1720 while (s < strend) {
1722 DEBUG_r( did_match = 1 );
1723 if (regtry(prog, s)) goto got_it;
1725 while (s < strend && *s == ch)
1732 while (s < strend) {
1734 DEBUG_r( did_match = 1 );
1735 if (regtry(prog, s)) goto got_it;
1737 while (s < strend && *s == ch)
1743 DEBUG_r(if (!did_match)
1744 PerlIO_printf(Perl_debug_log,
1745 "Did not find anchored character...\n")
1749 else if (do_utf8 == (UTF!=0) &&
1750 (prog->anchored_substr != Nullsv
1751 || (prog->float_substr != Nullsv
1752 && prog->float_max_offset < strend - s))) {
1753 SV *must = prog->anchored_substr
1754 ? prog->anchored_substr : prog->float_substr;
1756 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1758 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1759 char *last = HOP3c(strend, /* Cannot start after this */
1760 -(I32)(CHR_SVLEN(must)
1761 - (SvTAIL(must) != 0) + back_min), strbeg);
1762 char *last1; /* Last position checked before */
1768 last1 = HOPc(s, -1);
1770 last1 = s - 1; /* bogus */
1772 /* XXXX check_substr already used to find `s', can optimize if
1773 check_substr==must. */
1775 dontbother = end_shift;
1776 strend = HOPc(strend, -dontbother);
1777 while ( (s <= last) &&
1778 ((flags & REXEC_SCREAM)
1779 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1780 end_shift, &scream_pos, 0))
1781 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1782 (unsigned char*)strend, must,
1783 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1784 DEBUG_r( did_match = 1 );
1785 if (HOPc(s, -back_max) > last1) {
1786 last1 = HOPc(s, -back_min);
1787 s = HOPc(s, -back_max);
1790 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1792 last1 = HOPc(s, -back_min);
1796 while (s <= last1) {
1797 if (regtry(prog, s))
1803 while (s <= last1) {
1804 if (regtry(prog, s))
1810 DEBUG_r(if (!did_match)
1811 PerlIO_printf(Perl_debug_log,
1812 "Did not find %s substr `%s%.*s%s'%s...\n",
1813 ((must == prog->anchored_substr)
1814 ? "anchored" : "floating"),
1816 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1818 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1822 else if ((c = prog->regstclass)) {
1823 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1824 /* don't bother with what can't match */
1825 strend = HOPc(strend, -(minlen - 1));
1827 SV *prop = sv_newmortal();
1835 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1836 UNI_DISPLAY_REGEX) :
1838 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1840 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1841 len1 = UTF ? SvCUR(dsv1) : strend - s;
1842 PerlIO_printf(Perl_debug_log,
1843 "Matching stclass `%*.*s' against `%*.*s'\n",
1847 if (find_byclass(prog, c, s, strend, startpos, 0))
1849 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1853 if (prog->float_substr != Nullsv) { /* Trim the end. */
1856 if (flags & REXEC_SCREAM) {
1857 last = screaminstr(sv, prog->float_substr, s - strbeg,
1858 end_shift, &scream_pos, 1); /* last one */
1860 last = scream_olds; /* Only one occurrence. */
1864 char *little = SvPV(prog->float_substr, len);
1866 if (SvTAIL(prog->float_substr)) {
1867 if (memEQ(strend - len + 1, little, len - 1))
1868 last = strend - len + 1;
1869 else if (!PL_multiline)
1870 last = memEQ(strend - len, little, len)
1871 ? strend - len : Nullch;
1877 last = rninstr(s, strend, little, little + len);
1879 last = strend; /* matching `$' */
1883 DEBUG_r(PerlIO_printf(Perl_debug_log,
1884 "%sCan't trim the tail, match fails (should not happen)%s\n",
1885 PL_colors[4],PL_colors[5]));
1886 goto phooey; /* Should not happen! */
1888 dontbother = strend - last + prog->float_min_offset;
1890 if (minlen && (dontbother < minlen))
1891 dontbother = minlen - 1;
1892 strend -= dontbother; /* this one's always in bytes! */
1893 /* We don't know much -- general case. */
1896 if (regtry(prog, s))
1905 if (regtry(prog, s))
1907 } while (s++ < strend);
1915 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1917 if (PL_reg_eval_set) {
1918 /* Preserve the current value of $^R */
1919 if (oreplsv != GvSV(PL_replgv))
1920 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1921 restored, the value remains
1923 restore_pos(aTHX_ 0);
1926 /* make sure $`, $&, $', and $digit will work later */
1927 if ( !(flags & REXEC_NOT_FIRST) ) {
1928 if (RX_MATCH_COPIED(prog)) {
1929 Safefree(prog->subbeg);
1930 RX_MATCH_COPIED_off(prog);
1932 if (flags & REXEC_COPY_STR) {
1933 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1935 s = savepvn(strbeg, i);
1938 RX_MATCH_COPIED_on(prog);
1941 prog->subbeg = strbeg;
1942 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1949 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1950 PL_colors[4],PL_colors[5]));
1951 if (PL_reg_eval_set)
1952 restore_pos(aTHX_ 0);
1957 - regtry - try match at specific point
1959 STATIC I32 /* 0 failure, 1 success */
1960 S_regtry(pTHX_ regexp *prog, char *startpos)
1968 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1970 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1973 PL_reg_eval_set = RS_init;
1975 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1976 (IV)(PL_stack_sp - PL_stack_base));
1978 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1979 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1980 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1982 /* Apparently this is not needed, judging by wantarray. */
1983 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1984 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1987 /* Make $_ available to executed code. */
1988 if (PL_reg_sv != DEFSV) {
1989 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
1994 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1995 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1996 /* prepare for quick setting of pos */
1997 sv_magic(PL_reg_sv, (SV*)0,
1998 PERL_MAGIC_regex_global, Nullch, 0);
1999 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2003 PL_reg_oldpos = mg->mg_len;
2004 SAVEDESTRUCTOR_X(restore_pos, 0);
2006 if (!PL_reg_curpm) {
2007 Newz(22,PL_reg_curpm, 1, PMOP);
2010 SV* repointer = newSViv(0);
2011 /* so we know which PL_regex_padav element is PL_reg_curpm */
2012 SvFLAGS(repointer) |= SVf_BREAK;
2013 av_push(PL_regex_padav,repointer);
2014 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2015 PL_regex_pad = AvARRAY(PL_regex_padav);
2019 PM_SETRE(PL_reg_curpm, prog);
2020 PL_reg_oldcurpm = PL_curpm;
2021 PL_curpm = PL_reg_curpm;
2022 if (RX_MATCH_COPIED(prog)) {
2023 /* Here is a serious problem: we cannot rewrite subbeg,
2024 since it may be needed if this match fails. Thus
2025 $` inside (?{}) could fail... */
2026 PL_reg_oldsaved = prog->subbeg;
2027 PL_reg_oldsavedlen = prog->sublen;
2028 RX_MATCH_COPIED_off(prog);
2031 PL_reg_oldsaved = Nullch;
2032 prog->subbeg = PL_bostr;
2033 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2035 prog->startp[0] = startpos - PL_bostr;
2036 PL_reginput = startpos;
2037 PL_regstartp = prog->startp;
2038 PL_regendp = prog->endp;
2039 PL_reglastparen = &prog->lastparen;
2040 PL_reglastcloseparen = &prog->lastcloseparen;
2041 prog->lastparen = 0;
2043 DEBUG_r(PL_reg_starttry = startpos);
2044 if (PL_reg_start_tmpl <= prog->nparens) {
2045 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2046 if(PL_reg_start_tmp)
2047 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2049 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2053 sv_setpvn(PERL_DEBUG_PAD(0), "", 0);
2054 sv_setpvn(PERL_DEBUG_PAD(1), "", 0);
2055 sv_setpvn(PERL_DEBUG_PAD(2), "", 0);
2058 /* XXXX What this code is doing here?!!! There should be no need
2059 to do this again and again, PL_reglastparen should take care of
2062 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2063 * Actually, the code in regcppop() (which Ilya may be meaning by
2064 * PL_reglastparen), is not needed at all by the test suite
2065 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2066 * enough, for building DynaLoader, or otherwise this
2067 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2068 * will happen. Meanwhile, this code *is* needed for the
2069 * above-mentioned test suite tests to succeed. The common theme
2070 * on those tests seems to be returning null fields from matches.
2075 if (prog->nparens) {
2076 for (i = prog->nparens; i > *PL_reglastparen; i--) {
2083 if (regmatch(prog->program + 1)) {
2084 prog->endp[0] = PL_reginput - PL_bostr;
2087 REGCP_UNWIND(lastcp);
2091 #define RE_UNWIND_BRANCH 1
2092 #define RE_UNWIND_BRANCHJ 2
2096 typedef struct { /* XX: makes sense to enlarge it... */
2100 } re_unwind_generic_t;
2113 } re_unwind_branch_t;
2115 typedef union re_unwind_t {
2117 re_unwind_generic_t generic;
2118 re_unwind_branch_t branch;
2121 #define sayYES goto yes
2122 #define sayNO goto no
2123 #define sayNO_ANYOF goto no_anyof
2124 #define sayYES_FINAL goto yes_final
2125 #define sayYES_LOUD goto yes_loud
2126 #define sayNO_FINAL goto no_final
2127 #define sayNO_SILENT goto do_no
2128 #define saySAME(x) if (x) goto yes; else goto no
2130 #define REPORT_CODE_OFF 24
2133 - regmatch - main matching routine
2135 * Conceptually the strategy is simple: check to see whether the current
2136 * node matches, call self recursively to see whether the rest matches,
2137 * and then act accordingly. In practice we make some effort to avoid
2138 * recursion, in particular by going through "ordinary" nodes (that don't
2139 * need to know whether the rest of the match failed) by a loop instead of
2142 /* [lwall] I've hoisted the register declarations to the outer block in order to
2143 * maybe save a little bit of pushing and popping on the stack. It also takes
2144 * advantage of machines that use a register save mask on subroutine entry.
2146 STATIC I32 /* 0 failure, 1 success */
2147 S_regmatch(pTHX_ regnode *prog)
2149 register regnode *scan; /* Current node. */
2150 regnode *next; /* Next node. */
2151 regnode *inner; /* Next node in internal branch. */
2152 register I32 nextchr; /* renamed nextchr - nextchar colides with
2153 function of same name */
2154 register I32 n; /* no or next */
2155 register I32 ln = 0; /* len or last */
2156 register char *s = Nullch; /* operand or save */
2157 register char *locinput = PL_reginput;
2158 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2159 int minmod = 0, sw = 0, logical = 0;
2162 I32 firstcp = PL_savestack_ix;
2164 register bool do_utf8 = PL_reg_match_utf8;
2166 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2167 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2168 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2175 /* Note that nextchr is a byte even in UTF */
2176 nextchr = UCHARAT(locinput);
2178 while (scan != NULL) {
2181 SV *prop = sv_newmortal();
2182 int docolor = *PL_colors[0];
2183 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2184 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2185 /* The part of the string before starttry has one color
2186 (pref0_len chars), between starttry and current
2187 position another one (pref_len - pref0_len chars),
2188 after the current position the third one.
2189 We assume that pref0_len <= pref_len, otherwise we
2190 decrease pref0_len. */
2191 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2192 ? (5 + taill) - l : locinput - PL_bostr;
2195 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2197 pref0_len = pref_len - (locinput - PL_reg_starttry);
2198 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2199 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2200 ? (5 + taill) - pref_len : PL_regeol - locinput);
2201 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2205 if (pref0_len > pref_len)
2206 pref0_len = pref_len;
2207 regprop(prop, scan);
2211 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2212 pref0_len, 60, UNI_DISPLAY_REGEX) :
2213 locinput - pref_len;
2214 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2215 char *s1 = do_utf8 ?
2216 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2217 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2218 locinput - pref_len + pref0_len;
2219 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2220 char *s2 = do_utf8 ?
2221 pv_uni_display(dsv2, (U8*)locinput,
2222 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2224 int len2 = do_utf8 ? strlen(s2) : l;
2225 PerlIO_printf(Perl_debug_log,
2226 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2227 (IV)(locinput - PL_bostr),
2234 (docolor ? "" : "> <"),
2238 15 - l - pref_len + 1,
2240 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2245 next = scan + NEXT_OFF(scan);
2251 if (locinput == PL_bostr || (PL_multiline &&
2252 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2254 /* regtill = regbol; */
2259 if (locinput == PL_bostr ||
2260 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2266 if (locinput == PL_bostr)
2270 if (locinput == PL_reg_ganch)
2280 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2285 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2287 if (PL_regeol - locinput > 1)
2291 if (PL_regeol != locinput)
2295 if (!nextchr && locinput >= PL_regeol)
2298 locinput += PL_utf8skip[nextchr];
2299 if (locinput > PL_regeol)
2301 nextchr = UCHARAT(locinput);
2304 nextchr = UCHARAT(++locinput);
2307 if (!nextchr && locinput >= PL_regeol)
2309 nextchr = UCHARAT(++locinput);
2312 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2315 locinput += PL_utf8skip[nextchr];
2316 if (locinput > PL_regeol)
2318 nextchr = UCHARAT(locinput);
2321 nextchr = UCHARAT(++locinput);
2326 if (do_utf8 != (UTF!=0)) {
2327 /* The target and the pattern have differing utf8ness. */
2333 /* The target is utf8, the pattern is not utf8. */
2337 if (NATIVE_TO_UNI(*(U8*)s) !=
2338 utf8_to_uvuni((U8*)l, &ulen))
2345 /* The target is not utf8, the pattern is utf8. */
2349 if (NATIVE_TO_UNI(*((U8*)l)) !=
2350 utf8_to_uvuni((U8*)s, &ulen))
2357 nextchr = UCHARAT(locinput);
2360 /* The target and the pattern have the same utf8ness. */
2361 /* Inline the first character, for speed. */
2362 if (UCHARAT(s) != nextchr)
2364 if (PL_regeol - locinput < ln)
2366 if (ln > 1 && memNE(s, locinput, ln))
2369 nextchr = UCHARAT(locinput);
2372 PL_reg_flags |= RF_tainted;
2378 if (do_utf8 || UTF) {
2379 /* Either target or the pattern are utf8. */
2381 char *e = PL_regeol;
2383 if (ibcmp_utf8(s, 0, ln, do_utf8,
2385 /* One more case for the sharp s:
2386 * pack("U0U*", 0xDF) =~ /ss/i,
2387 * the 0xC3 0x9F are the UTF-8
2388 * byte sequence for the U+00DF. */
2390 toLOWER(s[0]) == 's' &&
2392 toLOWER(s[1]) == 's' &&
2399 nextchr = UCHARAT(locinput);
2403 /* Neither the target and the pattern are utf8. */
2405 /* Inline the first character, for speed. */
2406 if (UCHARAT(s) != nextchr &&
2407 UCHARAT(s) != ((OP(scan) == EXACTF)
2408 ? PL_fold : PL_fold_locale)[nextchr])
2410 if (PL_regeol - locinput < ln)
2412 if (ln > 1 && (OP(scan) == EXACTF
2413 ? ibcmp(s, locinput, ln)
2414 : ibcmp_locale(s, locinput, ln)))
2417 nextchr = UCHARAT(locinput);
2421 STRLEN inclasslen = PL_regeol - locinput;
2423 if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
2425 if (locinput >= PL_regeol)
2427 locinput += inclasslen;
2428 nextchr = UCHARAT(locinput);
2433 nextchr = UCHARAT(locinput);
2434 if (!reginclass(scan, (U8*)locinput, do_utf8))
2436 if (!nextchr && locinput >= PL_regeol)
2438 nextchr = UCHARAT(++locinput);
2442 /* If we might have the case of the German sharp s
2443 * in a casefolding Unicode character class. */
2445 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2446 locinput += SHARP_S_SKIP;
2447 nextchr = UCHARAT(locinput);
2453 PL_reg_flags |= RF_tainted;
2459 LOAD_UTF8_CHARCLASS(alnum,"a");
2460 if (!(OP(scan) == ALNUM
2461 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2462 : isALNUM_LC_utf8((U8*)locinput)))
2466 locinput += PL_utf8skip[nextchr];
2467 nextchr = UCHARAT(locinput);
2470 if (!(OP(scan) == ALNUM
2471 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2473 nextchr = UCHARAT(++locinput);
2476 PL_reg_flags |= RF_tainted;
2479 if (!nextchr && locinput >= PL_regeol)
2482 LOAD_UTF8_CHARCLASS(alnum,"a");
2483 if (OP(scan) == NALNUM
2484 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2485 : isALNUM_LC_utf8((U8*)locinput))
2489 locinput += PL_utf8skip[nextchr];
2490 nextchr = UCHARAT(locinput);
2493 if (OP(scan) == NALNUM
2494 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2496 nextchr = UCHARAT(++locinput);
2500 PL_reg_flags |= RF_tainted;
2504 /* was last char in word? */
2506 if (locinput == PL_bostr)
2509 U8 *r = reghop((U8*)locinput, -1);
2511 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2513 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2514 ln = isALNUM_uni(ln);
2515 LOAD_UTF8_CHARCLASS(alnum,"a");
2516 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2519 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2520 n = isALNUM_LC_utf8((U8*)locinput);
2524 ln = (locinput != PL_bostr) ?
2525 UCHARAT(locinput - 1) : '\n';
2526 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2528 n = isALNUM(nextchr);
2531 ln = isALNUM_LC(ln);
2532 n = isALNUM_LC(nextchr);
2535 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2536 OP(scan) == BOUNDL))
2540 PL_reg_flags |= RF_tainted;
2546 if (UTF8_IS_CONTINUED(nextchr)) {
2547 LOAD_UTF8_CHARCLASS(space," ");
2548 if (!(OP(scan) == SPACE
2549 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2550 : isSPACE_LC_utf8((U8*)locinput)))
2554 locinput += PL_utf8skip[nextchr];
2555 nextchr = UCHARAT(locinput);
2558 if (!(OP(scan) == SPACE
2559 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2561 nextchr = UCHARAT(++locinput);
2564 if (!(OP(scan) == SPACE
2565 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2567 nextchr = UCHARAT(++locinput);
2571 PL_reg_flags |= RF_tainted;
2574 if (!nextchr && locinput >= PL_regeol)
2577 LOAD_UTF8_CHARCLASS(space," ");
2578 if (OP(scan) == NSPACE
2579 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2580 : isSPACE_LC_utf8((U8*)locinput))
2584 locinput += PL_utf8skip[nextchr];
2585 nextchr = UCHARAT(locinput);
2588 if (OP(scan) == NSPACE
2589 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2591 nextchr = UCHARAT(++locinput);
2594 PL_reg_flags |= RF_tainted;
2600 LOAD_UTF8_CHARCLASS(digit,"0");
2601 if (!(OP(scan) == DIGIT
2602 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2603 : isDIGIT_LC_utf8((U8*)locinput)))
2607 locinput += PL_utf8skip[nextchr];
2608 nextchr = UCHARAT(locinput);
2611 if (!(OP(scan) == DIGIT
2612 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2614 nextchr = UCHARAT(++locinput);
2617 PL_reg_flags |= RF_tainted;
2620 if (!nextchr && locinput >= PL_regeol)
2623 LOAD_UTF8_CHARCLASS(digit,"0");
2624 if (OP(scan) == NDIGIT
2625 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2626 : isDIGIT_LC_utf8((U8*)locinput))
2630 locinput += PL_utf8skip[nextchr];
2631 nextchr = UCHARAT(locinput);
2634 if (OP(scan) == NDIGIT
2635 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2637 nextchr = UCHARAT(++locinput);
2640 if (locinput >= PL_regeol)
2643 LOAD_UTF8_CHARCLASS(mark,"~");
2644 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2646 locinput += PL_utf8skip[nextchr];
2647 while (locinput < PL_regeol &&
2648 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2649 locinput += UTF8SKIP(locinput);
2650 if (locinput > PL_regeol)
2655 nextchr = UCHARAT(locinput);
2658 PL_reg_flags |= RF_tainted;
2662 n = ARG(scan); /* which paren pair */
2663 ln = PL_regstartp[n];
2664 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2665 if (*PL_reglastparen < n || ln == -1)
2666 sayNO; /* Do not match unless seen CLOSEn. */
2667 if (ln == PL_regendp[n])
2671 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2673 char *e = PL_bostr + PL_regendp[n];
2675 * Note that we can't do the "other character" lookup trick as
2676 * in the 8-bit case (no pun intended) because in Unicode we
2677 * have to map both upper and title case to lower case.
2679 if (OP(scan) == REFF) {
2680 STRLEN ulen1, ulen2;
2681 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2682 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
2686 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2687 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2688 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2695 nextchr = UCHARAT(locinput);
2699 /* Inline the first character, for speed. */
2700 if (UCHARAT(s) != nextchr &&
2702 (UCHARAT(s) != ((OP(scan) == REFF
2703 ? PL_fold : PL_fold_locale)[nextchr]))))
2705 ln = PL_regendp[n] - ln;
2706 if (locinput + ln > PL_regeol)
2708 if (ln > 1 && (OP(scan) == REF
2709 ? memNE(s, locinput, ln)
2711 ? ibcmp(s, locinput, ln)
2712 : ibcmp_locale(s, locinput, ln))))
2715 nextchr = UCHARAT(locinput);
2726 OP_4tree *oop = PL_op;
2727 COP *ocurcop = PL_curcop;
2728 SV **ocurpad = PL_curpad;
2732 PL_op = (OP_4tree*)PL_regdata->data[n];
2733 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2734 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2735 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2739 CALLRUNOPS(aTHX); /* Scalar context. */
2742 ret = Nullsv; /* protect against empty (?{}) blocks. */
2750 PL_curpad = ocurpad;
2751 PL_curcop = ocurcop;
2753 if (logical == 2) { /* Postponed subexpression. */
2755 MAGIC *mg = Null(MAGIC*);
2757 CHECKPOINT cp, lastcp;
2759 if(SvROK(ret) || SvRMAGICAL(ret)) {
2760 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2763 mg = mg_find(sv, PERL_MAGIC_qr);
2766 re = (regexp *)mg->mg_obj;
2767 (void)ReREFCNT_inc(re);
2771 char *t = SvPV(ret, len);
2773 char *oprecomp = PL_regprecomp;
2774 I32 osize = PL_regsize;
2775 I32 onpar = PL_regnpar;
2778 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2780 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2781 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2783 PL_regprecomp = oprecomp;
2788 PerlIO_printf(Perl_debug_log,
2789 "Entering embedded `%s%.60s%s%s'\n",
2793 (strlen(re->precomp) > 60 ? "..." : ""))
2796 state.prev = PL_reg_call_cc;
2797 state.cc = PL_regcc;
2798 state.re = PL_reg_re;
2802 cp = regcppush(0); /* Save *all* the positions. */
2805 state.ss = PL_savestack_ix;
2806 *PL_reglastparen = 0;
2807 *PL_reglastcloseparen = 0;
2808 PL_reg_call_cc = &state;
2809 PL_reginput = locinput;
2811 /* XXXX This is too dramatic a measure... */
2814 if (regmatch(re->program + 1)) {
2815 /* Even though we succeeded, we need to restore
2816 global variables, since we may be wrapped inside
2817 SUSPEND, thus the match may be not finished yet. */
2819 /* XXXX Do this only if SUSPENDed? */
2820 PL_reg_call_cc = state.prev;
2821 PL_regcc = state.cc;
2822 PL_reg_re = state.re;
2823 cache_re(PL_reg_re);
2825 /* XXXX This is too dramatic a measure... */
2828 /* These are needed even if not SUSPEND. */
2834 REGCP_UNWIND(lastcp);
2836 PL_reg_call_cc = state.prev;
2837 PL_regcc = state.cc;
2838 PL_reg_re = state.re;
2839 cache_re(PL_reg_re);
2841 /* XXXX This is too dramatic a measure... */
2851 sv_setsv(save_scalar(PL_replgv), ret);
2855 n = ARG(scan); /* which paren pair */
2856 PL_reg_start_tmp[n] = locinput;
2861 n = ARG(scan); /* which paren pair */
2862 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2863 PL_regendp[n] = locinput - PL_bostr;
2864 if (n > *PL_reglastparen)
2865 *PL_reglastparen = n;
2866 *PL_reglastcloseparen = n;
2869 n = ARG(scan); /* which paren pair */
2870 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2873 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2875 next = NEXTOPER(NEXTOPER(scan));
2877 next = scan + ARG(scan);
2878 if (OP(next) == IFTHEN) /* Fake one. */
2879 next = NEXTOPER(NEXTOPER(next));
2883 logical = scan->flags;
2885 /*******************************************************************
2886 PL_regcc contains infoblock about the innermost (...)* loop, and
2887 a pointer to the next outer infoblock.
2889 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2891 1) After matching X, regnode for CURLYX is processed;
2893 2) This regnode creates infoblock on the stack, and calls
2894 regmatch() recursively with the starting point at WHILEM node;
2896 3) Each hit of WHILEM node tries to match A and Z (in the order
2897 depending on the current iteration, min/max of {min,max} and
2898 greediness). The information about where are nodes for "A"
2899 and "Z" is read from the infoblock, as is info on how many times "A"
2900 was already matched, and greediness.
2902 4) After A matches, the same WHILEM node is hit again.
2904 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2905 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2906 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2907 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2908 of the external loop.
2910 Currently present infoblocks form a tree with a stem formed by PL_curcc
2911 and whatever it mentions via ->next, and additional attached trees
2912 corresponding to temporarily unset infoblocks as in "5" above.
2914 In the following picture infoblocks for outer loop of
2915 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2916 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2917 infoblocks are drawn below the "reset" infoblock.
2919 In fact in the picture below we do not show failed matches for Z and T
2920 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2921 more obvious *why* one needs to *temporary* unset infoblocks.]
2923 Matched REx position InfoBlocks Comment
2927 Y A)*?Z)*?T x <- O <- I
2928 YA )*?Z)*?T x <- O <- I
2929 YA A)*?Z)*?T x <- O <- I
2930 YAA )*?Z)*?T x <- O <- I
2931 YAA Z)*?T x <- O # Temporary unset I
2934 YAAZ Y(A)*?Z)*?T x <- O
2937 YAAZY (A)*?Z)*?T x <- O
2940 YAAZY A)*?Z)*?T x <- O <- I
2943 YAAZYA )*?Z)*?T x <- O <- I
2946 YAAZYA Z)*?T x <- O # Temporary unset I
2952 YAAZYAZ T x # Temporary unset O
2959 *******************************************************************/
2962 CHECKPOINT cp = PL_savestack_ix;
2963 /* No need to save/restore up to this paren */
2964 I32 parenfloor = scan->flags;
2966 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2968 cc.oldcc = PL_regcc;
2970 /* XXXX Probably it is better to teach regpush to support
2971 parenfloor > PL_regsize... */
2972 if (parenfloor > *PL_reglastparen)
2973 parenfloor = *PL_reglastparen; /* Pessimization... */
2974 cc.parenfloor = parenfloor;
2976 cc.min = ARG1(scan);
2977 cc.max = ARG2(scan);
2978 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2982 PL_reginput = locinput;
2983 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2985 PL_regcc = cc.oldcc;
2991 * This is really hard to understand, because after we match
2992 * what we're trying to match, we must make sure the rest of
2993 * the REx is going to match for sure, and to do that we have
2994 * to go back UP the parse tree by recursing ever deeper. And
2995 * if it fails, we have to reset our parent's current state
2996 * that we can try again after backing off.
2999 CHECKPOINT cp, lastcp;
3000 CURCUR* cc = PL_regcc;
3001 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3003 n = cc->cur + 1; /* how many we know we matched */
3004 PL_reginput = locinput;
3007 PerlIO_printf(Perl_debug_log,
3008 "%*s %ld out of %ld..%ld cc=%lx\n",
3009 REPORT_CODE_OFF+PL_regindent*2, "",
3010 (long)n, (long)cc->min,
3011 (long)cc->max, (long)cc)
3014 /* If degenerate scan matches "", assume scan done. */
3016 if (locinput == cc->lastloc && n >= cc->min) {
3017 PL_regcc = cc->oldcc;
3021 PerlIO_printf(Perl_debug_log,
3022 "%*s empty match detected, try continuation...\n",
3023 REPORT_CODE_OFF+PL_regindent*2, "")
3025 if (regmatch(cc->next))
3033 /* First just match a string of min scans. */
3037 cc->lastloc = locinput;
3038 if (regmatch(cc->scan))
3041 cc->lastloc = lastloc;
3046 /* Check whether we already were at this position.
3047 Postpone detection until we know the match is not
3048 *that* much linear. */
3049 if (!PL_reg_maxiter) {
3050 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3051 PL_reg_leftiter = PL_reg_maxiter;
3053 if (PL_reg_leftiter-- == 0) {
3054 I32 size = (PL_reg_maxiter + 7)/8;
3055 if (PL_reg_poscache) {
3056 if (PL_reg_poscache_size < size) {
3057 Renew(PL_reg_poscache, size, char);
3058 PL_reg_poscache_size = size;
3060 Zero(PL_reg_poscache, size, char);
3063 PL_reg_poscache_size = size;
3064 Newz(29, PL_reg_poscache, size, char);
3067 PerlIO_printf(Perl_debug_log,
3068 "%sDetected a super-linear match, switching on caching%s...\n",
3069 PL_colors[4], PL_colors[5])
3072 if (PL_reg_leftiter < 0) {
3073 I32 o = locinput - PL_bostr, b;
3075 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3078 if (PL_reg_poscache[o] & (1<<b)) {
3080 PerlIO_printf(Perl_debug_log,
3081 "%*s already tried at this position...\n",
3082 REPORT_CODE_OFF+PL_regindent*2, "")
3086 PL_reg_poscache[o] |= (1<<b);
3090 /* Prefer next over scan for minimal matching. */
3093 PL_regcc = cc->oldcc;
3096 cp = regcppush(cc->parenfloor);
3098 if (regmatch(cc->next)) {
3100 sayYES; /* All done. */
3102 REGCP_UNWIND(lastcp);
3108 if (n >= cc->max) { /* Maximum greed exceeded? */
3109 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3110 && !(PL_reg_flags & RF_warned)) {
3111 PL_reg_flags |= RF_warned;
3112 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
3113 "Complex regular subexpression recursion",
3120 PerlIO_printf(Perl_debug_log,
3121 "%*s trying longer...\n",
3122 REPORT_CODE_OFF+PL_regindent*2, "")
3124 /* Try scanning more and see if it helps. */
3125 PL_reginput = locinput;
3127 cc->lastloc = locinput;
3128 cp = regcppush(cc->parenfloor);
3130 if (regmatch(cc->scan)) {
3134 REGCP_UNWIND(lastcp);
3137 cc->lastloc = lastloc;
3141 /* Prefer scan over next for maximal matching. */
3143 if (n < cc->max) { /* More greed allowed? */
3144 cp = regcppush(cc->parenfloor);
3146 cc->lastloc = locinput;
3148 if (regmatch(cc->scan)) {
3152 REGCP_UNWIND(lastcp);
3153 regcppop(); /* Restore some previous $<digit>s? */
3154 PL_reginput = locinput;
3156 PerlIO_printf(Perl_debug_log,
3157 "%*s failed, try continuation...\n",
3158 REPORT_CODE_OFF+PL_regindent*2, "")
3161 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3162 && !(PL_reg_flags & RF_warned)) {
3163 PL_reg_flags |= RF_warned;
3164 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
3165 "Complex regular subexpression recursion",
3169 /* Failed deeper matches of scan, so see if this one works. */
3170 PL_regcc = cc->oldcc;
3173 if (regmatch(cc->next))
3179 cc->lastloc = lastloc;
3184 next = scan + ARG(scan);
3187 inner = NEXTOPER(NEXTOPER(scan));
3190 inner = NEXTOPER(scan);
3194 if (OP(next) != c1) /* No choice. */
3195 next = inner; /* Avoid recursion. */
3197 I32 lastparen = *PL_reglastparen;
3199 re_unwind_branch_t *uw;
3201 /* Put unwinding data on stack */
3202 unwind1 = SSNEWt(1,re_unwind_branch_t);
3203 uw = SSPTRt(unwind1,re_unwind_branch_t);
3206 uw->type = ((c1 == BRANCH)
3208 : RE_UNWIND_BRANCHJ);
3209 uw->lastparen = lastparen;
3211 uw->locinput = locinput;
3212 uw->nextchr = nextchr;
3214 uw->regindent = ++PL_regindent;
3217 REGCP_SET(uw->lastcp);
3219 /* Now go into the first branch */
3232 /* We suppose that the next guy does not need
3233 backtracking: in particular, it is of constant length,
3234 and has no parenths to influence future backrefs. */
3235 ln = ARG1(scan); /* min to match */
3236 n = ARG2(scan); /* max to match */
3237 paren = scan->flags;
3239 if (paren > PL_regsize)
3241 if (paren > *PL_reglastparen)
3242 *PL_reglastparen = paren;
3244 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3246 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3247 PL_reginput = locinput;
3250 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3252 /* if we matched something zero-length we don't need to
3253 backtrack - capturing parens are already defined, so
3254 the caveat in the maximal case doesn't apply
3256 XXXX if ln == 0, we can redo this check first time
3257 through the following loop
3260 n = ln; /* don't backtrack */
3261 locinput = PL_reginput;
3262 if (HAS_TEXT(next) || JUMPABLE(next)) {
3263 regnode *text_node = next;
3265 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3267 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3269 if (PL_regkind[(U8)OP(text_node)] == REF) {
3271 n = ARG(text_node); /* which paren pair */
3272 ln = PL_regstartp[n];
3273 /* assume yes if we haven't seen CLOSEn */
3275 *PL_reglastparen < n ||
3282 c1 = *(PL_bostr + ln);
3284 else { c1 = (U8)*STRING(text_node); }
3285 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3287 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3288 c2 = PL_fold_locale[c1];
3297 /* This may be improved if l == 0. */
3298 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3299 /* If it could work, try it. */
3301 UCHARAT(PL_reginput) == c1 ||
3302 UCHARAT(PL_reginput) == c2)
3306 PL_regstartp[paren] =
3307 HOPc(PL_reginput, -l) - PL_bostr;
3308 PL_regendp[paren] = PL_reginput - PL_bostr;
3311 PL_regendp[paren] = -1;
3315 REGCP_UNWIND(lastcp);
3317 /* Couldn't or didn't -- move forward. */
3318 PL_reginput = locinput;
3319 if (regrepeat_hard(scan, 1, &l)) {
3321 locinput = PL_reginput;
3328 n = regrepeat_hard(scan, n, &l);
3329 /* if we matched something zero-length we don't need to
3330 backtrack, unless the minimum count is zero and we
3331 are capturing the result - in that case the capture
3332 being defined or not may affect later execution
3334 if (n != 0 && l == 0 && !(paren && ln == 0))
3335 ln = n; /* don't backtrack */
3336 locinput = PL_reginput;
3338 PerlIO_printf(Perl_debug_log,
3339 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3340 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3344 if (HAS_TEXT(next) || JUMPABLE(next)) {
3345 regnode *text_node = next;
3347 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3349 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3351 if (PL_regkind[(U8)OP(text_node)] == REF) {
3353 n = ARG(text_node); /* which paren pair */
3354 ln = PL_regstartp[n];
3355 /* assume yes if we haven't seen CLOSEn */
3357 *PL_reglastparen < n ||
3364 c1 = *(PL_bostr + ln);
3366 else { c1 = (U8)*STRING(text_node); }
3368 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3370 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3371 c2 = PL_fold_locale[c1];
3382 /* If it could work, try it. */
3384 UCHARAT(PL_reginput) == c1 ||
3385 UCHARAT(PL_reginput) == c2)
3388 PerlIO_printf(Perl_debug_log,
3389 "%*s trying tail with n=%"IVdf"...\n",
3390 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3394 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3395 PL_regendp[paren] = PL_reginput - PL_bostr;
3398 PL_regendp[paren] = -1;
3402 REGCP_UNWIND(lastcp);
3404 /* Couldn't or didn't -- back up. */
3406 locinput = HOPc(locinput, -l);
3407 PL_reginput = locinput;
3414 paren = scan->flags; /* Which paren to set */
3415 if (paren > PL_regsize)
3417 if (paren > *PL_reglastparen)
3418 *PL_reglastparen = paren;
3419 ln = ARG1(scan); /* min to match */
3420 n = ARG2(scan); /* max to match */
3421 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3425 ln = ARG1(scan); /* min to match */
3426 n = ARG2(scan); /* max to match */
3427 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3432 scan = NEXTOPER(scan);
3438 scan = NEXTOPER(scan);
3442 * Lookahead to avoid useless match attempts
3443 * when we know what character comes next.
3447 * Used to only do .*x and .*?x, but now it allows
3448 * for )'s, ('s and (?{ ... })'s to be in the way
3449 * of the quantifier and the EXACT-like node. -- japhy
3452 if (HAS_TEXT(next) || JUMPABLE(next)) {
3454 regnode *text_node = next;
3456 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3458 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3460 if (PL_regkind[(U8)OP(text_node)] == REF) {
3462 n = ARG(text_node); /* which paren pair */
3463 ln = PL_regstartp[n];
3464 /* assume yes if we haven't seen CLOSEn */
3466 *PL_reglastparen < n ||
3471 goto assume_ok_easy;
3473 s = (U8*)PL_bostr + ln;
3475 else { s = (U8*)STRING(text_node); }
3479 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3481 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3482 c2 = PL_fold_locale[c1];
3485 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3486 STRLEN ulen1, ulen2;
3487 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3488 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
3490 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3491 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3493 c1 = utf8_to_uvuni(tmpbuf1, 0);
3494 c2 = utf8_to_uvuni(tmpbuf2, 0);
3497 c2 = c1 = utf8_to_uvchr(s, NULL);
3505 PL_reginput = locinput;
3509 if (ln && regrepeat(scan, ln) < ln)
3511 locinput = PL_reginput;
3514 char *e; /* Should not check after this */
3515 char *old = locinput;
3517 if (n == REG_INFTY) {
3520 while (UTF8_IS_CONTINUATION(*(U8*)e))
3526 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3530 e = locinput + n - ln;
3536 /* Find place 'next' could work */
3539 while (locinput <= e &&
3540 UCHARAT(locinput) != c1)
3543 while (locinput <= e
3544 && UCHARAT(locinput) != c1
3545 && UCHARAT(locinput) != c2)
3548 count = locinput - old;
3555 utf8_to_uvchr((U8*)locinput, &len) != c1;
3560 for (count = 0; locinput <= e; count++) {
3561 UV c = utf8_to_uvchr((U8*)locinput, &len);
3562 if (c == c1 || c == c2)
3570 /* PL_reginput == old now */
3571 if (locinput != old) {
3572 ln = 1; /* Did some */
3573 if (regrepeat(scan, count) < count)
3576 /* PL_reginput == locinput now */
3577 TRYPAREN(paren, ln, locinput);
3578 PL_reginput = locinput; /* Could be reset... */
3579 REGCP_UNWIND(lastcp);
3580 /* Couldn't or didn't -- move forward. */
3583 locinput += UTF8SKIP(locinput);
3589 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3593 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3595 c = UCHARAT(PL_reginput);
3596 /* If it could work, try it. */
3597 if (c == c1 || c == c2)
3599 TRYPAREN(paren, n, PL_reginput);
3600 REGCP_UNWIND(lastcp);
3603 /* If it could work, try it. */
3604 else if (c1 == -1000)
3606 TRYPAREN(paren, n, PL_reginput);
3607 REGCP_UNWIND(lastcp);
3609 /* Couldn't or didn't -- move forward. */
3610 PL_reginput = locinput;
3611 if (regrepeat(scan, 1)) {
3613 locinput = PL_reginput;
3621 n = regrepeat(scan, n);
3622 locinput = PL_reginput;
3623 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3624 ((!PL_multiline && OP(next) != MEOL) ||
3625 OP(next) == SEOL || OP(next) == EOS))
3627 ln = n; /* why back off? */
3628 /* ...because $ and \Z can match before *and* after
3629 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3630 We should back off by one in this case. */
3631 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3640 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3642 c = UCHARAT(PL_reginput);
3644 /* If it could work, try it. */
3645 if (c1 == -1000 || c == c1 || c == c2)
3647 TRYPAREN(paren, n, PL_reginput);
3648 REGCP_UNWIND(lastcp);
3650 /* Couldn't or didn't -- back up. */
3652 PL_reginput = locinput = HOPc(locinput, -1);
3660 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3662 c = UCHARAT(PL_reginput);
3664 /* If it could work, try it. */
3665 if (c1 == -1000 || c == c1 || c == c2)
3667 TRYPAREN(paren, n, PL_reginput);
3668 REGCP_UNWIND(lastcp);
3670 /* Couldn't or didn't -- back up. */
3672 PL_reginput = locinput = HOPc(locinput, -1);
3679 if (PL_reg_call_cc) {
3680 re_cc_state *cur_call_cc = PL_reg_call_cc;
3681 CURCUR *cctmp = PL_regcc;
3682 regexp *re = PL_reg_re;
3683 CHECKPOINT cp, lastcp;
3685 cp = regcppush(0); /* Save *all* the positions. */
3687 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3689 PL_reginput = locinput; /* Make position available to
3691 cache_re(PL_reg_call_cc->re);
3692 PL_regcc = PL_reg_call_cc->cc;
3693 PL_reg_call_cc = PL_reg_call_cc->prev;
3694 if (regmatch(cur_call_cc->node)) {
3695 PL_reg_call_cc = cur_call_cc;
3699 REGCP_UNWIND(lastcp);
3701 PL_reg_call_cc = cur_call_cc;
3707 PerlIO_printf(Perl_debug_log,
3708 "%*s continuation failed...\n",
3709 REPORT_CODE_OFF+PL_regindent*2, "")
3713 if (locinput < PL_regtill) {
3714 DEBUG_r(PerlIO_printf(Perl_debug_log,
3715 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3717 (long)(locinput - PL_reg_starttry),
3718 (long)(PL_regtill - PL_reg_starttry),
3720 sayNO_FINAL; /* Cannot match: too short. */
3722 PL_reginput = locinput; /* put where regtry can find it */
3723 sayYES_FINAL; /* Success! */
3725 PL_reginput = locinput; /* put where regtry can find it */
3726 sayYES_LOUD; /* Success! */
3729 PL_reginput = locinput;
3734 s = HOPBACKc(locinput, scan->flags);
3740 PL_reginput = locinput;
3745 s = HOPBACKc(locinput, scan->flags);
3751 PL_reginput = locinput;
3754 inner = NEXTOPER(NEXTOPER(scan));
3755 if (regmatch(inner) != n) {
3770 if (OP(scan) == SUSPEND) {
3771 locinput = PL_reginput;
3772 nextchr = UCHARAT(locinput);
3777 next = scan + ARG(scan);
3782 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3783 PTR2UV(scan), OP(scan));
3784 Perl_croak(aTHX_ "regexp memory corruption");
3791 * We get here only if there's trouble -- normally "case END" is
3792 * the terminating point.
3794 Perl_croak(aTHX_ "corrupted regexp pointers");
3800 PerlIO_printf(Perl_debug_log,
3801 "%*s %scould match...%s\n",
3802 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3806 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3807 PL_colors[4],PL_colors[5]));
3813 #if 0 /* Breaks $^R */
3821 PerlIO_printf(Perl_debug_log,
3822 "%*s %sfailed...%s\n",
3823 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3829 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3832 case RE_UNWIND_BRANCH:
3833 case RE_UNWIND_BRANCHJ:
3835 re_unwind_branch_t *uwb = &(uw->branch);
3836 I32 lastparen = uwb->lastparen;
3838 REGCP_UNWIND(uwb->lastcp);
3839 for (n = *PL_reglastparen; n > lastparen; n--)
3841 *PL_reglastparen = n;
3842 scan = next = uwb->next;
3844 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3845 ? BRANCH : BRANCHJ) ) { /* Failure */
3852 /* Have more choice yet. Reuse the same uwb. */
3854 if ((n = (uwb->type == RE_UNWIND_BRANCH
3855 ? NEXT_OFF(next) : ARG(next))))
3858 next = NULL; /* XXXX Needn't unwinding in this case... */
3860 next = NEXTOPER(scan);
3861 if (uwb->type == RE_UNWIND_BRANCHJ)
3862 next = NEXTOPER(next);
3863 locinput = uwb->locinput;
3864 nextchr = uwb->nextchr;
3866 PL_regindent = uwb->regindent;
3873 Perl_croak(aTHX_ "regexp unwind memory corruption");
3884 - regrepeat - repeatedly match something simple, report how many
3887 * [This routine now assumes that it will only match on things of length 1.
3888 * That was true before, but now we assume scan - reginput is the count,
3889 * rather than incrementing count on every character. [Er, except utf8.]]
3892 S_regrepeat(pTHX_ regnode *p, I32 max)
3894 register char *scan;
3896 register char *loceol = PL_regeol;
3897 register I32 hardcount = 0;
3898 register bool do_utf8 = PL_reg_match_utf8;
3901 if (max != REG_INFTY && max < loceol - scan)
3902 loceol = scan + max;
3907 while (scan < loceol && hardcount < max && *scan != '\n') {
3908 scan += UTF8SKIP(scan);
3912 while (scan < loceol && *scan != '\n')
3919 while (scan < loceol && hardcount < max) {
3920 scan += UTF8SKIP(scan);
3930 case EXACT: /* length of string is 1 */
3932 while (scan < loceol && UCHARAT(scan) == c)
3935 case EXACTF: /* length of string is 1 */
3937 while (scan < loceol &&
3938 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3941 case EXACTFL: /* length of string is 1 */
3942 PL_reg_flags |= RF_tainted;
3944 while (scan < loceol &&
3945 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3951 while (hardcount < max && scan < loceol &&
3952 reginclass(p, (U8*)scan, do_utf8)) {
3953 scan += UTF8SKIP(scan);
3957 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3964 LOAD_UTF8_CHARCLASS(alnum,"a");
3965 while (hardcount < max && scan < loceol &&
3966 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3967 scan += UTF8SKIP(scan);
3971 while (scan < loceol && isALNUM(*scan))
3976 PL_reg_flags |= RF_tainted;
3979 while (hardcount < max && scan < loceol &&
3980 isALNUM_LC_utf8((U8*)scan)) {
3981 scan += UTF8SKIP(scan);
3985 while (scan < loceol && isALNUM_LC(*scan))
3992 LOAD_UTF8_CHARCLASS(alnum,"a");
3993 while (hardcount < max && scan < loceol &&
3994 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3995 scan += UTF8SKIP(scan);
3999 while (scan < loceol && !isALNUM(*scan))
4004 PL_reg_flags |= RF_tainted;
4007 while (hardcount < max && scan < loceol &&
4008 !isALNUM_LC_utf8((U8*)scan)) {
4009 scan += UTF8SKIP(scan);
4013 while (scan < loceol && !isALNUM_LC(*scan))
4020 LOAD_UTF8_CHARCLASS(space," ");
4021 while (hardcount < max && scan < loceol &&
4023 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4024 scan += UTF8SKIP(scan);
4028 while (scan < loceol && isSPACE(*scan))
4033 PL_reg_flags |= RF_tainted;
4036 while (hardcount < max && scan < loceol &&
4037 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4038 scan += UTF8SKIP(scan);
4042 while (scan < loceol && isSPACE_LC(*scan))
4049 LOAD_UTF8_CHARCLASS(space," ");
4050 while (hardcount < max && scan < loceol &&
4052 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4053 scan += UTF8SKIP(scan);
4057 while (scan < loceol && !isSPACE(*scan))
4062 PL_reg_flags |= RF_tainted;
4065 while (hardcount < max && scan < loceol &&
4066 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4067 scan += UTF8SKIP(scan);
4071 while (scan < loceol && !isSPACE_LC(*scan))
4078 LOAD_UTF8_CHARCLASS(digit,"0");
4079 while (hardcount < max && scan < loceol &&
4080 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4081 scan += UTF8SKIP(scan);
4085 while (scan < loceol && isDIGIT(*scan))
4092 LOAD_UTF8_CHARCLASS(digit,"0");
4093 while (hardcount < max && scan < loceol &&
4094 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4095 scan += UTF8SKIP(scan);
4099 while (scan < loceol && !isDIGIT(*scan))
4103 default: /* Called on something of 0 width. */
4104 break; /* So match right here or not at all. */
4110 c = scan - PL_reginput;
4115 SV *prop = sv_newmortal();
4118 PerlIO_printf(Perl_debug_log,
4119 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4120 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
4127 - regrepeat_hard - repeatedly match something, report total lenth and length
4129 * The repeater is supposed to have constant length.
4133 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4135 register char *scan = Nullch;
4136 register char *start;
4137 register char *loceol = PL_regeol;
4139 I32 count = 0, res = 1;
4144 start = PL_reginput;
4145 if (PL_reg_match_utf8) {
4146 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4149 while (start < PL_reginput) {
4151 start += UTF8SKIP(start);
4162 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4164 *lp = l = PL_reginput - start;
4165 if (max != REG_INFTY && l*max < loceol - scan)
4166 loceol = scan + l*max;
4179 - regclass_swash - prepare the utf8 swash
4183 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4189 if (PL_regdata && PL_regdata->count) {
4192 if (PL_regdata->what[n] == 's') {
4193 SV *rv = (SV*)PL_regdata->data[n];
4194 AV *av = (AV*)SvRV((SV*)rv);
4197 /* See the end of regcomp.c:S_reglass() for
4198 * documentation of these array elements. */
4200 si = *av_fetch(av, 0, FALSE);
4201 a = av_fetch(av, 1, FALSE);
4202 b = av_fetch(av, 2, FALSE);
4206 else if (si && doinit) {
4207 sw = swash_init("utf8", "", si, 1, 0);
4208 (void)av_store(av, 1, sw);
4224 - reginclasslen - determine if a character falls into a character class
4226 The n is the ANYOF regnode, the p is the target string, lenp
4227 is pointer to the maximum length of how far to go in the p
4228 (if the lenp is zero, UTF8SKIP(p) is used),
4229 do_utf8 tells whether the target string is in UTF-8.
4234 S_reginclasslen(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
4236 char flags = ANYOF_FLAGS(n);
4242 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
4244 plen = lenp ? *lenp : UNISKIP(c);
4245 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4248 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4249 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4252 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4256 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4259 if (swash_fetch(sw, p, do_utf8))
4261 else if (flags & ANYOF_FOLD) {
4262 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4265 if (!match && lenp && av) {
4268 for (i = 0; i <= av_len(av); i++) {
4269 SV* sv = *av_fetch(av, i, FALSE);
4271 char *s = SvPV(sv, len);
4273 if (len <= plen && memEQ(s, p, len)) {
4281 to_utf8_fold(p, tmpbuf, &tmplen);
4282 if (swash_fetch(sw, tmpbuf, do_utf8))
4286 to_utf8_upper(p, tmpbuf, &tmplen);
4287 if (swash_fetch(sw, tmpbuf, do_utf8))
4293 if (match && lenp && *lenp == 0)
4296 if (!match && c < 256) {
4297 if (ANYOF_BITMAP_TEST(n, c))
4299 else if (flags & ANYOF_FOLD) {
4302 if (flags & ANYOF_LOCALE) {
4303 PL_reg_flags |= RF_tainted;
4304 f = PL_fold_locale[c];
4308 if (f != c && ANYOF_BITMAP_TEST(n, f))
4312 if (!match && (flags & ANYOF_CLASS)) {
4313 PL_reg_flags |= RF_tainted;
4315 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4316 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4317 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4318 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4319 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4320 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4321 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4322 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4323 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4324 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4325 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4326 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4327 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4328 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4329 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4330 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4331 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4332 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4333 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4334 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4335 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4336 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4337 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4338 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4339 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4340 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4341 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4342 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4343 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4344 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4345 ) /* How's that for a conditional? */
4352 return (flags & ANYOF_INVERT) ? !match : match;
4356 - reginclass - determine if a character falls into a character class
4358 The n is the ANYOF regnode, the p is the target string, do_utf8 tells
4359 whether the target string is in UTF-8.
4364 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
4366 return S_reginclasslen(aTHX_ n, p, 0, do_utf8);
4370 S_reghop(pTHX_ U8 *s, I32 off)
4372 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4376 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4379 while (off-- && s < lim) {
4380 /* XXX could check well-formedness here */
4388 if (UTF8_IS_CONTINUED(*s)) {
4389 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4392 /* XXX could check well-formedness here */
4400 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4402 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4406 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4409 while (off-- && s < lim) {
4410 /* XXX could check well-formedness here */
4420 if (UTF8_IS_CONTINUED(*s)) {
4421 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4424 /* XXX could check well-formedness here */
4436 restore_pos(pTHX_ void *arg)
4438 if (PL_reg_eval_set) {
4439 if (PL_reg_oldsaved) {
4440 PL_reg_re->subbeg = PL_reg_oldsaved;
4441 PL_reg_re->sublen = PL_reg_oldsavedlen;
4442 RX_MATCH_COPIED_on(PL_reg_re);
4444 PL_reg_magic->mg_len = PL_reg_oldpos;
4445 PL_reg_eval_set = 0;
4446 PL_curpm = PL_reg_oldcurpm;