5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
85 #define RF_tainted 1 /* tainted information used? */
86 #define RF_warned 2 /* warned about big count? */
87 #define RF_evaled 4 /* Did an EVAL with setting? */
88 #define RF_utf8 8 /* String contains multibyte chars? */
90 #define UTF (PL_reg_flags & RF_utf8)
92 #define RS_init 1 /* eval environment created */
93 #define RS_set 2 /* replsv value is set */
103 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
106 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
107 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
108 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
109 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
110 #define HOPc(pos,off) ((char*)HOP(pos,off))
111 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
113 #define HOPBACK(pos, off) ( \
114 (UTF && PL_reg_match_utf8) \
115 ? reghopmaybe((U8*)pos, -off) \
116 : (pos - off >= PL_bostr) \
120 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
122 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
123 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
124 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
125 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
126 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
127 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
129 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
131 /* for use after a quantifier and before an EXACT-like node -- japhy */
132 #define JUMPABLE(rn) ( \
133 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
134 OP(rn) == SUSPEND || OP(rn) == IFMATCH \
137 #define NEAR_EXACT(rn) (PL_regkind[(U8)OP(rn)] == EXACT || JUMPABLE(rn))
139 #define NEXT_IMPT(rn) STMT_START { \
140 while (JUMPABLE(rn)) \
141 if (OP(rn) == SUSPEND || OP(rn) == IFMATCH) \
142 rn = NEXTOPER(NEXTOPER(rn)); \
143 else rn += NEXT_OFF(rn); \
146 static void restore_pos(pTHX_ void *arg);
149 S_regcppush(pTHX_ I32 parenfloor)
151 int retval = PL_savestack_ix;
152 #define REGCP_PAREN_ELEMS 4
153 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
156 if (paren_elems_to_push < 0)
157 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
159 #define REGCP_OTHER_ELEMS 6
160 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
161 for (p = PL_regsize; p > parenfloor; p--) {
162 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
163 SSPUSHINT(PL_regendp[p]);
164 SSPUSHINT(PL_regstartp[p]);
165 SSPUSHPTR(PL_reg_start_tmp[p]);
168 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
169 SSPUSHINT(PL_regsize);
170 SSPUSHINT(*PL_reglastparen);
171 SSPUSHINT(*PL_reglastcloseparen);
172 SSPUSHPTR(PL_reginput);
173 #define REGCP_FRAME_ELEMS 2
174 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
175 * are needed for the regexp context stack bookkeeping. */
176 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
177 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
182 /* These are needed since we do not localize EVAL nodes: */
183 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
184 " Setting an EVAL scope, savestack=%"IVdf"\n", \
185 (IV)PL_savestack_ix)); cp = PL_savestack_ix
187 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
188 PerlIO_printf(Perl_debug_log, \
189 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
190 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
200 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
202 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
203 i = SSPOPINT; /* Parentheses elements to pop. */
204 input = (char *) SSPOPPTR;
205 *PL_reglastcloseparen = SSPOPINT;
206 *PL_reglastparen = SSPOPINT;
207 PL_regsize = SSPOPINT;
209 /* Now restore the parentheses context. */
210 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
211 i > 0; i -= REGCP_PAREN_ELEMS) {
212 paren = (U32)SSPOPINT;
213 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
214 PL_regstartp[paren] = SSPOPINT;
216 if (paren <= *PL_reglastparen)
217 PL_regendp[paren] = tmps;
219 PerlIO_printf(Perl_debug_log,
220 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
221 (UV)paren, (IV)PL_regstartp[paren],
222 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
223 (IV)PL_regendp[paren],
224 (paren > *PL_reglastparen ? "(no)" : ""));
228 if (*PL_reglastparen + 1 <= PL_regnpar) {
229 PerlIO_printf(Perl_debug_log,
230 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
231 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
235 /* It would seem that the similar code in regtry()
236 * already takes care of this, and in fact it is in
237 * a better location to since this code can #if 0-ed out
238 * but the code in regtry() is needed or otherwise tests
239 * requiring null fields (pat.t#187 and split.t#{13,14}
240 * (as of patchlevel 7877) will fail. Then again,
241 * this code seems to be necessary or otherwise
242 * building DynaLoader will fail:
243 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
245 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
246 if (paren > PL_regsize)
247 PL_regstartp[paren] = -1;
248 PL_regendp[paren] = -1;
255 S_regcp_set_to(pTHX_ I32 ss)
257 I32 tmp = PL_savestack_ix;
259 PL_savestack_ix = ss;
261 PL_savestack_ix = tmp;
265 typedef struct re_cc_state
269 struct re_cc_state *prev;
274 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
276 #define TRYPAREN(paren, n, input) { \
279 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
280 PL_regendp[paren] = input - PL_bostr; \
283 PL_regendp[paren] = -1; \
285 if (regmatch(next)) \
288 PL_regendp[paren] = -1; \
293 * pregexec and friends
297 - pregexec - match a regexp against a string
300 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
301 char *strbeg, I32 minend, SV *screamer, U32 nosave)
302 /* strend: pointer to null at end of string */
303 /* strbeg: real beginning of string */
304 /* minend: end of match must be >=minend after stringarg. */
305 /* nosave: For optimizations. */
308 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
309 nosave ? 0 : REXEC_COPY_STR);
313 S_cache_re(pTHX_ regexp *prog)
315 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
317 PL_regprogram = prog->program;
319 PL_regnpar = prog->nparens;
320 PL_regdata = prog->data;
325 * Need to implement the following flags for reg_anch:
327 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
329 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
330 * INTUIT_AUTORITATIVE_ML
331 * INTUIT_ONCE_NOML - Intuit can match in one location only.
334 * Another flag for this function: SECOND_TIME (so that float substrs
335 * with giant delta may be not rechecked).
338 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
340 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
341 Otherwise, only SvCUR(sv) is used to get strbeg. */
343 /* XXXX We assume that strpos is strbeg unless sv. */
345 /* XXXX Some places assume that there is a fixed substring.
346 An update may be needed if optimizer marks as "INTUITable"
347 RExen without fixed substrings. Similarly, it is assumed that
348 lengths of all the strings are no more than minlen, thus they
349 cannot come from lookahead.
350 (Or minlen should take into account lookahead.) */
352 /* A failure to find a constant substring means that there is no need to make
353 an expensive call to REx engine, thus we celebrate a failure. Similarly,
354 finding a substring too deep into the string means that less calls to
355 regtry() should be needed.
357 REx compiler's optimizer found 4 possible hints:
358 a) Anchored substring;
360 c) Whether we are anchored (beginning-of-line or \G);
361 d) First node (of those at offset 0) which may distingush positions;
362 We use a)b)d) and multiline-part of c), and try to find a position in the
363 string which does not contradict any of them.
366 /* Most of decisions we do here should have been done at compile time.
367 The nodes of the REx which we used for the search should have been
368 deleted from the finite automaton. */
371 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
372 char *strend, U32 flags, re_scream_pos_data *data)
374 register I32 start_shift = 0;
375 /* Should be nonnegative! */
376 register I32 end_shift = 0;
382 register char *other_last = Nullch; /* other substr checked before this */
383 char *check_at = Nullch; /* check substr found at this pos */
385 char *i_strpos = strpos;
388 DEBUG_r( if (!PL_colorset) reginitcolors() );
389 DEBUG_r(PerlIO_printf(Perl_debug_log,
390 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
391 PL_colors[4],PL_colors[5],PL_colors[0],
394 (strlen(prog->precomp) > 60 ? "..." : ""),
396 (int)(strend - strpos > 60 ? 60 : strend - strpos),
397 strpos, PL_colors[1],
398 (strend - strpos > 60 ? "..." : ""))
401 if (prog->reganch & ROPT_UTF8)
402 PL_reg_flags |= RF_utf8;
404 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
405 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
408 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
410 check = prog->check_substr;
411 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
412 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
413 || ( (prog->reganch & ROPT_ANCH_BOL)
414 && !PL_multiline ) ); /* Check after \n? */
417 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
418 | ROPT_IMPLICIT)) /* not a real BOL */
419 /* SvCUR is not set on references: SvRV and SvPVX overlap */
421 && (strpos != strbeg)) {
422 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
425 if (prog->check_offset_min == prog->check_offset_max &&
426 !(prog->reganch & ROPT_CANY_SEEN)) {
427 /* Substring at constant offset from beg-of-str... */
430 s = HOP3c(strpos, prog->check_offset_min, strend);
432 slen = SvCUR(check); /* >= 1 */
434 if ( strend - s > slen || strend - s < slen - 1
435 || (strend - s == slen && strend[-1] != '\n')) {
436 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
439 /* Now should match s[0..slen-2] */
441 if (slen && (*SvPVX(check) != *s
443 && memNE(SvPVX(check), s, slen)))) {
445 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
449 else if (*SvPVX(check) != *s
450 || ((slen = SvCUR(check)) > 1
451 && memNE(SvPVX(check), s, slen)))
453 goto success_at_start;
456 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
458 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
459 end_shift = prog->minlen - start_shift -
460 CHR_SVLEN(check) + (SvTAIL(check) != 0);
462 I32 end = prog->check_offset_max + CHR_SVLEN(check)
463 - (SvTAIL(check) != 0);
464 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
466 if (end_shift < eshift)
470 else { /* Can match at random position */
473 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
474 /* Should be nonnegative! */
475 end_shift = prog->minlen - start_shift -
476 CHR_SVLEN(check) + (SvTAIL(check) != 0);
479 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
481 Perl_croak(aTHX_ "panic: end_shift");
485 /* Find a possible match in the region s..strend by looking for
486 the "check" substring in the region corrected by start/end_shift. */
487 if (flags & REXEC_SCREAM) {
488 I32 p = -1; /* Internal iterator of scream. */
489 I32 *pp = data ? data->scream_pos : &p;
491 if (PL_screamfirst[BmRARE(check)] >= 0
492 || ( BmRARE(check) == '\n'
493 && (BmPREVIOUS(check) == SvCUR(check) - 1)
495 s = screaminstr(sv, check,
496 start_shift + (s - strbeg), end_shift, pp, 0);
500 *data->scream_olds = s;
502 else if (prog->reganch & ROPT_CANY_SEEN)
503 s = fbm_instr((U8*)(s + start_shift),
504 (U8*)(strend - end_shift),
505 check, PL_multiline ? FBMrf_MULTILINE : 0);
507 s = fbm_instr(HOP3(s, start_shift, strend),
508 HOP3(strend, -end_shift, strbeg),
509 check, PL_multiline ? FBMrf_MULTILINE : 0);
511 /* Update the count-of-usability, remove useless subpatterns,
514 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
515 (s ? "Found" : "Did not find"),
516 ((check == prog->anchored_substr) ? "anchored" : "floating"),
518 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
520 PL_colors[1], (SvTAIL(check) ? "$" : ""),
521 (s ? " at offset " : "...\n") ) );
528 /* Finish the diagnostic message */
529 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
531 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
532 Start with the other substr.
533 XXXX no SCREAM optimization yet - and a very coarse implementation
534 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
535 *always* match. Probably should be marked during compile...
536 Probably it is right to do no SCREAM here...
539 if (prog->float_substr && prog->anchored_substr) {
540 /* Take into account the "other" substring. */
541 /* XXXX May be hopelessly wrong for UTF... */
544 if (check == prog->float_substr) {
547 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
550 t = s - prog->check_offset_max;
551 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
552 && (!(prog->reganch & ROPT_UTF8)
553 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
558 t = HOP3c(t, prog->anchored_offset, strend);
559 if (t < other_last) /* These positions already checked */
561 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
564 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
565 /* On end-of-str: see comment below. */
566 s = fbm_instr((unsigned char*)t,
567 HOP3(HOP3(last1, prog->anchored_offset, strend)
568 + SvCUR(prog->anchored_substr),
569 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
570 prog->anchored_substr,
571 PL_multiline ? FBMrf_MULTILINE : 0);
572 DEBUG_r(PerlIO_printf(Perl_debug_log,
573 "%s anchored substr `%s%.*s%s'%s",
574 (s ? "Found" : "Contradicts"),
576 (int)(SvCUR(prog->anchored_substr)
577 - (SvTAIL(prog->anchored_substr)!=0)),
578 SvPVX(prog->anchored_substr),
579 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
581 if (last1 >= last2) {
582 DEBUG_r(PerlIO_printf(Perl_debug_log,
583 ", giving up...\n"));
586 DEBUG_r(PerlIO_printf(Perl_debug_log,
587 ", trying floating at offset %ld...\n",
588 (long)(HOP3c(s1, 1, strend) - i_strpos)));
589 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
590 s = HOP3c(last, 1, strend);
594 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
595 (long)(s - i_strpos)));
596 t = HOP3c(s, -prog->anchored_offset, strbeg);
597 other_last = HOP3c(s, 1, strend);
605 else { /* Take into account the floating substring. */
609 t = HOP3c(s, -start_shift, strbeg);
611 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
612 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
613 last = HOP3c(t, prog->float_max_offset, strend);
614 s = HOP3c(t, prog->float_min_offset, strend);
617 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
618 /* fbm_instr() takes into account exact value of end-of-str
619 if the check is SvTAIL(ed). Since false positives are OK,
620 and end-of-str is not later than strend we are OK. */
621 s = fbm_instr((unsigned char*)s,
622 (unsigned char*)last + SvCUR(prog->float_substr)
623 - (SvTAIL(prog->float_substr)!=0),
624 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
625 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
626 (s ? "Found" : "Contradicts"),
628 (int)(SvCUR(prog->float_substr)
629 - (SvTAIL(prog->float_substr)!=0)),
630 SvPVX(prog->float_substr),
631 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
634 DEBUG_r(PerlIO_printf(Perl_debug_log,
635 ", giving up...\n"));
638 DEBUG_r(PerlIO_printf(Perl_debug_log,
639 ", trying anchored starting at offset %ld...\n",
640 (long)(s1 + 1 - i_strpos)));
642 s = HOP3c(t, 1, strend);
646 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
647 (long)(s - i_strpos)));
648 other_last = s; /* Fix this later. --Hugo */
657 t = s - prog->check_offset_max;
658 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
659 && (!(prog->reganch & ROPT_UTF8)
660 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
662 /* Fixed substring is found far enough so that the match
663 cannot start at strpos. */
665 if (ml_anch && t[-1] != '\n') {
666 /* Eventually fbm_*() should handle this, but often
667 anchored_offset is not 0, so this check will not be wasted. */
668 /* XXXX In the code below we prefer to look for "^" even in
669 presence of anchored substrings. And we search even
670 beyond the found float position. These pessimizations
671 are historical artefacts only. */
673 while (t < strend - prog->minlen) {
675 if (t < check_at - prog->check_offset_min) {
676 if (prog->anchored_substr) {
677 /* Since we moved from the found position,
678 we definitely contradict the found anchored
679 substr. Due to the above check we do not
680 contradict "check" substr.
681 Thus we can arrive here only if check substr
682 is float. Redo checking for "other"=="fixed".
685 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
686 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
687 goto do_other_anchored;
689 /* We don't contradict the found floating substring. */
690 /* XXXX Why not check for STCLASS? */
692 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
693 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
696 /* Position contradicts check-string */
697 /* XXXX probably better to look for check-string
698 than for "\n", so one should lower the limit for t? */
699 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
700 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
701 other_last = strpos = s = t + 1;
706 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
707 PL_colors[0],PL_colors[1]));
711 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
712 PL_colors[0],PL_colors[1]));
716 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
719 /* The found string does not prohibit matching at strpos,
720 - no optimization of calling REx engine can be performed,
721 unless it was an MBOL and we are not after MBOL,
722 or a future STCLASS check will fail this. */
724 /* Even in this situation we may use MBOL flag if strpos is offset
725 wrt the start of the string. */
726 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
727 && (strpos != strbeg) && strpos[-1] != '\n'
728 /* May be due to an implicit anchor of m{.*foo} */
729 && !(prog->reganch & ROPT_IMPLICIT))
734 DEBUG_r( if (ml_anch)
735 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
736 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
739 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
740 && prog->check_substr /* Could be deleted already */
741 && --BmUSEFUL(prog->check_substr) < 0
742 && prog->check_substr == prog->float_substr)
744 /* If flags & SOMETHING - do not do it many times on the same match */
745 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
746 SvREFCNT_dec(prog->check_substr);
747 prog->check_substr = Nullsv; /* disable */
748 prog->float_substr = Nullsv; /* clear */
749 check = Nullsv; /* abort */
751 /* XXXX This is a remnant of the old implementation. It
752 looks wasteful, since now INTUIT can use many
754 prog->reganch &= ~RE_USE_INTUIT;
761 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
762 if (prog->regstclass) {
763 /* minlen == 0 is possible if regstclass is \b or \B,
764 and the fixed substr is ''$.
765 Since minlen is already taken into account, s+1 is before strend;
766 accidentally, minlen >= 1 guaranties no false positives at s + 1
767 even for \b or \B. But (minlen? 1 : 0) below assumes that
768 regstclass does not come from lookahead... */
769 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
770 This leaves EXACTF only, which is dealt with in find_byclass(). */
771 U8* str = (U8*)STRING(prog->regstclass);
772 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
773 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
775 char *endpos = (prog->anchored_substr || ml_anch)
776 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
777 : (prog->float_substr
778 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
781 char *startpos = strbeg;
784 if (prog->reganch & ROPT_UTF8) {
785 PL_regdata = prog->data;
788 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
793 if (endpos == strend) {
794 DEBUG_r( PerlIO_printf(Perl_debug_log,
795 "Could not match STCLASS...\n") );
798 DEBUG_r( PerlIO_printf(Perl_debug_log,
799 "This position contradicts STCLASS...\n") );
800 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
802 /* Contradict one of substrings */
803 if (prog->anchored_substr) {
804 if (prog->anchored_substr == check) {
805 DEBUG_r( what = "anchored" );
807 s = HOP3c(t, 1, strend);
808 if (s + start_shift + end_shift > strend) {
809 /* XXXX Should be taken into account earlier? */
810 DEBUG_r( PerlIO_printf(Perl_debug_log,
811 "Could not match STCLASS...\n") );
816 DEBUG_r( PerlIO_printf(Perl_debug_log,
817 "Looking for %s substr starting at offset %ld...\n",
818 what, (long)(s + start_shift - i_strpos)) );
821 /* Have both, check_string is floating */
822 if (t + start_shift >= check_at) /* Contradicts floating=check */
823 goto retry_floating_check;
824 /* Recheck anchored substring, but not floating... */
828 DEBUG_r( PerlIO_printf(Perl_debug_log,
829 "Looking for anchored substr starting at offset %ld...\n",
830 (long)(other_last - i_strpos)) );
831 goto do_other_anchored;
833 /* Another way we could have checked stclass at the
834 current position only: */
839 DEBUG_r( PerlIO_printf(Perl_debug_log,
840 "Looking for /%s^%s/m starting at offset %ld...\n",
841 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
844 if (!prog->float_substr) /* Could have been deleted */
846 /* Check is floating subtring. */
847 retry_floating_check:
848 t = check_at - start_shift;
849 DEBUG_r( what = "floating" );
850 goto hop_and_restart;
853 DEBUG_r(PerlIO_printf(Perl_debug_log,
854 "By STCLASS: moving %ld --> %ld\n",
855 (long)(t - i_strpos), (long)(s - i_strpos))
859 DEBUG_r(PerlIO_printf(Perl_debug_log,
860 "Does not contradict STCLASS...\n");
865 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
866 PL_colors[4], (check ? "Guessed" : "Giving up"),
867 PL_colors[5], (long)(s - i_strpos)) );
870 fail_finish: /* Substring not found */
871 if (prog->check_substr) /* could be removed already */
872 BmUSEFUL(prog->check_substr) += 5; /* hooray */
874 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
875 PL_colors[4],PL_colors[5]));
879 /* We know what class REx starts with. Try to find this position... */
881 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
883 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
889 register I32 tmp = 1; /* Scratch variable? */
890 register bool do_utf8 = PL_reg_match_utf8;
892 /* We know what class it must start with. */
896 if (reginclass(c, (U8*)s, do_utf8)) {
897 if (tmp && (norun || regtry(prog, s)))
904 s += do_utf8 ? UTF8SKIP(s) : 1;
909 if (tmp && (norun || regtry(prog, s)))
921 U8 tmpbuf1[UTF8_MAXLEN*2+1];
922 U8 tmpbuf2[UTF8_MAXLEN*2+1];
924 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
925 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
927 c1 = utf8_to_uvuni(tmpbuf1, 0);
928 c2 = utf8_to_uvuni(tmpbuf2, 0);
939 c2 = PL_fold_locale[c1];
944 e = s; /* Due to minlen logic of intuit() */
950 if ( utf8_to_uvchr((U8*)s, &len) == c1
957 UV c = utf8_to_uvchr((U8*)s, &len);
958 if ( (c == c1 || c == c2) && regtry(prog, s) )
967 && (ln == 1 || !(OP(c) == EXACTF
969 : ibcmp_locale(s, m, ln)))
970 && (norun || regtry(prog, s)) )
976 if ( (*(U8*)s == c1 || *(U8*)s == c2)
977 && (ln == 1 || !(OP(c) == EXACTF
979 : ibcmp_locale(s, m, ln)))
980 && (norun || regtry(prog, s)) )
987 PL_reg_flags |= RF_tainted;
994 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
997 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
999 tmp = ((OP(c) == BOUND ?
1000 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1001 LOAD_UTF8_CHARCLASS(alnum,"a");
1002 while (s < strend) {
1003 if (tmp == !(OP(c) == BOUND ?
1004 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1005 isALNUM_LC_utf8((U8*)s)))
1008 if ((norun || regtry(prog, s)))
1015 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1016 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1017 while (s < strend) {
1019 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1021 if ((norun || regtry(prog, s)))
1027 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1031 PL_reg_flags |= RF_tainted;
1038 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1041 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1043 tmp = ((OP(c) == NBOUND ?
1044 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1045 LOAD_UTF8_CHARCLASS(alnum,"a");
1046 while (s < strend) {
1047 if (tmp == !(OP(c) == NBOUND ?
1048 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1049 isALNUM_LC_utf8((U8*)s)))
1051 else if ((norun || regtry(prog, s)))
1057 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1058 tmp = ((OP(c) == NBOUND ?
1059 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1060 while (s < strend) {
1062 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1064 else if ((norun || regtry(prog, s)))
1069 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1074 LOAD_UTF8_CHARCLASS(alnum,"a");
1075 while (s < strend) {
1076 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1077 if (tmp && (norun || regtry(prog, s)))
1088 while (s < strend) {
1090 if (tmp && (norun || regtry(prog, s)))
1102 PL_reg_flags |= RF_tainted;
1104 while (s < strend) {
1105 if (isALNUM_LC_utf8((U8*)s)) {
1106 if (tmp && (norun || regtry(prog, s)))
1117 while (s < strend) {
1118 if (isALNUM_LC(*s)) {
1119 if (tmp && (norun || regtry(prog, s)))
1132 LOAD_UTF8_CHARCLASS(alnum,"a");
1133 while (s < strend) {
1134 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1135 if (tmp && (norun || regtry(prog, s)))
1146 while (s < strend) {
1148 if (tmp && (norun || regtry(prog, s)))
1160 PL_reg_flags |= RF_tainted;
1162 while (s < strend) {
1163 if (!isALNUM_LC_utf8((U8*)s)) {
1164 if (tmp && (norun || regtry(prog, s)))
1175 while (s < strend) {
1176 if (!isALNUM_LC(*s)) {
1177 if (tmp && (norun || regtry(prog, s)))
1190 LOAD_UTF8_CHARCLASS(space," ");
1191 while (s < strend) {
1192 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1193 if (tmp && (norun || regtry(prog, s)))
1204 while (s < strend) {
1206 if (tmp && (norun || regtry(prog, s)))
1218 PL_reg_flags |= RF_tainted;
1220 while (s < strend) {
1221 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1222 if (tmp && (norun || regtry(prog, s)))
1233 while (s < strend) {
1234 if (isSPACE_LC(*s)) {
1235 if (tmp && (norun || regtry(prog, s)))
1248 LOAD_UTF8_CHARCLASS(space," ");
1249 while (s < strend) {
1250 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1251 if (tmp && (norun || regtry(prog, s)))
1262 while (s < strend) {
1264 if (tmp && (norun || regtry(prog, s)))
1276 PL_reg_flags |= RF_tainted;
1278 while (s < strend) {
1279 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1280 if (tmp && (norun || regtry(prog, s)))
1291 while (s < strend) {
1292 if (!isSPACE_LC(*s)) {
1293 if (tmp && (norun || regtry(prog, s)))
1306 LOAD_UTF8_CHARCLASS(digit,"0");
1307 while (s < strend) {
1308 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1309 if (tmp && (norun || regtry(prog, s)))
1320 while (s < strend) {
1322 if (tmp && (norun || regtry(prog, s)))
1334 PL_reg_flags |= RF_tainted;
1336 while (s < strend) {
1337 if (isDIGIT_LC_utf8((U8*)s)) {
1338 if (tmp && (norun || regtry(prog, s)))
1349 while (s < strend) {
1350 if (isDIGIT_LC(*s)) {
1351 if (tmp && (norun || regtry(prog, s)))
1364 LOAD_UTF8_CHARCLASS(digit,"0");
1365 while (s < strend) {
1366 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1367 if (tmp && (norun || regtry(prog, s)))
1378 while (s < strend) {
1380 if (tmp && (norun || regtry(prog, s)))
1392 PL_reg_flags |= RF_tainted;
1394 while (s < strend) {
1395 if (!isDIGIT_LC_utf8((U8*)s)) {
1396 if (tmp && (norun || regtry(prog, s)))
1407 while (s < strend) {
1408 if (!isDIGIT_LC(*s)) {
1409 if (tmp && (norun || regtry(prog, s)))
1421 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1430 - regexec_flags - match a regexp against a string
1433 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1434 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1435 /* strend: pointer to null at end of string */
1436 /* strbeg: real beginning of string */
1437 /* minend: end of match must be >=minend after stringarg. */
1438 /* data: May be used for some additional optimizations. */
1439 /* nosave: For optimizations. */
1442 register regnode *c;
1443 register char *startpos = stringarg;
1444 I32 minlen; /* must match at least this many chars */
1445 I32 dontbother = 0; /* how many characters not to try at end */
1446 /* I32 start_shift = 0; */ /* Offset of the start to find
1447 constant substr. */ /* CC */
1448 I32 end_shift = 0; /* Same for the end. */ /* CC */
1449 I32 scream_pos = -1; /* Internal iterator of scream. */
1451 SV* oreplsv = GvSV(PL_replgv);
1452 bool do_utf8 = DO_UTF8(sv);
1458 PL_regnarrate = DEBUG_r_TEST;
1461 /* Be paranoid... */
1462 if (prog == NULL || startpos == NULL) {
1463 Perl_croak(aTHX_ "NULL regexp parameter");
1467 minlen = prog->minlen;
1468 if (do_utf8 && !(prog->reganch & ROPT_CANY_SEEN)) {
1469 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1472 if (strend - startpos < minlen) goto phooey;
1475 /* Check validity of program. */
1476 if (UCHARAT(prog->program) != REG_MAGIC) {
1477 Perl_croak(aTHX_ "corrupted regexp program");
1481 PL_reg_eval_set = 0;
1484 if (prog->reganch & ROPT_UTF8)
1485 PL_reg_flags |= RF_utf8;
1487 /* Mark beginning of line for ^ and lookbehind. */
1488 PL_regbol = startpos;
1492 /* Mark end of line for $ (and such) */
1495 /* see how far we have to get to not match where we matched before */
1496 PL_regtill = startpos+minend;
1498 /* We start without call_cc context. */
1501 /* If there is a "must appear" string, look for it. */
1504 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1507 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1508 PL_reg_ganch = startpos;
1509 else if (sv && SvTYPE(sv) >= SVt_PVMG
1511 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1512 && mg->mg_len >= 0) {
1513 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1514 if (prog->reganch & ROPT_ANCH_GPOS) {
1515 if (s > PL_reg_ganch)
1520 else /* pos() not defined */
1521 PL_reg_ganch = strbeg;
1524 if (do_utf8 == (UTF!=0) &&
1525 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1526 re_scream_pos_data d;
1528 d.scream_olds = &scream_olds;
1529 d.scream_pos = &scream_pos;
1530 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1532 goto phooey; /* not present */
1535 DEBUG_r( if (!PL_colorset) reginitcolors() );
1536 DEBUG_r(PerlIO_printf(Perl_debug_log,
1537 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1538 PL_colors[4],PL_colors[5],PL_colors[0],
1541 (strlen(prog->precomp) > 60 ? "..." : ""),
1543 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1544 startpos, PL_colors[1],
1545 (strend - startpos > 60 ? "..." : ""))
1548 /* Simplest case: anchored match need be tried only once. */
1549 /* [unless only anchor is BOL and multiline is set] */
1550 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1551 if (s == startpos && regtry(prog, startpos))
1553 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1554 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1559 dontbother = minlen - 1;
1560 end = HOP3c(strend, -dontbother, strbeg) - 1;
1561 /* for multiline we only have to try after newlines */
1562 if (prog->check_substr) {
1566 if (regtry(prog, s))
1571 if (prog->reganch & RE_USE_INTUIT) {
1572 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1583 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1584 if (regtry(prog, s))
1591 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1592 if (regtry(prog, PL_reg_ganch))
1597 /* Messy cases: unanchored match. */
1598 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1599 /* we have /x+whatever/ */
1600 /* it must be a one character string (XXXX Except UTF?) */
1601 char ch = SvPVX(prog->anchored_substr)[0];
1607 while (s < strend) {
1609 DEBUG_r( did_match = 1 );
1610 if (regtry(prog, s)) goto got_it;
1612 while (s < strend && *s == ch)
1619 while (s < strend) {
1621 DEBUG_r( did_match = 1 );
1622 if (regtry(prog, s)) goto got_it;
1624 while (s < strend && *s == ch)
1630 DEBUG_r(if (!did_match)
1631 PerlIO_printf(Perl_debug_log,
1632 "Did not find anchored character...\n")
1636 else if (do_utf8 == (UTF!=0) &&
1637 (prog->anchored_substr != Nullsv
1638 || (prog->float_substr != Nullsv
1639 && prog->float_max_offset < strend - s))) {
1640 SV *must = prog->anchored_substr
1641 ? prog->anchored_substr : prog->float_substr;
1643 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1645 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1646 char *last = HOP3c(strend, /* Cannot start after this */
1647 -(I32)(CHR_SVLEN(must)
1648 - (SvTAIL(must) != 0) + back_min), strbeg);
1649 char *last1; /* Last position checked before */
1655 last1 = HOPc(s, -1);
1657 last1 = s - 1; /* bogus */
1659 /* XXXX check_substr already used to find `s', can optimize if
1660 check_substr==must. */
1662 dontbother = end_shift;
1663 strend = HOPc(strend, -dontbother);
1664 while ( (s <= last) &&
1665 ((flags & REXEC_SCREAM)
1666 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1667 end_shift, &scream_pos, 0))
1668 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1669 (unsigned char*)strend, must,
1670 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1671 DEBUG_r( did_match = 1 );
1672 if (HOPc(s, -back_max) > last1) {
1673 last1 = HOPc(s, -back_min);
1674 s = HOPc(s, -back_max);
1677 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1679 last1 = HOPc(s, -back_min);
1683 while (s <= last1) {
1684 if (regtry(prog, s))
1690 while (s <= last1) {
1691 if (regtry(prog, s))
1697 DEBUG_r(if (!did_match)
1698 PerlIO_printf(Perl_debug_log,
1699 "Did not find %s substr `%s%.*s%s'%s...\n",
1700 ((must == prog->anchored_substr)
1701 ? "anchored" : "floating"),
1703 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1705 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1709 else if ((c = prog->regstclass)) {
1710 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1711 /* don't bother with what can't match */
1712 strend = HOPc(strend, -(minlen - 1));
1714 SV *prop = sv_newmortal();
1716 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1718 if (find_byclass(prog, c, s, strend, startpos, 0))
1720 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1724 if (prog->float_substr != Nullsv) { /* Trim the end. */
1727 if (flags & REXEC_SCREAM) {
1728 last = screaminstr(sv, prog->float_substr, s - strbeg,
1729 end_shift, &scream_pos, 1); /* last one */
1731 last = scream_olds; /* Only one occurrence. */
1735 char *little = SvPV(prog->float_substr, len);
1737 if (SvTAIL(prog->float_substr)) {
1738 if (memEQ(strend - len + 1, little, len - 1))
1739 last = strend - len + 1;
1740 else if (!PL_multiline)
1741 last = memEQ(strend - len, little, len)
1742 ? strend - len : Nullch;
1748 last = rninstr(s, strend, little, little + len);
1750 last = strend; /* matching `$' */
1754 DEBUG_r(PerlIO_printf(Perl_debug_log,
1755 "%sCan't trim the tail, match fails (should not happen)%s\n",
1756 PL_colors[4],PL_colors[5]));
1757 goto phooey; /* Should not happen! */
1759 dontbother = strend - last + prog->float_min_offset;
1761 if (minlen && (dontbother < minlen))
1762 dontbother = minlen - 1;
1763 strend -= dontbother; /* this one's always in bytes! */
1764 /* We don't know much -- general case. */
1767 if (regtry(prog, s))
1776 if (regtry(prog, s))
1778 } while (s++ < strend);
1786 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1788 if (PL_reg_eval_set) {
1789 /* Preserve the current value of $^R */
1790 if (oreplsv != GvSV(PL_replgv))
1791 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1792 restored, the value remains
1794 restore_pos(aTHX_ 0);
1797 /* make sure $`, $&, $', and $digit will work later */
1798 if ( !(flags & REXEC_NOT_FIRST) ) {
1799 if (RX_MATCH_COPIED(prog)) {
1800 Safefree(prog->subbeg);
1801 RX_MATCH_COPIED_off(prog);
1803 if (flags & REXEC_COPY_STR) {
1804 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1806 s = savepvn(strbeg, i);
1809 RX_MATCH_COPIED_on(prog);
1812 prog->subbeg = strbeg;
1813 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1820 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1821 PL_colors[4],PL_colors[5]));
1822 if (PL_reg_eval_set)
1823 restore_pos(aTHX_ 0);
1828 - regtry - try match at specific point
1830 STATIC I32 /* 0 failure, 1 success */
1831 S_regtry(pTHX_ regexp *prog, char *startpos)
1839 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1841 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1844 PL_reg_eval_set = RS_init;
1846 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1847 (IV)(PL_stack_sp - PL_stack_base));
1849 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1850 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1851 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1853 /* Apparently this is not needed, judging by wantarray. */
1854 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1855 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1858 /* Make $_ available to executed code. */
1859 if (PL_reg_sv != DEFSV) {
1860 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
1865 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1866 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1867 /* prepare for quick setting of pos */
1868 sv_magic(PL_reg_sv, (SV*)0,
1869 PERL_MAGIC_regex_global, Nullch, 0);
1870 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
1874 PL_reg_oldpos = mg->mg_len;
1875 SAVEDESTRUCTOR_X(restore_pos, 0);
1877 if (!PL_reg_curpm) {
1878 Newz(22,PL_reg_curpm, 1, PMOP);
1881 SV* repointer = newSViv(0);
1882 /* so we know which PL_regex_padav element is PL_reg_curpm */
1883 SvFLAGS(repointer) |= SVf_BREAK;
1884 av_push(PL_regex_padav,repointer);
1885 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1886 PL_regex_pad = AvARRAY(PL_regex_padav);
1890 PM_SETRE(PL_reg_curpm, prog);
1891 PL_reg_oldcurpm = PL_curpm;
1892 PL_curpm = PL_reg_curpm;
1893 if (RX_MATCH_COPIED(prog)) {
1894 /* Here is a serious problem: we cannot rewrite subbeg,
1895 since it may be needed if this match fails. Thus
1896 $` inside (?{}) could fail... */
1897 PL_reg_oldsaved = prog->subbeg;
1898 PL_reg_oldsavedlen = prog->sublen;
1899 RX_MATCH_COPIED_off(prog);
1902 PL_reg_oldsaved = Nullch;
1903 prog->subbeg = PL_bostr;
1904 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1906 prog->startp[0] = startpos - PL_bostr;
1907 PL_reginput = startpos;
1908 PL_regstartp = prog->startp;
1909 PL_regendp = prog->endp;
1910 PL_reglastparen = &prog->lastparen;
1911 PL_reglastcloseparen = &prog->lastcloseparen;
1912 prog->lastparen = 0;
1914 DEBUG_r(PL_reg_starttry = startpos);
1915 if (PL_reg_start_tmpl <= prog->nparens) {
1916 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1917 if(PL_reg_start_tmp)
1918 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1920 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1923 /* XXXX What this code is doing here?!!! There should be no need
1924 to do this again and again, PL_reglastparen should take care of
1927 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1928 * Actually, the code in regcppop() (which Ilya may be meaning by
1929 * PL_reglastparen), is not needed at all by the test suite
1930 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1931 * enough, for building DynaLoader, or otherwise this
1932 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1933 * will happen. Meanwhile, this code *is* needed for the
1934 * above-mentioned test suite tests to succeed. The common theme
1935 * on those tests seems to be returning null fields from matches.
1940 if (prog->nparens) {
1941 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1948 if (regmatch(prog->program + 1)) {
1949 prog->endp[0] = PL_reginput - PL_bostr;
1952 REGCP_UNWIND(lastcp);
1956 #define RE_UNWIND_BRANCH 1
1957 #define RE_UNWIND_BRANCHJ 2
1961 typedef struct { /* XX: makes sense to enlarge it... */
1965 } re_unwind_generic_t;
1978 } re_unwind_branch_t;
1980 typedef union re_unwind_t {
1982 re_unwind_generic_t generic;
1983 re_unwind_branch_t branch;
1986 #define sayYES goto yes
1987 #define sayNO goto no
1988 #define sayYES_FINAL goto yes_final
1989 #define sayYES_LOUD goto yes_loud
1990 #define sayNO_FINAL goto no_final
1991 #define sayNO_SILENT goto do_no
1992 #define saySAME(x) if (x) goto yes; else goto no
1994 #define REPORT_CODE_OFF 24
1997 - regmatch - main matching routine
1999 * Conceptually the strategy is simple: check to see whether the current
2000 * node matches, call self recursively to see whether the rest matches,
2001 * and then act accordingly. In practice we make some effort to avoid
2002 * recursion, in particular by going through "ordinary" nodes (that don't
2003 * need to know whether the rest of the match failed) by a loop instead of
2006 /* [lwall] I've hoisted the register declarations to the outer block in order to
2007 * maybe save a little bit of pushing and popping on the stack. It also takes
2008 * advantage of machines that use a register save mask on subroutine entry.
2010 STATIC I32 /* 0 failure, 1 success */
2011 S_regmatch(pTHX_ regnode *prog)
2013 register regnode *scan; /* Current node. */
2014 regnode *next; /* Next node. */
2015 regnode *inner; /* Next node in internal branch. */
2016 register I32 nextchr; /* renamed nextchr - nextchar colides with
2017 function of same name */
2018 register I32 n; /* no or next */
2019 register I32 ln = 0; /* len or last */
2020 register char *s = Nullch; /* operand or save */
2021 register char *locinput = PL_reginput;
2022 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2023 int minmod = 0, sw = 0, logical = 0;
2026 I32 firstcp = PL_savestack_ix;
2028 register bool do_utf8 = PL_reg_match_utf8;
2034 /* Note that nextchr is a byte even in UTF */
2035 nextchr = UCHARAT(locinput);
2037 while (scan != NULL) {
2040 SV *prop = sv_newmortal();
2041 int docolor = *PL_colors[0];
2042 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2043 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2044 /* The part of the string before starttry has one color
2045 (pref0_len chars), between starttry and current
2046 position another one (pref_len - pref0_len chars),
2047 after the current position the third one.
2048 We assume that pref0_len <= pref_len, otherwise we
2049 decrease pref0_len. */
2050 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2051 ? (5 + taill) - l : locinput - PL_bostr;
2054 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2056 pref0_len = pref_len - (locinput - PL_reg_starttry);
2057 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2058 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2059 ? (5 + taill) - pref_len : PL_regeol - locinput);
2060 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2064 if (pref0_len > pref_len)
2065 pref0_len = pref_len;
2066 regprop(prop, scan);
2067 PerlIO_printf(Perl_debug_log,
2068 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2069 (IV)(locinput - PL_bostr),
2070 PL_colors[4], pref0_len,
2071 locinput - pref_len, PL_colors[5],
2072 PL_colors[2], pref_len - pref0_len,
2073 locinput - pref_len + pref0_len, PL_colors[3],
2074 (docolor ? "" : "> <"),
2075 PL_colors[0], l, locinput, PL_colors[1],
2076 15 - l - pref_len + 1,
2078 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2082 next = scan + NEXT_OFF(scan);
2088 if (locinput == PL_bostr || (PL_multiline &&
2089 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2091 /* regtill = regbol; */
2096 if (locinput == PL_bostr ||
2097 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2103 if (locinput == PL_bostr)
2107 if (locinput == PL_reg_ganch)
2117 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2122 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2124 if (PL_regeol - locinput > 1)
2128 if (PL_regeol != locinput)
2132 if (!nextchr && locinput >= PL_regeol)
2135 locinput += PL_utf8skip[nextchr];
2136 if (locinput > PL_regeol)
2138 nextchr = UCHARAT(locinput);
2141 nextchr = UCHARAT(++locinput);
2144 if (!nextchr && locinput >= PL_regeol)
2146 nextchr = UCHARAT(++locinput);
2149 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2152 locinput += PL_utf8skip[nextchr];
2153 if (locinput > PL_regeol)
2155 nextchr = UCHARAT(locinput);
2158 nextchr = UCHARAT(++locinput);
2163 if (do_utf8 != (UTF!=0)) {
2171 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2180 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2186 nextchr = UCHARAT(locinput);
2189 /* Inline the first character, for speed. */
2190 if (UCHARAT(s) != nextchr)
2192 if (PL_regeol - locinput < ln)
2194 if (ln > 1 && memNE(s, locinput, ln))
2197 nextchr = UCHARAT(locinput);
2200 PL_reg_flags |= RF_tainted;
2210 U8 tmpbuf[UTF8_MAXLEN*2+1];
2215 toLOWER_utf8((U8*)l, tmpbuf, &ulen);
2216 if (memNE(s, tmpbuf, ulen))
2222 nextchr = UCHARAT(locinput);
2226 /* Inline the first character, for speed. */
2227 if (UCHARAT(s) != nextchr &&
2228 UCHARAT(s) != ((OP(scan) == EXACTF)
2229 ? PL_fold : PL_fold_locale)[nextchr])
2231 if (PL_regeol - locinput < ln)
2233 if (ln > 1 && (OP(scan) == EXACTF
2234 ? ibcmp(s, locinput, ln)
2235 : ibcmp_locale(s, locinput, ln)))
2238 nextchr = UCHARAT(locinput);
2242 if (!reginclass(scan, (U8*)locinput, do_utf8))
2244 if (locinput >= PL_regeol)
2246 locinput += PL_utf8skip[nextchr];
2247 nextchr = UCHARAT(locinput);
2251 nextchr = UCHARAT(locinput);
2252 if (!reginclass(scan, (U8*)locinput, do_utf8))
2254 if (!nextchr && locinput >= PL_regeol)
2256 nextchr = UCHARAT(++locinput);
2260 PL_reg_flags |= RF_tainted;
2266 LOAD_UTF8_CHARCLASS(alnum,"a");
2267 if (!(OP(scan) == ALNUM
2268 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2269 : isALNUM_LC_utf8((U8*)locinput)))
2273 locinput += PL_utf8skip[nextchr];
2274 nextchr = UCHARAT(locinput);
2277 if (!(OP(scan) == ALNUM
2278 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2280 nextchr = UCHARAT(++locinput);
2283 PL_reg_flags |= RF_tainted;
2286 if (!nextchr && locinput >= PL_regeol)
2289 LOAD_UTF8_CHARCLASS(alnum,"a");
2290 if (OP(scan) == NALNUM
2291 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2292 : isALNUM_LC_utf8((U8*)locinput))
2296 locinput += PL_utf8skip[nextchr];
2297 nextchr = UCHARAT(locinput);
2300 if (OP(scan) == NALNUM
2301 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2303 nextchr = UCHARAT(++locinput);
2307 PL_reg_flags |= RF_tainted;
2311 /* was last char in word? */
2313 if (locinput == PL_bostr)
2316 U8 *r = reghop((U8*)locinput, -1);
2318 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2320 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2321 ln = isALNUM_uni(ln);
2322 LOAD_UTF8_CHARCLASS(alnum,"a");
2323 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2326 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2327 n = isALNUM_LC_utf8((U8*)locinput);
2331 ln = (locinput != PL_bostr) ?
2332 UCHARAT(locinput - 1) : '\n';
2333 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2335 n = isALNUM(nextchr);
2338 ln = isALNUM_LC(ln);
2339 n = isALNUM_LC(nextchr);
2342 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2343 OP(scan) == BOUNDL))
2347 PL_reg_flags |= RF_tainted;
2353 if (UTF8_IS_CONTINUED(nextchr)) {
2354 LOAD_UTF8_CHARCLASS(space," ");
2355 if (!(OP(scan) == SPACE
2356 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2357 : isSPACE_LC_utf8((U8*)locinput)))
2361 locinput += PL_utf8skip[nextchr];
2362 nextchr = UCHARAT(locinput);
2365 if (!(OP(scan) == SPACE
2366 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2368 nextchr = UCHARAT(++locinput);
2371 if (!(OP(scan) == SPACE
2372 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2374 nextchr = UCHARAT(++locinput);
2378 PL_reg_flags |= RF_tainted;
2381 if (!nextchr && locinput >= PL_regeol)
2384 LOAD_UTF8_CHARCLASS(space," ");
2385 if (OP(scan) == NSPACE
2386 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2387 : isSPACE_LC_utf8((U8*)locinput))
2391 locinput += PL_utf8skip[nextchr];
2392 nextchr = UCHARAT(locinput);
2395 if (OP(scan) == NSPACE
2396 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2398 nextchr = UCHARAT(++locinput);
2401 PL_reg_flags |= RF_tainted;
2407 LOAD_UTF8_CHARCLASS(digit,"0");
2408 if (!(OP(scan) == DIGIT
2409 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2410 : isDIGIT_LC_utf8((U8*)locinput)))
2414 locinput += PL_utf8skip[nextchr];
2415 nextchr = UCHARAT(locinput);
2418 if (!(OP(scan) == DIGIT
2419 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2421 nextchr = UCHARAT(++locinput);
2424 PL_reg_flags |= RF_tainted;
2427 if (!nextchr && locinput >= PL_regeol)
2430 LOAD_UTF8_CHARCLASS(digit,"0");
2431 if (OP(scan) == NDIGIT
2432 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2433 : isDIGIT_LC_utf8((U8*)locinput))
2437 locinput += PL_utf8skip[nextchr];
2438 nextchr = UCHARAT(locinput);
2441 if (OP(scan) == NDIGIT
2442 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2444 nextchr = UCHARAT(++locinput);
2447 LOAD_UTF8_CHARCLASS(mark,"~");
2448 if (locinput >= PL_regeol ||
2449 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2451 locinput += PL_utf8skip[nextchr];
2452 while (locinput < PL_regeol &&
2453 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2454 locinput += UTF8SKIP(locinput);
2455 if (locinput > PL_regeol)
2457 nextchr = UCHARAT(locinput);
2460 PL_reg_flags |= RF_tainted;
2464 n = ARG(scan); /* which paren pair */
2465 ln = PL_regstartp[n];
2466 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2467 if (*PL_reglastparen < n || ln == -1)
2468 sayNO; /* Do not match unless seen CLOSEn. */
2469 if (ln == PL_regendp[n])
2473 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2475 char *e = PL_bostr + PL_regendp[n];
2477 * Note that we can't do the "other character" lookup trick as
2478 * in the 8-bit case (no pun intended) because in Unicode we
2479 * have to map both upper and title case to lower case.
2481 if (OP(scan) == REFF) {
2482 STRLEN ulen1, ulen2;
2483 U8 tmpbuf1[UTF8_MAXLEN*2+1];
2484 U8 tmpbuf2[UTF8_MAXLEN*2+1];
2488 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2489 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2490 if (ulen1 != ulen2 || memNE(tmpbuf1, tmpbuf2, ulen1))
2497 nextchr = UCHARAT(locinput);
2501 /* Inline the first character, for speed. */
2502 if (UCHARAT(s) != nextchr &&
2504 (UCHARAT(s) != ((OP(scan) == REFF
2505 ? PL_fold : PL_fold_locale)[nextchr]))))
2507 ln = PL_regendp[n] - ln;
2508 if (locinput + ln > PL_regeol)
2510 if (ln > 1 && (OP(scan) == REF
2511 ? memNE(s, locinput, ln)
2513 ? ibcmp(s, locinput, ln)
2514 : ibcmp_locale(s, locinput, ln))))
2517 nextchr = UCHARAT(locinput);
2528 OP_4tree *oop = PL_op;
2529 COP *ocurcop = PL_curcop;
2530 SV **ocurpad = PL_curpad;
2534 PL_op = (OP_4tree*)PL_regdata->data[n];
2535 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2536 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2537 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2541 CALLRUNOPS(aTHX); /* Scalar context. */
2544 ret = Nullsv; /* protect against empty (?{}) blocks. */
2552 PL_curpad = ocurpad;
2553 PL_curcop = ocurcop;
2555 if (logical == 2) { /* Postponed subexpression. */
2557 MAGIC *mg = Null(MAGIC*);
2559 CHECKPOINT cp, lastcp;
2561 if(SvROK(ret) || SvRMAGICAL(ret)) {
2562 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2565 mg = mg_find(sv, PERL_MAGIC_qr);
2568 re = (regexp *)mg->mg_obj;
2569 (void)ReREFCNT_inc(re);
2573 char *t = SvPV(ret, len);
2575 char *oprecomp = PL_regprecomp;
2576 I32 osize = PL_regsize;
2577 I32 onpar = PL_regnpar;
2580 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2582 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2583 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2585 PL_regprecomp = oprecomp;
2590 PerlIO_printf(Perl_debug_log,
2591 "Entering embedded `%s%.60s%s%s'\n",
2595 (strlen(re->precomp) > 60 ? "..." : ""))
2598 state.prev = PL_reg_call_cc;
2599 state.cc = PL_regcc;
2600 state.re = PL_reg_re;
2604 cp = regcppush(0); /* Save *all* the positions. */
2607 state.ss = PL_savestack_ix;
2608 *PL_reglastparen = 0;
2609 *PL_reglastcloseparen = 0;
2610 PL_reg_call_cc = &state;
2611 PL_reginput = locinput;
2613 /* XXXX This is too dramatic a measure... */
2616 if (regmatch(re->program + 1)) {
2617 /* Even though we succeeded, we need to restore
2618 global variables, since we may be wrapped inside
2619 SUSPEND, thus the match may be not finished yet. */
2621 /* XXXX Do this only if SUSPENDed? */
2622 PL_reg_call_cc = state.prev;
2623 PL_regcc = state.cc;
2624 PL_reg_re = state.re;
2625 cache_re(PL_reg_re);
2627 /* XXXX This is too dramatic a measure... */
2630 /* These are needed even if not SUSPEND. */
2636 REGCP_UNWIND(lastcp);
2638 PL_reg_call_cc = state.prev;
2639 PL_regcc = state.cc;
2640 PL_reg_re = state.re;
2641 cache_re(PL_reg_re);
2643 /* XXXX This is too dramatic a measure... */
2653 sv_setsv(save_scalar(PL_replgv), ret);
2657 n = ARG(scan); /* which paren pair */
2658 PL_reg_start_tmp[n] = locinput;
2663 n = ARG(scan); /* which paren pair */
2664 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2665 PL_regendp[n] = locinput - PL_bostr;
2666 if (n > *PL_reglastparen)
2667 *PL_reglastparen = n;
2668 *PL_reglastcloseparen = n;
2671 n = ARG(scan); /* which paren pair */
2672 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2675 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2677 next = NEXTOPER(NEXTOPER(scan));
2679 next = scan + ARG(scan);
2680 if (OP(next) == IFTHEN) /* Fake one. */
2681 next = NEXTOPER(NEXTOPER(next));
2685 logical = scan->flags;
2687 /*******************************************************************
2688 PL_regcc contains infoblock about the innermost (...)* loop, and
2689 a pointer to the next outer infoblock.
2691 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2693 1) After matching X, regnode for CURLYX is processed;
2695 2) This regnode creates infoblock on the stack, and calls
2696 regmatch() recursively with the starting point at WHILEM node;
2698 3) Each hit of WHILEM node tries to match A and Z (in the order
2699 depending on the current iteration, min/max of {min,max} and
2700 greediness). The information about where are nodes for "A"
2701 and "Z" is read from the infoblock, as is info on how many times "A"
2702 was already matched, and greediness.
2704 4) After A matches, the same WHILEM node is hit again.
2706 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2707 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2708 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2709 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2710 of the external loop.
2712 Currently present infoblocks form a tree with a stem formed by PL_curcc
2713 and whatever it mentions via ->next, and additional attached trees
2714 corresponding to temporarily unset infoblocks as in "5" above.
2716 In the following picture infoblocks for outer loop of
2717 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2718 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2719 infoblocks are drawn below the "reset" infoblock.
2721 In fact in the picture below we do not show failed matches for Z and T
2722 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2723 more obvious *why* one needs to *temporary* unset infoblocks.]
2725 Matched REx position InfoBlocks Comment
2729 Y A)*?Z)*?T x <- O <- I
2730 YA )*?Z)*?T x <- O <- I
2731 YA A)*?Z)*?T x <- O <- I
2732 YAA )*?Z)*?T x <- O <- I
2733 YAA Z)*?T x <- O # Temporary unset I
2736 YAAZ Y(A)*?Z)*?T x <- O
2739 YAAZY (A)*?Z)*?T x <- O
2742 YAAZY A)*?Z)*?T x <- O <- I
2745 YAAZYA )*?Z)*?T x <- O <- I
2748 YAAZYA Z)*?T x <- O # Temporary unset I
2754 YAAZYAZ T x # Temporary unset O
2761 *******************************************************************/
2764 CHECKPOINT cp = PL_savestack_ix;
2765 /* No need to save/restore up to this paren */
2766 I32 parenfloor = scan->flags;
2768 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2770 cc.oldcc = PL_regcc;
2772 /* XXXX Probably it is better to teach regpush to support
2773 parenfloor > PL_regsize... */
2774 if (parenfloor > *PL_reglastparen)
2775 parenfloor = *PL_reglastparen; /* Pessimization... */
2776 cc.parenfloor = parenfloor;
2778 cc.min = ARG1(scan);
2779 cc.max = ARG2(scan);
2780 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2784 PL_reginput = locinput;
2785 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2787 PL_regcc = cc.oldcc;
2793 * This is really hard to understand, because after we match
2794 * what we're trying to match, we must make sure the rest of
2795 * the REx is going to match for sure, and to do that we have
2796 * to go back UP the parse tree by recursing ever deeper. And
2797 * if it fails, we have to reset our parent's current state
2798 * that we can try again after backing off.
2801 CHECKPOINT cp, lastcp;
2802 CURCUR* cc = PL_regcc;
2803 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2805 n = cc->cur + 1; /* how many we know we matched */
2806 PL_reginput = locinput;
2809 PerlIO_printf(Perl_debug_log,
2810 "%*s %ld out of %ld..%ld cc=%lx\n",
2811 REPORT_CODE_OFF+PL_regindent*2, "",
2812 (long)n, (long)cc->min,
2813 (long)cc->max, (long)cc)
2816 /* If degenerate scan matches "", assume scan done. */
2818 if (locinput == cc->lastloc && n >= cc->min) {
2819 PL_regcc = cc->oldcc;
2823 PerlIO_printf(Perl_debug_log,
2824 "%*s empty match detected, try continuation...\n",
2825 REPORT_CODE_OFF+PL_regindent*2, "")
2827 if (regmatch(cc->next))
2835 /* First just match a string of min scans. */
2839 cc->lastloc = locinput;
2840 if (regmatch(cc->scan))
2843 cc->lastloc = lastloc;
2848 /* Check whether we already were at this position.
2849 Postpone detection until we know the match is not
2850 *that* much linear. */
2851 if (!PL_reg_maxiter) {
2852 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2853 PL_reg_leftiter = PL_reg_maxiter;
2855 if (PL_reg_leftiter-- == 0) {
2856 I32 size = (PL_reg_maxiter + 7)/8;
2857 if (PL_reg_poscache) {
2858 if (PL_reg_poscache_size < size) {
2859 Renew(PL_reg_poscache, size, char);
2860 PL_reg_poscache_size = size;
2862 Zero(PL_reg_poscache, size, char);
2865 PL_reg_poscache_size = size;
2866 Newz(29, PL_reg_poscache, size, char);
2869 PerlIO_printf(Perl_debug_log,
2870 "%sDetected a super-linear match, switching on caching%s...\n",
2871 PL_colors[4], PL_colors[5])
2874 if (PL_reg_leftiter < 0) {
2875 I32 o = locinput - PL_bostr, b;
2877 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2880 if (PL_reg_poscache[o] & (1<<b)) {
2882 PerlIO_printf(Perl_debug_log,
2883 "%*s already tried at this position...\n",
2884 REPORT_CODE_OFF+PL_regindent*2, "")
2888 PL_reg_poscache[o] |= (1<<b);
2892 /* Prefer next over scan for minimal matching. */
2895 PL_regcc = cc->oldcc;
2898 cp = regcppush(cc->parenfloor);
2900 if (regmatch(cc->next)) {
2902 sayYES; /* All done. */
2904 REGCP_UNWIND(lastcp);
2910 if (n >= cc->max) { /* Maximum greed exceeded? */
2911 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2912 && !(PL_reg_flags & RF_warned)) {
2913 PL_reg_flags |= RF_warned;
2914 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2915 "Complex regular subexpression recursion",
2922 PerlIO_printf(Perl_debug_log,
2923 "%*s trying longer...\n",
2924 REPORT_CODE_OFF+PL_regindent*2, "")
2926 /* Try scanning more and see if it helps. */
2927 PL_reginput = locinput;
2929 cc->lastloc = locinput;
2930 cp = regcppush(cc->parenfloor);
2932 if (regmatch(cc->scan)) {
2936 REGCP_UNWIND(lastcp);
2939 cc->lastloc = lastloc;
2943 /* Prefer scan over next for maximal matching. */
2945 if (n < cc->max) { /* More greed allowed? */
2946 cp = regcppush(cc->parenfloor);
2948 cc->lastloc = locinput;
2950 if (regmatch(cc->scan)) {
2954 REGCP_UNWIND(lastcp);
2955 regcppop(); /* Restore some previous $<digit>s? */
2956 PL_reginput = locinput;
2958 PerlIO_printf(Perl_debug_log,
2959 "%*s failed, try continuation...\n",
2960 REPORT_CODE_OFF+PL_regindent*2, "")
2963 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2964 && !(PL_reg_flags & RF_warned)) {
2965 PL_reg_flags |= RF_warned;
2966 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2967 "Complex regular subexpression recursion",
2971 /* Failed deeper matches of scan, so see if this one works. */
2972 PL_regcc = cc->oldcc;
2975 if (regmatch(cc->next))
2981 cc->lastloc = lastloc;
2986 next = scan + ARG(scan);
2989 inner = NEXTOPER(NEXTOPER(scan));
2992 inner = NEXTOPER(scan);
2996 if (OP(next) != c1) /* No choice. */
2997 next = inner; /* Avoid recursion. */
2999 I32 lastparen = *PL_reglastparen;
3001 re_unwind_branch_t *uw;
3003 /* Put unwinding data on stack */
3004 unwind1 = SSNEWt(1,re_unwind_branch_t);
3005 uw = SSPTRt(unwind1,re_unwind_branch_t);
3008 uw->type = ((c1 == BRANCH)
3010 : RE_UNWIND_BRANCHJ);
3011 uw->lastparen = lastparen;
3013 uw->locinput = locinput;
3014 uw->nextchr = nextchr;
3016 uw->regindent = ++PL_regindent;
3019 REGCP_SET(uw->lastcp);
3021 /* Now go into the first branch */
3034 /* We suppose that the next guy does not need
3035 backtracking: in particular, it is of constant length,
3036 and has no parenths to influence future backrefs. */
3037 ln = ARG1(scan); /* min to match */
3038 n = ARG2(scan); /* max to match */
3039 paren = scan->flags;
3041 if (paren > PL_regsize)
3043 if (paren > *PL_reglastparen)
3044 *PL_reglastparen = paren;
3046 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3048 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3049 PL_reginput = locinput;
3052 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3054 /* if we matched something zero-length we don't need to
3055 backtrack - capturing parens are already defined, so
3056 the caveat in the maximal case doesn't apply
3058 XXXX if ln == 0, we can redo this check first time
3059 through the following loop
3062 n = ln; /* don't backtrack */
3063 locinput = PL_reginput;
3064 if (NEAR_EXACT(next)) {
3065 regnode *text_node = next;
3067 if (PL_regkind[(U8)OP(next)] != EXACT)
3068 NEXT_IMPT(text_node);
3070 if (PL_regkind[(U8)OP(text_node)] != EXACT) {
3074 c1 = (U8)*STRING(text_node);
3075 if (OP(next) == EXACTF)
3077 else if (OP(text_node) == EXACTFL)
3078 c2 = PL_fold_locale[c1];
3086 /* This may be improved if l == 0. */
3087 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3088 /* If it could work, try it. */
3090 UCHARAT(PL_reginput) == c1 ||
3091 UCHARAT(PL_reginput) == c2)
3095 PL_regstartp[paren] =
3096 HOPc(PL_reginput, -l) - PL_bostr;
3097 PL_regendp[paren] = PL_reginput - PL_bostr;
3100 PL_regendp[paren] = -1;
3104 REGCP_UNWIND(lastcp);
3106 /* Couldn't or didn't -- move forward. */
3107 PL_reginput = locinput;
3108 if (regrepeat_hard(scan, 1, &l)) {
3110 locinput = PL_reginput;
3117 n = regrepeat_hard(scan, n, &l);
3118 /* if we matched something zero-length we don't need to
3119 backtrack, unless the minimum count is zero and we
3120 are capturing the result - in that case the capture
3121 being defined or not may affect later execution
3123 if (n != 0 && l == 0 && !(paren && ln == 0))
3124 ln = n; /* don't backtrack */
3125 locinput = PL_reginput;
3127 PerlIO_printf(Perl_debug_log,
3128 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3129 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3133 if (NEAR_EXACT(next)) {
3134 regnode *text_node = next;
3136 if (PL_regkind[(U8)OP(next)] != EXACT)
3137 NEXT_IMPT(text_node);
3139 if (PL_regkind[(U8)OP(text_node)] != EXACT) {
3143 c1 = (U8)*STRING(text_node);
3144 if (OP(text_node) == EXACTF)
3146 else if (OP(text_node) == EXACTFL)
3147 c2 = PL_fold_locale[c1];
3157 /* If it could work, try it. */
3159 UCHARAT(PL_reginput) == c1 ||
3160 UCHARAT(PL_reginput) == c2)
3163 PerlIO_printf(Perl_debug_log,
3164 "%*s trying tail with n=%"IVdf"...\n",
3165 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3169 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3170 PL_regendp[paren] = PL_reginput - PL_bostr;
3173 PL_regendp[paren] = -1;
3177 REGCP_UNWIND(lastcp);
3179 /* Couldn't or didn't -- back up. */
3181 locinput = HOPc(locinput, -l);
3182 PL_reginput = locinput;
3189 paren = scan->flags; /* Which paren to set */
3190 if (paren > PL_regsize)
3192 if (paren > *PL_reglastparen)
3193 *PL_reglastparen = paren;
3194 ln = ARG1(scan); /* min to match */
3195 n = ARG2(scan); /* max to match */
3196 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3200 ln = ARG1(scan); /* min to match */
3201 n = ARG2(scan); /* max to match */
3202 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3207 scan = NEXTOPER(scan);
3213 scan = NEXTOPER(scan);
3217 * Lookahead to avoid useless match attempts
3218 * when we know what character comes next.
3222 * Used to only do .*x and .*?x, but now it allows
3223 * for )'s, ('s and (?{ ... })'s to be in the way
3224 * of the quantifier and the EXACT-like node. -- japhy
3227 if (NEAR_EXACT(next)) {
3229 regnode *text_node = next;
3231 if (PL_regkind[(U8)OP(next)] != EXACT)
3232 NEXT_IMPT(text_node);
3234 if (PL_regkind[(U8)OP(text_node)] != EXACT) {
3238 s = (U8*)STRING(text_node);
3242 if (OP(text_node) == EXACTF)
3244 else if (OP(text_node) == EXACTFL)
3245 c2 = PL_fold_locale[c1];
3248 if (OP(text_node) == EXACTF) {
3249 STRLEN ulen1, ulen2;
3250 U8 tmpbuf1[UTF8_MAXLEN*2+1];
3251 U8 tmpbuf2[UTF8_MAXLEN*2+1];
3253 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3254 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3256 c1 = utf8_to_uvuni(tmpbuf1, 0);
3257 c2 = utf8_to_uvuni(tmpbuf2, 0);
3260 c2 = c1 = utf8_to_uvchr(s, NULL);
3267 PL_reginput = locinput;
3271 if (ln && regrepeat(scan, ln) < ln)
3273 locinput = PL_reginput;
3276 char *e; /* Should not check after this */
3277 char *old = locinput;
3279 if (n == REG_INFTY) {
3282 while (UTF8_IS_CONTINUATION(*(U8*)e))
3288 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3292 e = locinput + n - ln;
3298 /* Find place 'next' could work */
3301 while (locinput <= e &&
3302 UCHARAT(locinput) != c1)
3305 while (locinput <= e
3306 && UCHARAT(locinput) != c1
3307 && UCHARAT(locinput) != c2)
3310 count = locinput - old;
3317 utf8_to_uvchr((U8*)locinput, &len) != c1;
3322 for (count = 0; locinput <= e; count++) {
3323 UV c = utf8_to_uvchr((U8*)locinput, &len);
3324 if (c == c1 || c == c2)
3332 /* PL_reginput == old now */
3333 if (locinput != old) {
3334 ln = 1; /* Did some */
3335 if (regrepeat(scan, count) < count)
3338 /* PL_reginput == locinput now */
3339 TRYPAREN(paren, ln, locinput);
3340 PL_reginput = locinput; /* Could be reset... */
3341 REGCP_UNWIND(lastcp);
3342 /* Couldn't or didn't -- move forward. */
3345 locinput += UTF8SKIP(locinput);
3351 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3355 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3357 c = UCHARAT(PL_reginput);
3358 /* If it could work, try it. */
3359 if (c == c1 || c == c2)
3361 TRYPAREN(paren, n, PL_reginput);
3362 REGCP_UNWIND(lastcp);
3365 /* If it could work, try it. */
3366 else if (c1 == -1000)
3368 TRYPAREN(paren, n, PL_reginput);
3369 REGCP_UNWIND(lastcp);
3371 /* Couldn't or didn't -- move forward. */
3372 PL_reginput = locinput;
3373 if (regrepeat(scan, 1)) {
3375 locinput = PL_reginput;
3383 n = regrepeat(scan, n);
3384 locinput = PL_reginput;
3385 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3386 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3387 ln = n; /* why back off? */
3388 /* ...because $ and \Z can match before *and* after
3389 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3390 We should back off by one in this case. */
3391 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3400 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3402 c = UCHARAT(PL_reginput);
3404 /* If it could work, try it. */
3405 if (c1 == -1000 || c == c1 || c == c2)
3407 TRYPAREN(paren, n, PL_reginput);
3408 REGCP_UNWIND(lastcp);
3410 /* Couldn't or didn't -- back up. */
3412 PL_reginput = locinput = HOPc(locinput, -1);
3420 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3422 c = UCHARAT(PL_reginput);
3424 /* If it could work, try it. */
3425 if (c1 == -1000 || c == c1 || c == c2)
3427 TRYPAREN(paren, n, PL_reginput);
3428 REGCP_UNWIND(lastcp);
3430 /* Couldn't or didn't -- back up. */
3432 PL_reginput = locinput = HOPc(locinput, -1);
3439 if (PL_reg_call_cc) {
3440 re_cc_state *cur_call_cc = PL_reg_call_cc;
3441 CURCUR *cctmp = PL_regcc;
3442 regexp *re = PL_reg_re;
3443 CHECKPOINT cp, lastcp;
3445 cp = regcppush(0); /* Save *all* the positions. */
3447 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3449 PL_reginput = locinput; /* Make position available to
3451 cache_re(PL_reg_call_cc->re);
3452 PL_regcc = PL_reg_call_cc->cc;
3453 PL_reg_call_cc = PL_reg_call_cc->prev;
3454 if (regmatch(cur_call_cc->node)) {
3455 PL_reg_call_cc = cur_call_cc;
3459 REGCP_UNWIND(lastcp);
3461 PL_reg_call_cc = cur_call_cc;
3467 PerlIO_printf(Perl_debug_log,
3468 "%*s continuation failed...\n",
3469 REPORT_CODE_OFF+PL_regindent*2, "")
3473 if (locinput < PL_regtill) {
3474 DEBUG_r(PerlIO_printf(Perl_debug_log,
3475 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3477 (long)(locinput - PL_reg_starttry),
3478 (long)(PL_regtill - PL_reg_starttry),
3480 sayNO_FINAL; /* Cannot match: too short. */
3482 PL_reginput = locinput; /* put where regtry can find it */
3483 sayYES_FINAL; /* Success! */
3485 PL_reginput = locinput; /* put where regtry can find it */
3486 sayYES_LOUD; /* Success! */
3489 PL_reginput = locinput;
3494 s = HOPBACKc(locinput, scan->flags);
3500 PL_reginput = locinput;
3505 s = HOPBACKc(locinput, scan->flags);
3511 PL_reginput = locinput;
3514 inner = NEXTOPER(NEXTOPER(scan));
3515 if (regmatch(inner) != n) {
3530 if (OP(scan) == SUSPEND) {
3531 locinput = PL_reginput;
3532 nextchr = UCHARAT(locinput);
3537 next = scan + ARG(scan);
3542 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3543 PTR2UV(scan), OP(scan));
3544 Perl_croak(aTHX_ "regexp memory corruption");
3551 * We get here only if there's trouble -- normally "case END" is
3552 * the terminating point.
3554 Perl_croak(aTHX_ "corrupted regexp pointers");
3560 PerlIO_printf(Perl_debug_log,
3561 "%*s %scould match...%s\n",
3562 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3566 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3567 PL_colors[4],PL_colors[5]));
3573 #if 0 /* Breaks $^R */
3581 PerlIO_printf(Perl_debug_log,
3582 "%*s %sfailed...%s\n",
3583 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3589 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3592 case RE_UNWIND_BRANCH:
3593 case RE_UNWIND_BRANCHJ:
3595 re_unwind_branch_t *uwb = &(uw->branch);
3596 I32 lastparen = uwb->lastparen;
3598 REGCP_UNWIND(uwb->lastcp);
3599 for (n = *PL_reglastparen; n > lastparen; n--)
3601 *PL_reglastparen = n;
3602 scan = next = uwb->next;
3604 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3605 ? BRANCH : BRANCHJ) ) { /* Failure */
3612 /* Have more choice yet. Reuse the same uwb. */
3614 if ((n = (uwb->type == RE_UNWIND_BRANCH
3615 ? NEXT_OFF(next) : ARG(next))))
3618 next = NULL; /* XXXX Needn't unwinding in this case... */
3620 next = NEXTOPER(scan);
3621 if (uwb->type == RE_UNWIND_BRANCHJ)
3622 next = NEXTOPER(next);
3623 locinput = uwb->locinput;
3624 nextchr = uwb->nextchr;
3626 PL_regindent = uwb->regindent;
3633 Perl_croak(aTHX_ "regexp unwind memory corruption");
3644 - regrepeat - repeatedly match something simple, report how many
3647 * [This routine now assumes that it will only match on things of length 1.
3648 * That was true before, but now we assume scan - reginput is the count,
3649 * rather than incrementing count on every character. [Er, except utf8.]]
3652 S_regrepeat(pTHX_ regnode *p, I32 max)
3654 register char *scan;
3656 register char *loceol = PL_regeol;
3657 register I32 hardcount = 0;
3658 register bool do_utf8 = PL_reg_match_utf8;
3661 if (max != REG_INFTY && max < loceol - scan)
3662 loceol = scan + max;
3667 while (scan < loceol && hardcount < max && *scan != '\n') {
3668 scan += UTF8SKIP(scan);
3672 while (scan < loceol && *scan != '\n')
3682 case EXACT: /* length of string is 1 */
3684 while (scan < loceol && UCHARAT(scan) == c)
3687 case EXACTF: /* length of string is 1 */
3689 while (scan < loceol &&
3690 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3693 case EXACTFL: /* length of string is 1 */
3694 PL_reg_flags |= RF_tainted;
3696 while (scan < loceol &&
3697 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3703 while (hardcount < max && scan < loceol &&
3704 reginclass(p, (U8*)scan, do_utf8)) {
3705 scan += UTF8SKIP(scan);
3709 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3716 LOAD_UTF8_CHARCLASS(alnum,"a");
3717 while (hardcount < max && scan < loceol &&
3718 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3719 scan += UTF8SKIP(scan);
3723 while (scan < loceol && isALNUM(*scan))
3728 PL_reg_flags |= RF_tainted;
3731 while (hardcount < max && scan < loceol &&
3732 isALNUM_LC_utf8((U8*)scan)) {
3733 scan += UTF8SKIP(scan);
3737 while (scan < loceol && isALNUM_LC(*scan))
3744 LOAD_UTF8_CHARCLASS(alnum,"a");
3745 while (hardcount < max && scan < loceol &&
3746 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3747 scan += UTF8SKIP(scan);
3751 while (scan < loceol && !isALNUM(*scan))
3756 PL_reg_flags |= RF_tainted;
3759 while (hardcount < max && scan < loceol &&
3760 !isALNUM_LC_utf8((U8*)scan)) {
3761 scan += UTF8SKIP(scan);
3765 while (scan < loceol && !isALNUM_LC(*scan))
3772 LOAD_UTF8_CHARCLASS(space," ");
3773 while (hardcount < max && scan < loceol &&
3775 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3776 scan += UTF8SKIP(scan);
3780 while (scan < loceol && isSPACE(*scan))
3785 PL_reg_flags |= RF_tainted;
3788 while (hardcount < max && scan < loceol &&
3789 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3790 scan += UTF8SKIP(scan);
3794 while (scan < loceol && isSPACE_LC(*scan))
3801 LOAD_UTF8_CHARCLASS(space," ");
3802 while (hardcount < max && scan < loceol &&
3804 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3805 scan += UTF8SKIP(scan);
3809 while (scan < loceol && !isSPACE(*scan))
3814 PL_reg_flags |= RF_tainted;
3817 while (hardcount < max && scan < loceol &&
3818 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3819 scan += UTF8SKIP(scan);
3823 while (scan < loceol && !isSPACE_LC(*scan))
3830 LOAD_UTF8_CHARCLASS(digit,"0");
3831 while (hardcount < max && scan < loceol &&
3832 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3833 scan += UTF8SKIP(scan);
3837 while (scan < loceol && isDIGIT(*scan))
3844 LOAD_UTF8_CHARCLASS(digit,"0");
3845 while (hardcount < max && scan < loceol &&
3846 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3847 scan += UTF8SKIP(scan);
3851 while (scan < loceol && !isDIGIT(*scan))
3855 default: /* Called on something of 0 width. */
3856 break; /* So match right here or not at all. */
3862 c = scan - PL_reginput;
3867 SV *prop = sv_newmortal();
3870 PerlIO_printf(Perl_debug_log,
3871 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3872 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3879 - regrepeat_hard - repeatedly match something, report total lenth and length
3881 * The repeater is supposed to have constant length.
3885 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3887 register char *scan = Nullch;
3888 register char *start;
3889 register char *loceol = PL_regeol;
3891 I32 count = 0, res = 1;
3896 start = PL_reginput;
3897 if (PL_reg_match_utf8) {
3898 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3901 while (start < PL_reginput) {
3903 start += UTF8SKIP(start);
3914 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3916 *lp = l = PL_reginput - start;
3917 if (max != REG_INFTY && l*max < loceol - scan)
3918 loceol = scan + l*max;
3931 - regclass_swash - prepare the utf8 swash
3935 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3940 if (PL_regdata && PL_regdata->count) {
3943 if (PL_regdata->what[n] == 's') {
3944 SV *rv = (SV*)PL_regdata->data[n];
3945 AV *av = (AV*)SvRV((SV*)rv);
3948 si = *av_fetch(av, 0, FALSE);
3949 a = av_fetch(av, 1, FALSE);
3953 else if (si && doinit) {
3954 sw = swash_init("utf8", "", si, 1, 0);
3955 (void)av_store(av, 1, sw);
3967 - reginclass - determine if a character falls into a character class
3971 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3973 char flags = ANYOF_FLAGS(n);
3978 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
3980 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3981 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3982 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3985 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
3988 SV *sw = regclass_swash(n, TRUE, 0);
3991 if (swash_fetch(sw, p, do_utf8))
3993 else if (flags & ANYOF_FOLD) {
3995 U8 tmpbuf[UTF8_MAXLEN*2+1];
3997 toLOWER_utf8(p, tmpbuf, &ulen);
3998 if (swash_fetch(sw, tmpbuf, do_utf8))
4004 if (!match && c < 256) {
4005 if (ANYOF_BITMAP_TEST(n, c))
4007 else if (flags & ANYOF_FOLD) {
4010 if (flags & ANYOF_LOCALE) {
4011 PL_reg_flags |= RF_tainted;
4012 f = PL_fold_locale[c];
4016 if (f != c && ANYOF_BITMAP_TEST(n, f))
4020 if (!match && (flags & ANYOF_CLASS)) {
4021 PL_reg_flags |= RF_tainted;
4023 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4024 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4025 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4026 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4027 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4028 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4029 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4030 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4031 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4032 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4033 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4034 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4035 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4036 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4037 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4038 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4039 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4040 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4041 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4042 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4043 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4044 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4045 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4046 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4047 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4048 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4049 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4050 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4051 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4052 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4053 ) /* How's that for a conditional? */
4060 return (flags & ANYOF_INVERT) ? !match : match;
4064 S_reghop(pTHX_ U8 *s, I32 off)
4066 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4070 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4073 while (off-- && s < lim) {
4074 /* XXX could check well-formedness here */
4082 if (UTF8_IS_CONTINUED(*s)) {
4083 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4086 /* XXX could check well-formedness here */
4094 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4096 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4100 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4103 while (off-- && s < lim) {
4104 /* XXX could check well-formedness here */
4114 if (UTF8_IS_CONTINUED(*s)) {
4115 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4118 /* XXX could check well-formedness here */
4130 restore_pos(pTHX_ void *arg)
4132 if (PL_reg_eval_set) {
4133 if (PL_reg_oldsaved) {
4134 PL_reg_re->subbeg = PL_reg_oldsaved;
4135 PL_reg_re->sublen = PL_reg_oldsavedlen;
4136 RX_MATCH_COPIED_on(PL_reg_re);
4138 PL_reg_magic->mg_len = PL_reg_oldpos;
4139 PL_reg_eval_set = 0;
4140 PL_curpm = PL_reg_oldcurpm;