5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%i\n", \
150 PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%i..%i\n", \
155 lastcp, PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%d to %d(%d)..%d%s\n",
180 paren, PL_regstartp[paren],
181 PL_reg_start_tmp[paren] - PL_bostr,
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%d..\\%d to undef\n",
190 *PL_reglastparen + 1, PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* A failure to find a constant substring means that there is no need to make
279 an expensive call to REx engine, thus we celebrate a failure. Similarly,
280 finding a substring too deep into the string means that less calls to
281 regtry() should be needed.
283 REx compiler's optimizer found 4 possible hints:
284 a) Anchored substring;
286 c) Whether we are anchored (beginning-of-line or \G);
287 d) First node (of those at offset 0) which may distingush positions;
288 We use 'a', 'b', multiline-part of 'c', and try to find a position in the
289 string which does not contradict any of them.
293 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
294 char *strend, U32 flags, re_scream_pos_data *data)
296 register I32 start_shift;
297 /* Should be nonnegative! */
298 register I32 end_shift;
304 register char *other_last = Nullch;
306 char *i_strpos = strpos;
309 DEBUG_r( if (!PL_colorset) reginitcolors() );
310 DEBUG_r(PerlIO_printf(Perl_debug_log,
311 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
312 PL_colors[4],PL_colors[5],PL_colors[0],
315 (strlen(prog->precomp) > 60 ? "..." : ""),
317 (strend - strpos > 60 ? 60 : strend - strpos),
318 strpos, PL_colors[1],
319 (strend - strpos > 60 ? "..." : ""))
322 if (prog->minlen > strend - strpos) {
323 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
326 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
327 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
328 || ( (prog->reganch & ROPT_ANCH_BOL)
329 && !PL_multiline ) ); /* Check after \n? */
331 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
332 /* Substring at constant offset from beg-of-str... */
335 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
336 && (sv && (strpos + SvCUR(sv) != strend)) ) {
337 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
340 PL_regeol = strend; /* Used in HOP() */
341 s = HOPc(strpos, prog->check_offset_min);
342 if (SvTAIL(prog->check_substr)) {
343 slen = SvCUR(prog->check_substr); /* >= 1 */
345 if ( strend - s > slen || strend - s < slen - 1
346 || (strend - s == slen && strend[-1] != '\n')) {
347 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
350 /* Now should match s[0..slen-2] */
352 if (slen && (*SvPVX(prog->check_substr) != *s
354 && memNE(SvPVX(prog->check_substr), s, slen)))) {
356 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
360 else if (*SvPVX(prog->check_substr) != *s
361 || ((slen = SvCUR(prog->check_substr)) > 1
362 && memNE(SvPVX(prog->check_substr), s, slen)))
364 goto success_at_start;
366 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
368 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
369 /* Should be nonnegative! */
370 end_shift = prog->minlen - start_shift -
371 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
373 I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr)
374 - (SvTAIL(prog->check_substr) != 0);
375 I32 eshift = strend - s - end;
377 if (end_shift < eshift)
381 else { /* Can match at random position */
384 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
385 /* Should be nonnegative! */
386 end_shift = prog->minlen - start_shift -
387 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
390 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
392 Perl_croak(aTHX_ "panic: end_shift");
395 check = prog->check_substr;
397 /* Find a possible match in the region s..strend by looking for
398 the "check" substring in the region corrected by start/end_shift. */
399 if (flags & REXEC_SCREAM) {
400 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
401 I32 p = -1; /* Internal iterator of scream. */
402 I32 *pp = data ? data->scream_pos : &p;
404 if (PL_screamfirst[BmRARE(check)] >= 0
405 || ( BmRARE(check) == '\n'
406 && (BmPREVIOUS(check) == SvCUR(check) - 1)
408 s = screaminstr(sv, check,
409 start_shift + (s - strbeg), end_shift, pp, 0);
413 *data->scream_olds = s;
416 s = fbm_instr((unsigned char*)s + start_shift,
417 (unsigned char*)strend - end_shift,
418 check, PL_multiline ? FBMrf_MULTILINE : 0);
420 /* Update the count-of-usability, remove useless subpatterns,
423 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
424 (s ? "Found" : "Did not find"),
425 ((check == prog->anchored_substr) ? "anchored" : "floating"),
427 SvCUR(check) - (SvTAIL(check)!=0), SvPVX(check),
428 PL_colors[1], (SvTAIL(check) ? "$" : ""),
429 (s ? " at offset " : "...\n") ) );
434 /* Finish the diagnostic message */
435 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
437 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
438 Start with the other substr.
439 XXXX no SCREAM optimization yet - and a very coarse implementation
440 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
441 *always* match. Probably should be marked during compile...
442 Probably it is right to do no SCREAM here...
445 if (prog->float_substr && prog->anchored_substr) {
446 /* Take into account the "other" substring. */
447 /* XXXX May be hopelessly wrong for UTF... */
449 other_last = strpos - 1;
450 if (check == prog->float_substr) {
453 char *last = s - start_shift, *last1, *last2;
457 t = s - prog->check_offset_max;
458 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
459 && (!(prog->reganch & ROPT_UTF8)
460 || (PL_bostr = strpos, /* Used in regcopmaybe() */
461 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
466 t += prog->anchored_offset;
470 last2 = last1 = strend - prog->minlen;
473 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
474 /* On end-of-str: see comment below. */
475 s = fbm_instr((unsigned char*)t,
476 (unsigned char*)last1 + prog->anchored_offset
477 + SvCUR(prog->anchored_substr)
478 - (SvTAIL(prog->anchored_substr)!=0),
479 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
480 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
481 (s ? "Found" : "Contradicts"),
483 SvCUR(prog->anchored_substr)
484 - (SvTAIL(prog->anchored_substr)!=0),
485 SvPVX(prog->anchored_substr),
486 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
488 if (last1 >= last2) {
489 DEBUG_r(PerlIO_printf(Perl_debug_log,
490 ", giving up...\n"));
493 DEBUG_r(PerlIO_printf(Perl_debug_log,
494 ", trying floating at offset %ld...\n",
495 (long)(s1 + 1 - i_strpos)));
496 PL_regeol = strend; /* Used in HOP() */
497 other_last = last1 + prog->anchored_offset;
502 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
503 (long)(s - i_strpos)));
504 t = s - prog->anchored_offset;
513 else { /* Take into account the floating substring. */
518 last1 = last = strend - prog->minlen + prog->float_min_offset;
519 if (last - t > prog->float_max_offset)
520 last = t + prog->float_max_offset;
521 s = t + prog->float_min_offset;
524 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
525 /* fbm_instr() takes into account exact value of end-of-str
526 if the check is SvTAIL(ed). Since false positives are OK,
527 and end-of-str is not later than strend we are OK. */
528 s = fbm_instr((unsigned char*)s,
529 (unsigned char*)last + SvCUR(prog->float_substr)
530 - (SvTAIL(prog->float_substr)!=0),
531 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
532 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
533 (s ? "Found" : "Contradicts"),
535 SvCUR(prog->float_substr)
536 - (SvTAIL(prog->float_substr)!=0),
537 SvPVX(prog->float_substr),
538 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
541 DEBUG_r(PerlIO_printf(Perl_debug_log,
542 ", giving up...\n"));
545 DEBUG_r(PerlIO_printf(Perl_debug_log,
546 ", trying anchored starting at offset %ld...\n",
547 (long)(s1 + 1 - i_strpos)));
549 PL_regeol = strend; /* Used in HOP() */
554 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
555 (long)(s - i_strpos)));
565 t = s - prog->check_offset_max;
567 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
568 && (!(prog->reganch & ROPT_UTF8)
569 || (PL_bostr = strpos, /* Used in regcopmaybe() */
570 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
573 /* Fixed substring is found far enough so that the match
574 cannot start at strpos. */
576 if (ml_anch && t[-1] != '\n') {
577 /* Eventually fbm_*() should handle this, but often
578 anchored_offset is not 0, so this check will not be wasted. */
579 /* XXXX In the code below we prefer to look for "^" even in
580 presence of anchored substrings. And we search even
581 beyond the found float position. These pessimizations
582 are historical artefacts only. */
584 while (t < strend - prog->minlen) {
586 if (t < s - prog->check_offset_min) {
587 if (prog->anchored_substr) {
588 /* We definitely contradict the found anchored
589 substr. Due to the above check we do not
590 contradict "check" substr.
591 Thus we can arrive here only if check substr
592 is float. Redo checking for "other"=="fixed".
595 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
596 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
597 goto do_other_anchored;
600 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
601 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
604 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
605 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
611 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
612 PL_colors[0],PL_colors[1]));
617 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
621 /* The found string does not prohibit matching at beg-of-str
622 - no optimization of calling REx engine can be performed,
623 unless it was an MBOL and we are not after MBOL. */
625 /* Even in this situation we may use MBOL flag if strpos is offset
626 wrt the start of the string. */
628 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') {
632 DEBUG_r( if (ml_anch)
633 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
634 PL_colors[0],PL_colors[1]);
637 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
638 && --BmUSEFUL(prog->check_substr) < 0
639 && prog->check_substr == prog->float_substr) { /* boo */
640 /* If flags & SOMETHING - do not do it many times on the same match */
641 SvREFCNT_dec(prog->check_substr);
642 prog->check_substr = Nullsv; /* disable */
643 prog->float_substr = Nullsv; /* clear */
645 prog->reganch &= ~RE_USE_INTUIT;
651 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
652 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
655 fail_finish: /* Substring not found */
656 BmUSEFUL(prog->check_substr) += 5; /* hooray */
658 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
659 PL_colors[4],PL_colors[5]));
664 - regexec_flags - match a regexp against a string
667 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
668 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
669 /* strend: pointer to null at end of string */
670 /* strbeg: real beginning of string */
671 /* minend: end of match must be >=minend after stringarg. */
672 /* data: May be used for some additional optimizations. */
673 /* nosave: For optimizations. */
678 register char *startpos = stringarg;
680 I32 minlen; /* must match at least this many chars */
681 I32 dontbother = 0; /* how many characters not to try at end */
682 I32 start_shift = 0; /* Offset of the start to find
683 constant substr. */ /* CC */
684 I32 end_shift = 0; /* Same for the end. */ /* CC */
685 I32 scream_pos = -1; /* Internal iterator of scream. */
687 SV* oreplsv = GvSV(PL_replgv);
693 PL_regnarrate = PL_debug & 512;
697 if (prog == NULL || startpos == NULL) {
698 Perl_croak(aTHX_ "NULL regexp parameter");
702 minlen = prog->minlen;
703 if (strend - startpos < minlen) goto phooey;
705 if (startpos == strbeg) /* is ^ valid at stringarg? */
708 PL_regprev = (U32)stringarg[-1];
709 if (!PL_multiline && PL_regprev == '\n')
710 PL_regprev = '\0'; /* force ^ to NOT match */
713 /* Check validity of program. */
714 if (UCHARAT(prog->program) != REG_MAGIC) {
715 Perl_croak(aTHX_ "corrupted regexp program");
722 if (prog->reganch & ROPT_UTF8)
723 PL_reg_flags |= RF_utf8;
725 /* Mark beginning of line for ^ and lookbehind. */
726 PL_regbol = startpos;
730 /* Mark end of line for $ (and such) */
733 /* see how far we have to get to not match where we matched before */
734 PL_regtill = startpos+minend;
736 /* We start without call_cc context. */
739 /* If there is a "must appear" string, look for it. */
742 if (prog->reganch & ROPT_GPOS_SEEN) {
745 if (!(flags & REXEC_IGNOREPOS) && sv && SvTYPE(sv) >= SVt_PVMG
746 && SvMAGIC(sv) && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0)
747 PL_reg_ganch = strbeg + mg->mg_len;
749 PL_reg_ganch = startpos;
750 if (prog->reganch & ROPT_ANCH_GPOS) {
751 if (s > PL_reg_ganch)
757 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
758 re_scream_pos_data d;
760 d.scream_olds = &scream_olds;
761 d.scream_pos = &scream_pos;
762 s = re_intuit_start(prog, sv, s, strend, flags, &d);
764 goto phooey; /* not present */
767 DEBUG_r( if (!PL_colorset) reginitcolors() );
768 DEBUG_r(PerlIO_printf(Perl_debug_log,
769 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
770 PL_colors[4],PL_colors[5],PL_colors[0],
773 (strlen(prog->precomp) > 60 ? "..." : ""),
775 (strend - startpos > 60 ? 60 : strend - startpos),
776 startpos, PL_colors[1],
777 (strend - startpos > 60 ? "..." : ""))
780 /* Simplest case: anchored match need be tried only once. */
781 /* [unless only anchor is BOL and multiline is set] */
782 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
783 if (s == startpos && regtry(prog, startpos))
785 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
786 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
791 dontbother = minlen - 1;
792 end = HOPc(strend, -dontbother) - 1;
793 /* for multiline we only have to try after newlines */
794 if (prog->check_substr) {
803 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
811 if (*s++ == '\n') { /* don't need PL_utf8skip here */
819 } else if (prog->reganch & ROPT_ANCH_GPOS) {
820 if (regtry(prog, PL_reg_ganch))
825 /* Messy cases: unanchored match. */
826 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
827 /* we have /x+whatever/ */
828 /* it must be a one character string (XXXX Except UTF?) */
829 char ch = SvPVX(prog->anchored_substr)[0];
833 if (regtry(prog, s)) goto got_it;
835 while (s < strend && *s == ch)
844 if (regtry(prog, s)) goto got_it;
846 while (s < strend && *s == ch)
854 else if (prog->anchored_substr != Nullsv
855 || (prog->float_substr != Nullsv
856 && prog->float_max_offset < strend - s)) {
857 SV *must = prog->anchored_substr
858 ? prog->anchored_substr : prog->float_substr;
860 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
862 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
863 I32 delta = back_max - back_min;
864 char *last = HOPc(strend, /* Cannot start after this */
865 -(I32)(CHR_SVLEN(must)
866 - (SvTAIL(must) != 0) + back_min));
867 char *last1; /* Last position checked before */
872 last1 = s - 1; /* bogus */
874 /* XXXX check_substr already used to find `s', can optimize if
875 check_substr==must. */
877 dontbother = end_shift;
878 strend = HOPc(strend, -dontbother);
879 while ( (s <= last) &&
880 ((flags & REXEC_SCREAM)
881 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
882 end_shift, &scream_pos, 0))
883 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
884 (unsigned char*)strend, must,
885 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
886 if (HOPc(s, -back_max) > last1) {
887 last1 = HOPc(s, -back_min);
888 s = HOPc(s, -back_max);
891 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
893 last1 = HOPc(s, -back_min);
913 else if (c = prog->regstclass) {
914 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
918 dontbother = minlen - 1;
919 strend = HOPc(strend, -dontbother); /* don't bother with what can't match */
921 /* We know what class it must start with. */
926 if (REGINCLASSUTF8(c, (U8*)s)) {
927 if (tmp && regtry(prog, s))
940 if (REGINCLASS(cc, *s)) {
941 if (tmp && regtry(prog, s))
952 PL_reg_flags |= RF_tainted;
959 tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev;
960 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
962 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
969 if ((minlen || tmp) && regtry(prog,s))
973 PL_reg_flags |= RF_tainted;
978 strend = reghop_c(strend, -1);
980 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev;
981 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
983 if (tmp == !(OP(c) == BOUND ?
984 swash_fetch(PL_utf8_alnum, (U8*)s) :
985 isALNUM_LC_utf8((U8*)s)))
993 if ((minlen || tmp) && regtry(prog,s))
997 PL_reg_flags |= RF_tainted;
1004 tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev;
1005 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1006 while (s < strend) {
1007 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1009 else if (regtry(prog, s))
1013 if ((minlen || !tmp) && regtry(prog,s))
1017 PL_reg_flags |= RF_tainted;
1022 strend = reghop_c(strend, -1);
1024 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev;
1025 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
1026 while (s < strend) {
1027 if (tmp == !(OP(c) == NBOUND ?
1028 swash_fetch(PL_utf8_alnum, (U8*)s) :
1029 isALNUM_LC_utf8((U8*)s)))
1031 else if (regtry(prog, s))
1035 if ((minlen || !tmp) && regtry(prog,s))
1039 while (s < strend) {
1041 if (tmp && regtry(prog, s))
1052 while (s < strend) {
1053 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1054 if (tmp && regtry(prog, s))
1065 PL_reg_flags |= RF_tainted;
1066 while (s < strend) {
1067 if (isALNUM_LC(*s)) {
1068 if (tmp && regtry(prog, s))
1079 PL_reg_flags |= RF_tainted;
1080 while (s < strend) {
1081 if (isALNUM_LC_utf8((U8*)s)) {
1082 if (tmp && regtry(prog, s))
1093 while (s < strend) {
1095 if (tmp && regtry(prog, s))
1106 while (s < strend) {
1107 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1108 if (tmp && regtry(prog, s))
1119 PL_reg_flags |= RF_tainted;
1120 while (s < strend) {
1121 if (!isALNUM_LC(*s)) {
1122 if (tmp && regtry(prog, s))
1133 PL_reg_flags |= RF_tainted;
1134 while (s < strend) {
1135 if (!isALNUM_LC_utf8((U8*)s)) {
1136 if (tmp && regtry(prog, s))
1147 while (s < strend) {
1149 if (tmp && regtry(prog, s))
1160 while (s < strend) {
1161 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1162 if (tmp && regtry(prog, s))
1173 PL_reg_flags |= RF_tainted;
1174 while (s < strend) {
1175 if (isSPACE_LC(*s)) {
1176 if (tmp && regtry(prog, s))
1187 PL_reg_flags |= RF_tainted;
1188 while (s < strend) {
1189 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1190 if (tmp && regtry(prog, s))
1201 while (s < strend) {
1203 if (tmp && regtry(prog, s))
1214 while (s < strend) {
1215 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1216 if (tmp && regtry(prog, s))
1227 PL_reg_flags |= RF_tainted;
1228 while (s < strend) {
1229 if (!isSPACE_LC(*s)) {
1230 if (tmp && regtry(prog, s))
1241 PL_reg_flags |= RF_tainted;
1242 while (s < strend) {
1243 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1244 if (tmp && regtry(prog, s))
1255 while (s < strend) {
1257 if (tmp && regtry(prog, s))
1268 while (s < strend) {
1269 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1270 if (tmp && regtry(prog, s))
1281 PL_reg_flags |= RF_tainted;
1282 while (s < strend) {
1283 if (isDIGIT_LC(*s)) {
1284 if (tmp && regtry(prog, s))
1295 PL_reg_flags |= RF_tainted;
1296 while (s < strend) {
1297 if (isDIGIT_LC_utf8((U8*)s)) {
1298 if (tmp && regtry(prog, s))
1309 while (s < strend) {
1311 if (tmp && regtry(prog, s))
1322 while (s < strend) {
1323 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1324 if (tmp && regtry(prog, s))
1335 PL_reg_flags |= RF_tainted;
1336 while (s < strend) {
1337 if (!isDIGIT_LC(*s)) {
1338 if (tmp && regtry(prog, s))
1349 PL_reg_flags |= RF_tainted;
1350 while (s < strend) {
1351 if (!isDIGIT_LC_utf8((U8*)s)) {
1352 if (tmp && regtry(prog, s))
1366 if (prog->float_substr != Nullsv) { /* Trim the end. */
1368 I32 oldpos = scream_pos;
1370 if (flags & REXEC_SCREAM) {
1371 last = screaminstr(sv, prog->float_substr, s - strbeg,
1372 end_shift, &scream_pos, 1); /* last one */
1374 last = scream_olds; /* Only one occurence. */
1378 char *little = SvPV(prog->float_substr, len);
1380 if (SvTAIL(prog->float_substr)) {
1381 if (memEQ(strend - len + 1, little, len - 1))
1382 last = strend - len + 1;
1383 else if (!PL_multiline)
1384 last = memEQ(strend - len, little, len)
1385 ? strend - len : Nullch;
1391 last = rninstr(s, strend, little, little + len);
1393 last = strend; /* matching `$' */
1396 if (last == NULL) goto phooey; /* Should not happen! */
1397 dontbother = strend - last + prog->float_min_offset;
1399 if (minlen && (dontbother < minlen))
1400 dontbother = minlen - 1;
1401 strend -= dontbother; /* this one's always in bytes! */
1402 /* We don't know much -- general case. */
1405 if (regtry(prog, s))
1414 if (regtry(prog, s))
1416 } while (s++ < strend);
1424 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1426 if (PL_reg_eval_set) {
1427 /* Preserve the current value of $^R */
1428 if (oreplsv != GvSV(PL_replgv))
1429 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1430 restored, the value remains
1432 restore_pos(aTHXo_ 0);
1435 /* make sure $`, $&, $', and $digit will work later */
1436 if ( !(flags & REXEC_NOT_FIRST) ) {
1437 if (RX_MATCH_COPIED(prog)) {
1438 Safefree(prog->subbeg);
1439 RX_MATCH_COPIED_off(prog);
1441 if (flags & REXEC_COPY_STR) {
1442 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1444 s = savepvn(strbeg, i);
1447 RX_MATCH_COPIED_on(prog);
1450 prog->subbeg = strbeg;
1451 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1458 if (PL_reg_eval_set)
1459 restore_pos(aTHXo_ 0);
1464 - regtry - try match at specific point
1466 STATIC I32 /* 0 failure, 1 success */
1467 S_regtry(pTHX_ regexp *prog, char *startpos)
1475 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1478 PL_reg_eval_set = RS_init;
1480 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %i\n",
1481 PL_stack_sp - PL_stack_base);
1483 SAVEINT(cxstack[cxstack_ix].blk_oldsp);
1484 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1485 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1487 /* Apparently this is not needed, judging by wantarray. */
1488 /* SAVEINT(cxstack[cxstack_ix].blk_gimme);
1489 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1492 /* Make $_ available to executed code. */
1493 if (PL_reg_sv != DEFSV) {
1494 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1499 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1500 && (mg = mg_find(PL_reg_sv, 'g')))) {
1501 /* prepare for quick setting of pos */
1502 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1503 mg = mg_find(PL_reg_sv, 'g');
1507 PL_reg_oldpos = mg->mg_len;
1508 SAVEDESTRUCTOR(restore_pos, 0);
1511 New(22,PL_reg_curpm, 1, PMOP);
1512 PL_reg_curpm->op_pmregexp = prog;
1513 PL_reg_oldcurpm = PL_curpm;
1514 PL_curpm = PL_reg_curpm;
1515 if (RX_MATCH_COPIED(prog)) {
1516 /* Here is a serious problem: we cannot rewrite subbeg,
1517 since it may be needed if this match fails. Thus
1518 $` inside (?{}) could fail... */
1519 PL_reg_oldsaved = prog->subbeg;
1520 PL_reg_oldsavedlen = prog->sublen;
1521 RX_MATCH_COPIED_off(prog);
1524 PL_reg_oldsaved = Nullch;
1525 prog->subbeg = PL_bostr;
1526 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1528 prog->startp[0] = startpos - PL_bostr;
1529 PL_reginput = startpos;
1530 PL_regstartp = prog->startp;
1531 PL_regendp = prog->endp;
1532 PL_reglastparen = &prog->lastparen;
1533 prog->lastparen = 0;
1535 DEBUG_r(PL_reg_starttry = startpos);
1536 if (PL_reg_start_tmpl <= prog->nparens) {
1537 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1538 if(PL_reg_start_tmp)
1539 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1541 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1544 /* XXXX What this code is doing here?!!! There should be no need
1545 to do this again and again, PL_reglastparen should take care of
1549 if (prog->nparens) {
1550 for (i = prog->nparens; i >= 1; i--) {
1556 if (regmatch(prog->program + 1)) {
1557 prog->endp[0] = PL_reginput - PL_bostr;
1565 - regmatch - main matching routine
1567 * Conceptually the strategy is simple: check to see whether the current
1568 * node matches, call self recursively to see whether the rest matches,
1569 * and then act accordingly. In practice we make some effort to avoid
1570 * recursion, in particular by going through "ordinary" nodes (that don't
1571 * need to know whether the rest of the match failed) by a loop instead of
1574 /* [lwall] I've hoisted the register declarations to the outer block in order to
1575 * maybe save a little bit of pushing and popping on the stack. It also takes
1576 * advantage of machines that use a register save mask on subroutine entry.
1578 STATIC I32 /* 0 failure, 1 success */
1579 S_regmatch(pTHX_ regnode *prog)
1582 register regnode *scan; /* Current node. */
1583 regnode *next; /* Next node. */
1584 regnode *inner; /* Next node in internal branch. */
1585 register I32 nextchr; /* renamed nextchr - nextchar colides with
1586 function of same name */
1587 register I32 n; /* no or next */
1588 register I32 ln; /* len or last */
1589 register char *s; /* operand or save */
1590 register char *locinput = PL_reginput;
1591 register I32 c1, c2, paren; /* case fold search, parenth */
1592 int minmod = 0, sw = 0, logical = 0;
1597 /* Note that nextchr is a byte even in UTF */
1598 nextchr = UCHARAT(locinput);
1600 while (scan != NULL) {
1601 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1603 # define sayYES goto yes
1604 # define sayNO goto no
1605 # define saySAME(x) if (x) goto yes; else goto no
1606 # define REPORT_CODE_OFF 24
1608 # define sayYES return 1
1609 # define sayNO return 0
1610 # define saySAME(x) return x
1613 SV *prop = sv_newmortal();
1614 int docolor = *PL_colors[0];
1615 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1616 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1617 /* The part of the string before starttry has one color
1618 (pref0_len chars), between starttry and current
1619 position another one (pref_len - pref0_len chars),
1620 after the current position the third one.
1621 We assume that pref0_len <= pref_len, otherwise we
1622 decrease pref0_len. */
1623 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1624 ? (5 + taill) - l : locinput - PL_bostr);
1625 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1627 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1628 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1629 ? (5 + taill) - pref_len : PL_regeol - locinput);
1632 if (pref0_len > pref_len)
1633 pref0_len = pref_len;
1634 regprop(prop, scan);
1635 PerlIO_printf(Perl_debug_log,
1636 "%4i <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3d:%*s%s\n",
1637 locinput - PL_bostr,
1638 PL_colors[4], pref0_len,
1639 locinput - pref_len, PL_colors[5],
1640 PL_colors[2], pref_len - pref0_len,
1641 locinput - pref_len + pref0_len, PL_colors[3],
1642 (docolor ? "" : "> <"),
1643 PL_colors[0], l, locinput, PL_colors[1],
1644 15 - l - pref_len + 1,
1646 scan - PL_regprogram, PL_regindent*2, "",
1650 next = scan + NEXT_OFF(scan);
1656 if (locinput == PL_bostr
1657 ? PL_regprev == '\n'
1659 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1661 /* regtill = regbol; */
1666 if (locinput == PL_bostr
1667 ? PL_regprev == '\n'
1668 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1674 if (locinput == PL_regbol && PL_regprev == '\n')
1678 if (locinput == PL_reg_ganch)
1688 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1693 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1695 if (PL_regeol - locinput > 1)
1699 if (PL_regeol != locinput)
1703 if (nextchr & 0x80) {
1704 locinput += PL_utf8skip[nextchr];
1705 if (locinput > PL_regeol)
1707 nextchr = UCHARAT(locinput);
1710 if (!nextchr && locinput >= PL_regeol)
1712 nextchr = UCHARAT(++locinput);
1715 if (!nextchr && locinput >= PL_regeol)
1717 nextchr = UCHARAT(++locinput);
1720 if (nextchr & 0x80) {
1721 locinput += PL_utf8skip[nextchr];
1722 if (locinput > PL_regeol)
1724 nextchr = UCHARAT(locinput);
1727 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1729 nextchr = UCHARAT(++locinput);
1732 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1734 nextchr = UCHARAT(++locinput);
1739 /* Inline the first character, for speed. */
1740 if (UCHARAT(s) != nextchr)
1742 if (PL_regeol - locinput < ln)
1744 if (ln > 1 && memNE(s, locinput, ln))
1747 nextchr = UCHARAT(locinput);
1750 PL_reg_flags |= RF_tainted;
1759 c1 = OP(scan) == EXACTF;
1763 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1764 toLOWER_utf8((U8*)l) :
1765 toLOWER_LC_utf8((U8*)l)))
1773 nextchr = UCHARAT(locinput);
1777 /* Inline the first character, for speed. */
1778 if (UCHARAT(s) != nextchr &&
1779 UCHARAT(s) != ((OP(scan) == EXACTF)
1780 ? PL_fold : PL_fold_locale)[nextchr])
1782 if (PL_regeol - locinput < ln)
1784 if (ln > 1 && (OP(scan) == EXACTF
1785 ? ibcmp(s, locinput, ln)
1786 : ibcmp_locale(s, locinput, ln)))
1789 nextchr = UCHARAT(locinput);
1793 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1795 if (locinput >= PL_regeol)
1797 locinput += PL_utf8skip[nextchr];
1798 nextchr = UCHARAT(locinput);
1803 nextchr = UCHARAT(locinput);
1804 if (!REGINCLASS(s, nextchr))
1806 if (!nextchr && locinput >= PL_regeol)
1808 nextchr = UCHARAT(++locinput);
1811 PL_reg_flags |= RF_tainted;
1816 if (!(OP(scan) == ALNUM
1817 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1819 nextchr = UCHARAT(++locinput);
1822 PL_reg_flags |= RF_tainted;
1827 if (nextchr & 0x80) {
1828 if (!(OP(scan) == ALNUMUTF8
1829 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1830 : isALNUM_LC_utf8((U8*)locinput)))
1834 locinput += PL_utf8skip[nextchr];
1835 nextchr = UCHARAT(locinput);
1838 if (!(OP(scan) == ALNUMUTF8
1839 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1841 nextchr = UCHARAT(++locinput);
1844 PL_reg_flags |= RF_tainted;
1847 if (!nextchr && locinput >= PL_regeol)
1849 if (OP(scan) == NALNUM
1850 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1852 nextchr = UCHARAT(++locinput);
1855 PL_reg_flags |= RF_tainted;
1858 if (!nextchr && locinput >= PL_regeol)
1860 if (nextchr & 0x80) {
1861 if (OP(scan) == NALNUMUTF8
1862 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1863 : isALNUM_LC_utf8((U8*)locinput))
1867 locinput += PL_utf8skip[nextchr];
1868 nextchr = UCHARAT(locinput);
1871 if (OP(scan) == NALNUMUTF8
1872 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1874 nextchr = UCHARAT(++locinput);
1878 PL_reg_flags |= RF_tainted;
1882 /* was last char in word? */
1883 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
1884 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
1886 n = isALNUM(nextchr);
1889 ln = isALNUM_LC(ln);
1890 n = isALNUM_LC(nextchr);
1892 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
1897 PL_reg_flags |= RF_tainted;
1901 /* was last char in word? */
1902 ln = (locinput != PL_regbol)
1903 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
1904 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
1905 ln = isALNUM_uni(ln);
1906 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
1909 ln = isALNUM_LC_uni(ln);
1910 n = isALNUM_LC_utf8((U8*)locinput);
1912 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
1916 PL_reg_flags |= RF_tainted;
1919 if (!nextchr && locinput >= PL_regeol)
1921 if (!(OP(scan) == SPACE
1922 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
1924 nextchr = UCHARAT(++locinput);
1927 PL_reg_flags |= RF_tainted;
1930 if (!nextchr && locinput >= PL_regeol)
1932 if (nextchr & 0x80) {
1933 if (!(OP(scan) == SPACEUTF8
1934 ? swash_fetch(PL_utf8_space,(U8*)locinput)
1935 : isSPACE_LC_utf8((U8*)locinput)))
1939 locinput += PL_utf8skip[nextchr];
1940 nextchr = UCHARAT(locinput);
1943 if (!(OP(scan) == SPACEUTF8
1944 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
1946 nextchr = UCHARAT(++locinput);
1949 PL_reg_flags |= RF_tainted;
1954 if (OP(scan) == SPACE
1955 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
1957 nextchr = UCHARAT(++locinput);
1960 PL_reg_flags |= RF_tainted;
1965 if (nextchr & 0x80) {
1966 if (OP(scan) == NSPACEUTF8
1967 ? swash_fetch(PL_utf8_space,(U8*)locinput)
1968 : isSPACE_LC_utf8((U8*)locinput))
1972 locinput += PL_utf8skip[nextchr];
1973 nextchr = UCHARAT(locinput);
1976 if (OP(scan) == NSPACEUTF8
1977 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
1979 nextchr = UCHARAT(++locinput);
1982 PL_reg_flags |= RF_tainted;
1985 if (!nextchr && locinput >= PL_regeol)
1987 if (!(OP(scan) == DIGIT
1988 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
1990 nextchr = UCHARAT(++locinput);
1993 PL_reg_flags |= RF_tainted;
1998 if (nextchr & 0x80) {
1999 if (OP(scan) == NDIGITUTF8
2000 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2001 : isDIGIT_LC_utf8((U8*)locinput))
2005 locinput += PL_utf8skip[nextchr];
2006 nextchr = UCHARAT(locinput);
2009 if (!isDIGIT(nextchr))
2011 nextchr = UCHARAT(++locinput);
2014 PL_reg_flags |= RF_tainted;
2019 if (OP(scan) == DIGIT
2020 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2022 nextchr = UCHARAT(++locinput);
2025 PL_reg_flags |= RF_tainted;
2028 if (!nextchr && locinput >= PL_regeol)
2030 if (nextchr & 0x80) {
2031 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2033 locinput += PL_utf8skip[nextchr];
2034 nextchr = UCHARAT(locinput);
2037 if (isDIGIT(nextchr))
2039 nextchr = UCHARAT(++locinput);
2042 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2044 locinput += PL_utf8skip[nextchr];
2045 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2046 locinput += UTF8SKIP(locinput);
2047 if (locinput > PL_regeol)
2049 nextchr = UCHARAT(locinput);
2052 PL_reg_flags |= RF_tainted;
2056 n = ARG(scan); /* which paren pair */
2057 ln = PL_regstartp[n];
2058 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2059 if (*PL_reglastparen < n || ln == -1)
2060 sayNO; /* Do not match unless seen CLOSEn. */
2061 if (ln == PL_regendp[n])
2065 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2067 char *e = PL_bostr + PL_regendp[n];
2069 * Note that we can't do the "other character" lookup trick as
2070 * in the 8-bit case (no pun intended) because in Unicode we
2071 * have to map both upper and title case to lower case.
2073 if (OP(scan) == REFF) {
2077 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2087 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2094 nextchr = UCHARAT(locinput);
2098 /* Inline the first character, for speed. */
2099 if (UCHARAT(s) != nextchr &&
2101 (UCHARAT(s) != ((OP(scan) == REFF
2102 ? PL_fold : PL_fold_locale)[nextchr]))))
2104 ln = PL_regendp[n] - ln;
2105 if (locinput + ln > PL_regeol)
2107 if (ln > 1 && (OP(scan) == REF
2108 ? memNE(s, locinput, ln)
2110 ? ibcmp(s, locinput, ln)
2111 : ibcmp_locale(s, locinput, ln))))
2114 nextchr = UCHARAT(locinput);
2125 OP_4tree *oop = PL_op;
2126 COP *ocurcop = PL_curcop;
2127 SV **ocurpad = PL_curpad;
2131 PL_op = (OP_4tree*)PL_regdata->data[n];
2132 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%x\n", PL_op) );
2133 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2134 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2136 CALLRUNOPS(aTHX); /* Scalar context. */
2142 PL_curpad = ocurpad;
2143 PL_curcop = ocurcop;
2145 if (logical == 2) { /* Postponed subexpression. */
2147 MAGIC *mg = Null(MAGIC*);
2149 CHECKPOINT cp, lastcp;
2151 if(SvROK(ret) || SvRMAGICAL(ret)) {
2152 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2155 mg = mg_find(sv, 'r');
2158 re = (regexp *)mg->mg_obj;
2159 (void)ReREFCNT_inc(re);
2163 char *t = SvPV(ret, len);
2165 char *oprecomp = PL_regprecomp;
2166 I32 osize = PL_regsize;
2167 I32 onpar = PL_regnpar;
2170 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2172 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2173 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2174 PL_regprecomp = oprecomp;
2179 PerlIO_printf(Perl_debug_log,
2180 "Entering embedded `%s%.60s%s%s'\n",
2184 (strlen(re->precomp) > 60 ? "..." : ""))
2187 state.prev = PL_reg_call_cc;
2188 state.cc = PL_regcc;
2189 state.re = PL_reg_re;
2193 cp = regcppush(0); /* Save *all* the positions. */
2196 state.ss = PL_savestack_ix;
2197 *PL_reglastparen = 0;
2198 PL_reg_call_cc = &state;
2199 PL_reginput = locinput;
2201 /* XXXX This is too dramatic a measure... */
2204 if (regmatch(re->program + 1)) {
2205 /* Even though we succeeded, we need to restore
2206 global variables, since we may be wrapped inside
2207 SUSPEND, thus the match may be not finished yet. */
2209 /* XXXX Do this only if SUSPENDed? */
2210 PL_reg_call_cc = state.prev;
2211 PL_regcc = state.cc;
2212 PL_reg_re = state.re;
2213 cache_re(PL_reg_re);
2215 /* XXXX This is too dramatic a measure... */
2218 /* These are needed even if not SUSPEND. */
2224 PerlIO_printf(Perl_debug_log,
2226 REPORT_CODE_OFF+PL_regindent*2, "")
2231 PL_reg_call_cc = state.prev;
2232 PL_regcc = state.cc;
2233 PL_reg_re = state.re;
2234 cache_re(PL_reg_re);
2236 /* XXXX This is too dramatic a measure... */
2245 sv_setsv(save_scalar(PL_replgv), ret);
2249 n = ARG(scan); /* which paren pair */
2250 PL_reg_start_tmp[n] = locinput;
2255 n = ARG(scan); /* which paren pair */
2256 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2257 PL_regendp[n] = locinput - PL_bostr;
2258 if (n > *PL_reglastparen)
2259 *PL_reglastparen = n;
2262 n = ARG(scan); /* which paren pair */
2263 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2266 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2268 next = NEXTOPER(NEXTOPER(scan));
2270 next = scan + ARG(scan);
2271 if (OP(next) == IFTHEN) /* Fake one. */
2272 next = NEXTOPER(NEXTOPER(next));
2276 logical = scan->flags;
2278 /*******************************************************************
2279 PL_regcc contains infoblock about the innermost (...)* loop, and
2280 a pointer to the next outer infoblock.
2282 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2284 1) After matching X, regnode for CURLYX is processed;
2286 2) This regnode creates infoblock on the stack, and calls
2287 regmatch() recursively with the starting point at WHILEM node;
2289 3) Each hit of WHILEM node tries to match A and Z (in the order
2290 depending on the current iteration, min/max of {min,max} and
2291 greediness). The information about where are nodes for "A"
2292 and "Z" is read from the infoblock, as is info on how many times "A"
2293 was already matched, and greediness.
2295 4) After A matches, the same WHILEM node is hit again.
2297 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2298 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2299 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2300 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2301 of the external loop.
2303 Currently present infoblocks form a tree with a stem formed by PL_curcc
2304 and whatever it mentions via ->next, and additional attached trees
2305 corresponding to temporarily unset infoblocks as in "5" above.
2307 In the following picture infoblocks for outer loop of
2308 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2309 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2310 infoblocks are drawn below the "reset" infoblock.
2312 In fact in the picture below we do not show failed matches for Z and T
2313 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2314 more obvious *why* one needs to *temporary* unset infoblocks.]
2316 Matched REx position InfoBlocks Comment
2320 Y A)*?Z)*?T x <- O <- I
2321 YA )*?Z)*?T x <- O <- I
2322 YA A)*?Z)*?T x <- O <- I
2323 YAA )*?Z)*?T x <- O <- I
2324 YAA Z)*?T x <- O # Temporary unset I
2327 YAAZ Y(A)*?Z)*?T x <- O
2330 YAAZY (A)*?Z)*?T x <- O
2333 YAAZY A)*?Z)*?T x <- O <- I
2336 YAAZYA )*?Z)*?T x <- O <- I
2339 YAAZYA Z)*?T x <- O # Temporary unset I
2345 YAAZYAZ T x # Temporary unset O
2352 *******************************************************************/
2355 CHECKPOINT cp = PL_savestack_ix;
2357 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2359 cc.oldcc = PL_regcc;
2361 cc.parenfloor = *PL_reglastparen;
2363 cc.min = ARG1(scan);
2364 cc.max = ARG2(scan);
2365 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2369 PL_reginput = locinput;
2370 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2372 PL_regcc = cc.oldcc;
2378 * This is really hard to understand, because after we match
2379 * what we're trying to match, we must make sure the rest of
2380 * the REx is going to match for sure, and to do that we have
2381 * to go back UP the parse tree by recursing ever deeper. And
2382 * if it fails, we have to reset our parent's current state
2383 * that we can try again after backing off.
2386 CHECKPOINT cp, lastcp;
2387 CURCUR* cc = PL_regcc;
2388 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2390 n = cc->cur + 1; /* how many we know we matched */
2391 PL_reginput = locinput;
2394 PerlIO_printf(Perl_debug_log,
2395 "%*s %ld out of %ld..%ld cc=%lx\n",
2396 REPORT_CODE_OFF+PL_regindent*2, "",
2397 (long)n, (long)cc->min,
2398 (long)cc->max, (long)cc)
2401 /* If degenerate scan matches "", assume scan done. */
2403 if (locinput == cc->lastloc && n >= cc->min) {
2404 PL_regcc = cc->oldcc;
2408 PerlIO_printf(Perl_debug_log,
2409 "%*s empty match detected, try continuation...\n",
2410 REPORT_CODE_OFF+PL_regindent*2, "")
2412 if (regmatch(cc->next))
2415 PerlIO_printf(Perl_debug_log,
2417 REPORT_CODE_OFF+PL_regindent*2, "")
2425 /* First just match a string of min scans. */
2429 cc->lastloc = locinput;
2430 if (regmatch(cc->scan))
2433 cc->lastloc = lastloc;
2435 PerlIO_printf(Perl_debug_log,
2437 REPORT_CODE_OFF+PL_regindent*2, "")
2443 /* Check whether we already were at this position.
2444 Postpone detection until we know the match is not
2445 *that* much linear. */
2446 if (!PL_reg_maxiter) {
2447 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2448 PL_reg_leftiter = PL_reg_maxiter;
2450 if (PL_reg_leftiter-- == 0) {
2451 I32 size = (PL_reg_maxiter + 7)/8;
2452 if (PL_reg_poscache) {
2453 if (PL_reg_poscache_size < size) {
2454 Renew(PL_reg_poscache, size, char);
2455 PL_reg_poscache_size = size;
2457 Zero(PL_reg_poscache, size, char);
2460 PL_reg_poscache_size = size;
2461 Newz(29, PL_reg_poscache, size, char);
2464 PerlIO_printf(Perl_debug_log,
2465 "%sDetected a super-linear match, switching on caching%s...\n",
2466 PL_colors[4], PL_colors[5])
2469 if (PL_reg_leftiter < 0) {
2470 I32 o = locinput - PL_bostr, b;
2472 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2475 if (PL_reg_poscache[o] & (1<<b)) {
2477 PerlIO_printf(Perl_debug_log,
2478 "%*s already tried at this position...\n",
2479 REPORT_CODE_OFF+PL_regindent*2, "")
2483 PL_reg_poscache[o] |= (1<<b);
2487 /* Prefer next over scan for minimal matching. */
2490 PL_regcc = cc->oldcc;
2493 cp = regcppush(cc->parenfloor);
2495 if (regmatch(cc->next)) {
2497 sayYES; /* All done. */
2505 if (n >= cc->max) { /* Maximum greed exceeded? */
2506 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2507 && !(PL_reg_flags & RF_warned)) {
2508 PL_reg_flags |= RF_warned;
2509 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2510 "Complex regular subexpression recursion",
2517 PerlIO_printf(Perl_debug_log,
2518 "%*s trying longer...\n",
2519 REPORT_CODE_OFF+PL_regindent*2, "")
2521 /* Try scanning more and see if it helps. */
2522 PL_reginput = locinput;
2524 cc->lastloc = locinput;
2525 cp = regcppush(cc->parenfloor);
2527 if (regmatch(cc->scan)) {
2532 PerlIO_printf(Perl_debug_log,
2534 REPORT_CODE_OFF+PL_regindent*2, "")
2539 cc->lastloc = lastloc;
2543 /* Prefer scan over next for maximal matching. */
2545 if (n < cc->max) { /* More greed allowed? */
2546 cp = regcppush(cc->parenfloor);
2548 cc->lastloc = locinput;
2550 if (regmatch(cc->scan)) {
2555 regcppop(); /* Restore some previous $<digit>s? */
2556 PL_reginput = locinput;
2558 PerlIO_printf(Perl_debug_log,
2559 "%*s failed, try continuation...\n",
2560 REPORT_CODE_OFF+PL_regindent*2, "")
2563 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2564 && !(PL_reg_flags & RF_warned)) {
2565 PL_reg_flags |= RF_warned;
2566 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2567 "Complex regular subexpression recursion",
2571 /* Failed deeper matches of scan, so see if this one works. */
2572 PL_regcc = cc->oldcc;
2575 if (regmatch(cc->next))
2578 PerlIO_printf(Perl_debug_log, "%*s failed...\n",
2579 REPORT_CODE_OFF+PL_regindent*2, "")
2585 cc->lastloc = lastloc;
2590 next = scan + ARG(scan);
2593 inner = NEXTOPER(NEXTOPER(scan));
2596 inner = NEXTOPER(scan);
2601 if (OP(next) != c1) /* No choice. */
2602 next = inner; /* Avoid recursion. */
2604 int lastparen = *PL_reglastparen;
2608 PL_reginput = locinput;
2609 if (regmatch(inner))
2612 for (n = *PL_reglastparen; n > lastparen; n--)
2614 *PL_reglastparen = n;
2617 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2621 inner = NEXTOPER(scan);
2622 if (c1 == BRANCHJ) {
2623 inner = NEXTOPER(inner);
2625 } while (scan != NULL && OP(scan) == c1);
2639 /* We suppose that the next guy does not need
2640 backtracking: in particular, it is of constant length,
2641 and has no parenths to influence future backrefs. */
2642 ln = ARG1(scan); /* min to match */
2643 n = ARG2(scan); /* max to match */
2644 paren = scan->flags;
2646 if (paren > PL_regsize)
2648 if (paren > *PL_reglastparen)
2649 *PL_reglastparen = paren;
2651 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2653 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2654 PL_reginput = locinput;
2657 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2659 if (ln && l == 0 && n >= ln
2660 /* In fact, this is tricky. If paren, then the
2661 fact that we did/didnot match may influence
2662 future execution. */
2663 && !(paren && ln == 0))
2665 locinput = PL_reginput;
2666 if (PL_regkind[(U8)OP(next)] == EXACT) {
2667 c1 = (U8)*STRING(next);
2668 if (OP(next) == EXACTF)
2670 else if (OP(next) == EXACTFL)
2671 c2 = PL_fold_locale[c1];
2678 /* This may be improved if l == 0. */
2679 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2680 /* If it could work, try it. */
2682 UCHARAT(PL_reginput) == c1 ||
2683 UCHARAT(PL_reginput) == c2)
2687 PL_regstartp[paren] =
2688 HOPc(PL_reginput, -l) - PL_bostr;
2689 PL_regendp[paren] = PL_reginput - PL_bostr;
2692 PL_regendp[paren] = -1;
2698 /* Couldn't or didn't -- move forward. */
2699 PL_reginput = locinput;
2700 if (regrepeat_hard(scan, 1, &l)) {
2702 locinput = PL_reginput;
2709 n = regrepeat_hard(scan, n, &l);
2710 if (n != 0 && l == 0
2711 /* In fact, this is tricky. If paren, then the
2712 fact that we did/didnot match may influence
2713 future execution. */
2714 && !(paren && ln == 0))
2716 locinput = PL_reginput;
2718 PerlIO_printf(Perl_debug_log,
2719 "%*s matched %ld times, len=%ld...\n",
2720 REPORT_CODE_OFF+PL_regindent*2, "", n, l)
2723 if (PL_regkind[(U8)OP(next)] == EXACT) {
2724 c1 = (U8)*STRING(next);
2725 if (OP(next) == EXACTF)
2727 else if (OP(next) == EXACTFL)
2728 c2 = PL_fold_locale[c1];
2737 /* If it could work, try it. */
2739 UCHARAT(PL_reginput) == c1 ||
2740 UCHARAT(PL_reginput) == c2)
2743 PerlIO_printf(Perl_debug_log,
2744 "%*s trying tail with n=%ld...\n",
2745 REPORT_CODE_OFF+PL_regindent*2, "", n)
2749 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2750 PL_regendp[paren] = PL_reginput - PL_bostr;
2753 PL_regendp[paren] = -1;
2759 /* Couldn't or didn't -- back up. */
2761 locinput = HOPc(locinput, -l);
2762 PL_reginput = locinput;
2769 paren = scan->flags; /* Which paren to set */
2770 if (paren > PL_regsize)
2772 if (paren > *PL_reglastparen)
2773 *PL_reglastparen = paren;
2774 ln = ARG1(scan); /* min to match */
2775 n = ARG2(scan); /* max to match */
2776 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2780 ln = ARG1(scan); /* min to match */
2781 n = ARG2(scan); /* max to match */
2782 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2787 scan = NEXTOPER(scan);
2793 scan = NEXTOPER(scan);
2797 * Lookahead to avoid useless match attempts
2798 * when we know what character comes next.
2800 if (PL_regkind[(U8)OP(next)] == EXACT) {
2801 c1 = (U8)*STRING(next);
2802 if (OP(next) == EXACTF)
2804 else if (OP(next) == EXACTFL)
2805 c2 = PL_fold_locale[c1];
2811 PL_reginput = locinput;
2815 if (ln && regrepeat(scan, ln) < ln)
2817 locinput = PL_reginput;
2820 char *e = locinput + n - ln; /* Should not check after this */
2821 char *old = locinput;
2823 if (e >= PL_regeol || (n == REG_INFTY))
2826 /* Find place 'next' could work */
2828 while (locinput <= e && *locinput != c1)
2831 while (locinput <= e
2838 /* PL_reginput == old now */
2839 if (locinput != old) {
2840 ln = 1; /* Did some */
2841 if (regrepeat(scan, locinput - old) <
2845 /* PL_reginput == locinput now */
2848 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2849 PL_regendp[paren] = locinput - PL_bostr;
2852 PL_regendp[paren] = -1;
2856 PL_reginput = locinput; /* Could be reset... */
2858 /* Couldn't or didn't -- move forward. */
2863 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
2864 /* If it could work, try it. */
2866 UCHARAT(PL_reginput) == c1 ||
2867 UCHARAT(PL_reginput) == c2)
2871 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2872 PL_regendp[paren] = PL_reginput - PL_bostr;
2875 PL_regendp[paren] = -1;
2881 /* Couldn't or didn't -- move forward. */
2882 PL_reginput = locinput;
2883 if (regrepeat(scan, 1)) {
2885 locinput = PL_reginput;
2893 n = regrepeat(scan, n);
2894 locinput = PL_reginput;
2895 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
2896 (!PL_multiline || OP(next) == SEOL))
2897 ln = n; /* why back off? */
2901 /* If it could work, try it. */
2903 UCHARAT(PL_reginput) == c1 ||
2904 UCHARAT(PL_reginput) == c2)
2908 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2909 PL_regendp[paren] = PL_reginput - PL_bostr;
2912 PL_regendp[paren] = -1;
2918 /* Couldn't or didn't -- back up. */
2920 PL_reginput = locinput = HOPc(locinput, -1);
2925 /* If it could work, try it. */
2927 UCHARAT(PL_reginput) == c1 ||
2928 UCHARAT(PL_reginput) == c2)
2934 /* Couldn't or didn't -- back up. */
2936 PL_reginput = locinput = HOPc(locinput, -1);
2943 if (PL_reg_call_cc) {
2944 re_cc_state *cur_call_cc = PL_reg_call_cc;
2945 CURCUR *cctmp = PL_regcc;
2946 regexp *re = PL_reg_re;
2947 CHECKPOINT cp, lastcp;
2949 cp = regcppush(0); /* Save *all* the positions. */
2951 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
2953 PL_reginput = locinput; /* Make position available to
2955 cache_re(PL_reg_call_cc->re);
2956 PL_regcc = PL_reg_call_cc->cc;
2957 PL_reg_call_cc = PL_reg_call_cc->prev;
2958 if (regmatch(cur_call_cc->node)) {
2959 PL_reg_call_cc = cur_call_cc;
2965 PL_reg_call_cc = cur_call_cc;
2971 PerlIO_printf(Perl_debug_log,
2972 "%*s continuation failed...\n",
2973 REPORT_CODE_OFF+PL_regindent*2, "")
2977 if (locinput < PL_regtill)
2978 sayNO; /* Cannot match: too short. */
2981 PL_reginput = locinput; /* put where regtry can find it */
2982 sayYES; /* Success! */
2985 PL_reginput = locinput;
2990 if (UTF) { /* XXXX This is absolutely
2991 broken, we read before
2993 s = HOPMAYBEc(locinput, -scan->flags);
2999 if (locinput < PL_bostr + scan->flags)
3001 PL_reginput = locinput - scan->flags;
3006 PL_reginput = locinput;
3011 if (UTF) { /* XXXX This is absolutely
3012 broken, we read before
3014 s = HOPMAYBEc(locinput, -scan->flags);
3015 if (!s || s < PL_bostr)
3020 if (locinput < PL_bostr + scan->flags)
3022 PL_reginput = locinput - scan->flags;
3027 PL_reginput = locinput;
3030 inner = NEXTOPER(NEXTOPER(scan));
3031 if (regmatch(inner) != n) {
3046 if (OP(scan) == SUSPEND) {
3047 locinput = PL_reginput;
3048 nextchr = UCHARAT(locinput);
3053 next = scan + ARG(scan);
3058 PerlIO_printf(PerlIO_stderr(), "%lx %d\n",
3059 (unsigned long)scan, OP(scan));
3060 Perl_croak(aTHX_ "regexp memory corruption");
3066 * We get here only if there's trouble -- normally "case END" is
3067 * the terminating point.
3069 Perl_croak(aTHX_ "corrupted regexp pointers");
3087 - regrepeat - repeatedly match something simple, report how many
3090 * [This routine now assumes that it will only match on things of length 1.
3091 * That was true before, but now we assume scan - reginput is the count,
3092 * rather than incrementing count on every character. [Er, except utf8.]]
3095 S_regrepeat(pTHX_ regnode *p, I32 max)
3098 register char *scan;
3099 register char *opnd;
3101 register char *loceol = PL_regeol;
3102 register I32 hardcount = 0;
3105 if (max != REG_INFTY && max < loceol - scan)
3106 loceol = scan + max;
3109 while (scan < loceol && *scan != '\n')
3117 while (scan < loceol && *scan != '\n') {
3118 scan += UTF8SKIP(scan);
3124 while (scan < loceol) {
3125 scan += UTF8SKIP(scan);
3129 case EXACT: /* length of string is 1 */
3131 while (scan < loceol && UCHARAT(scan) == c)
3134 case EXACTF: /* length of string is 1 */
3136 while (scan < loceol &&
3137 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3140 case EXACTFL: /* length of string is 1 */
3141 PL_reg_flags |= RF_tainted;
3143 while (scan < loceol &&
3144 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3149 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3150 scan += UTF8SKIP(scan);
3156 while (scan < loceol && REGINCLASS(opnd, *scan))
3160 while (scan < loceol && isALNUM(*scan))
3165 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3166 scan += UTF8SKIP(scan);
3171 PL_reg_flags |= RF_tainted;
3172 while (scan < loceol && isALNUM_LC(*scan))
3176 PL_reg_flags |= RF_tainted;
3178 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3179 scan += UTF8SKIP(scan);
3185 while (scan < loceol && !isALNUM(*scan))
3190 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3191 scan += UTF8SKIP(scan);
3196 PL_reg_flags |= RF_tainted;
3197 while (scan < loceol && !isALNUM_LC(*scan))
3201 PL_reg_flags |= RF_tainted;
3203 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3204 scan += UTF8SKIP(scan);
3209 while (scan < loceol && isSPACE(*scan))
3214 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3215 scan += UTF8SKIP(scan);
3220 PL_reg_flags |= RF_tainted;
3221 while (scan < loceol && isSPACE_LC(*scan))
3225 PL_reg_flags |= RF_tainted;
3227 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3228 scan += UTF8SKIP(scan);
3233 while (scan < loceol && !isSPACE(*scan))
3238 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3239 scan += UTF8SKIP(scan);
3244 PL_reg_flags |= RF_tainted;
3245 while (scan < loceol && !isSPACE_LC(*scan))
3249 PL_reg_flags |= RF_tainted;
3251 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3252 scan += UTF8SKIP(scan);
3257 while (scan < loceol && isDIGIT(*scan))
3262 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3263 scan += UTF8SKIP(scan);
3269 while (scan < loceol && !isDIGIT(*scan))
3274 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3275 scan += UTF8SKIP(scan);
3279 default: /* Called on something of 0 width. */
3280 break; /* So match right here or not at all. */
3286 c = scan - PL_reginput;
3291 SV *prop = sv_newmortal();
3294 PerlIO_printf(Perl_debug_log,
3295 "%*s %s can match %ld times out of %ld...\n",
3296 REPORT_CODE_OFF+1, "", SvPVX(prop),c,max);
3303 - regrepeat_hard - repeatedly match something, report total lenth and length
3305 * The repeater is supposed to have constant length.
3309 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3312 register char *scan;
3313 register char *start;
3314 register char *loceol = PL_regeol;
3316 I32 count = 0, res = 1;
3321 start = PL_reginput;
3323 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3326 while (start < PL_reginput) {
3328 start += UTF8SKIP(start);
3339 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3341 *lp = l = PL_reginput - start;
3342 if (max != REG_INFTY && l*max < loceol - scan)
3343 loceol = scan + l*max;
3356 - reginclass - determine if a character falls into a character class
3360 S_reginclass(pTHX_ register char *p, register I32 c)
3363 char flags = ANYOF_FLAGS(p);
3367 if (ANYOF_BITMAP_TEST(p, c))
3369 else if (flags & ANYOF_FOLD) {
3371 if (flags & ANYOF_LOCALE) {
3372 PL_reg_flags |= RF_tainted;
3373 cf = PL_fold_locale[c];
3377 if (ANYOF_BITMAP_TEST(p, cf))
3381 if (!match && (flags & ANYOF_CLASS)) {
3382 PL_reg_flags |= RF_tainted;
3384 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3385 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3386 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3387 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3388 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3389 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3390 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3391 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3392 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3393 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3394 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3395 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3396 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3397 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3398 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3399 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3400 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3401 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3402 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3403 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3404 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3405 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3406 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3407 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3408 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3409 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3410 ) /* How's that for a conditional? */
3416 return (flags & ANYOF_INVERT) ? !match : match;
3420 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3423 char flags = ARG1(f);
3425 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3427 if (swash_fetch(sv, p))
3429 else if (flags & ANYOF_FOLD) {
3432 if (flags & ANYOF_LOCALE) {
3433 PL_reg_flags |= RF_tainted;
3434 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3437 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3438 if (swash_fetch(sv, tmpbuf))
3442 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3444 return (flags & ANYOF_INVERT) ? !match : match;
3448 S_reghop(pTHX_ U8 *s, I32 off)
3452 while (off-- && s < (U8*)PL_regeol)
3457 if (s > (U8*)PL_bostr) {
3460 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3462 } /* XXX could check well-formedness here */
3470 S_reghopmaybe(pTHX_ U8* s, I32 off)
3474 while (off-- && s < (U8*)PL_regeol)
3481 if (s > (U8*)PL_bostr) {
3484 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3486 } /* XXX could check well-formedness here */
3503 restore_pos(pTHXo_ void *arg)
3506 if (PL_reg_eval_set) {
3507 if (PL_reg_oldsaved) {
3508 PL_reg_re->subbeg = PL_reg_oldsaved;
3509 PL_reg_re->sublen = PL_reg_oldsavedlen;
3510 RX_MATCH_COPIED_on(PL_reg_re);
3512 PL_reg_magic->mg_len = PL_reg_oldpos;
3513 PL_reg_eval_set = 0;
3514 PL_curpm = PL_reg_oldcurpm;