5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%i\n", \
150 PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%i..%i\n", \
155 lastcp, PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%d to %d(%d)..%d%s\n",
180 paren, PL_regstartp[paren],
181 PL_reg_start_tmp[paren] - PL_bostr,
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%d..\\%d to undef\n",
190 *PL_reglastparen + 1, PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* A failure to find a constant substring means that there is no need to make
279 an expensive call to REx engine, thus we celebrate a failure. Similarly,
280 finding a substring too deep into the string means that less calls to
281 regtry() should be needed.
283 REx compiler's optimizer found 4 possible hints:
284 a) Anchored substring;
286 c) Whether we are anchored (beginning-of-line or \G);
287 d) First node (of those at offset 0) which may distingush positions;
288 We use 'a', 'b', multiline-part of 'c', and try to find a position in the
289 string which does not contradict any of them.
293 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
294 char *strend, U32 flags, re_scream_pos_data *data)
296 register I32 start_shift;
297 /* Should be nonnegative! */
298 register I32 end_shift;
304 register char *other_last = Nullch;
306 char *i_strpos = strpos;
309 DEBUG_r( if (!PL_colorset) reginitcolors() );
310 DEBUG_r(PerlIO_printf(Perl_debug_log,
311 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
312 PL_colors[4],PL_colors[5],PL_colors[0],
315 (strlen(prog->precomp) > 60 ? "..." : ""),
317 (strend - strpos > 60 ? 60 : strend - strpos),
318 strpos, PL_colors[1],
319 (strend - strpos > 60 ? "..." : ""))
322 if (prog->minlen > strend - strpos) {
323 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
326 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
327 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
328 || ( (prog->reganch & ROPT_ANCH_BOL)
329 && !PL_multiline ) ); /* Check after \n? */
331 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
332 /* Substring at constant offset from beg-of-str... */
335 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
336 && (sv && (strpos + SvCUR(sv) != strend)) ) {
337 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
340 PL_regeol = strend; /* Used in HOP() */
341 s = HOPc(strpos, prog->check_offset_min);
342 if (SvTAIL(prog->check_substr)) {
343 slen = SvCUR(prog->check_substr); /* >= 1 */
345 if ( strend - s > slen || strend - s < slen - 1
346 || (strend - s == slen && strend[-1] != '\n')) {
347 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
350 /* Now should match s[0..slen-2] */
352 if (slen && (*SvPVX(prog->check_substr) != *s
354 && memNE(SvPVX(prog->check_substr), s, slen)))) {
356 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
360 else if (*SvPVX(prog->check_substr) != *s
361 || ((slen = SvCUR(prog->check_substr)) > 1
362 && memNE(SvPVX(prog->check_substr), s, slen)))
364 goto success_at_start;
366 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
368 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
369 /* Should be nonnegative! */
370 end_shift = prog->minlen - start_shift -
371 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
373 I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr)
374 - (SvTAIL(prog->check_substr) != 0);
375 I32 eshift = strend - s - end;
377 if (end_shift < eshift)
381 else { /* Can match at random position */
384 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
385 /* Should be nonnegative! */
386 end_shift = prog->minlen - start_shift -
387 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
390 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
392 Perl_croak(aTHX_ "panic: end_shift");
395 check = prog->check_substr;
397 /* Find a possible match in the region s..strend by looking for
398 the "check" substring in the region corrected by start/end_shift. */
399 if (flags & REXEC_SCREAM) {
400 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
401 I32 p = -1; /* Internal iterator of scream. */
402 I32 *pp = data ? data->scream_pos : &p;
404 if (PL_screamfirst[BmRARE(check)] >= 0
405 || ( BmRARE(check) == '\n'
406 && (BmPREVIOUS(check) == SvCUR(check) - 1)
408 s = screaminstr(sv, check,
409 start_shift + (s - strbeg), end_shift, pp, 0);
413 *data->scream_olds = s;
416 s = fbm_instr((unsigned char*)s + start_shift,
417 (unsigned char*)strend - end_shift,
418 check, PL_multiline ? FBMrf_MULTILINE : 0);
420 /* Update the count-of-usability, remove useless subpatterns,
423 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
424 (s ? "Found" : "Did not find"),
425 ((check == prog->anchored_substr) ? "anchored" : "floating"),
427 SvCUR(check) - (SvTAIL(check)!=0), SvPVX(check),
428 PL_colors[1], (SvTAIL(check) ? "$" : ""),
429 (s ? " at offset " : "...\n") ) );
434 /* Finish the diagnostic message */
435 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
437 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
438 Start with the other substr.
439 XXXX no SCREAM optimization yet - and a very coarse implementation
440 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
441 *always* match. Probably should be marked during compile...
442 Probably it is right to do no SCREAM here...
445 if (prog->float_substr && prog->anchored_substr) {
446 /* Take into account the "other" substring. */
447 /* XXXX May be hopelessly wrong for UTF... */
449 other_last = strpos - 1;
450 if (check == prog->float_substr) {
453 char *last = s - start_shift, *last1, *last2;
457 t = s - prog->check_offset_max;
458 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
459 && (!(prog->reganch & ROPT_UTF8)
460 || (PL_bostr = strpos, /* Used in regcopmaybe() */
461 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
466 t += prog->anchored_offset;
470 last2 = last1 = strend - prog->minlen;
473 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
474 /* On end-of-str: see comment below. */
475 s = fbm_instr((unsigned char*)t,
476 (unsigned char*)last1 + prog->anchored_offset
477 + SvCUR(prog->anchored_substr)
478 - (SvTAIL(prog->anchored_substr)!=0),
479 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
480 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
481 (s ? "Found" : "Contradicts"),
483 SvCUR(prog->anchored_substr)
484 - (SvTAIL(prog->anchored_substr)!=0),
485 SvPVX(prog->anchored_substr),
486 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
488 if (last1 >= last2) {
489 DEBUG_r(PerlIO_printf(Perl_debug_log,
490 ", giving up...\n"));
493 DEBUG_r(PerlIO_printf(Perl_debug_log,
494 ", trying floating at offset %ld...\n",
495 (long)(s1 + 1 - i_strpos)));
496 PL_regeol = strend; /* Used in HOP() */
497 other_last = last1 + prog->anchored_offset;
502 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
503 (long)(s - i_strpos)));
504 t = s - prog->anchored_offset;
513 else { /* Take into account the floating substring. */
518 last1 = last = strend - prog->minlen + prog->float_min_offset;
519 if (last - t > prog->float_max_offset)
520 last = t + prog->float_max_offset;
521 s = t + prog->float_min_offset;
524 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
525 /* fbm_instr() takes into account exact value of end-of-str
526 if the check is SvTAIL(ed). Since false positives are OK,
527 and end-of-str is not later than strend we are OK. */
528 s = fbm_instr((unsigned char*)s,
529 (unsigned char*)last + SvCUR(prog->float_substr)
530 - (SvTAIL(prog->float_substr)!=0),
531 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
532 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
533 (s ? "Found" : "Contradicts"),
535 SvCUR(prog->float_substr)
536 - (SvTAIL(prog->float_substr)!=0),
537 SvPVX(prog->float_substr),
538 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
541 DEBUG_r(PerlIO_printf(Perl_debug_log,
542 ", giving up...\n"));
545 DEBUG_r(PerlIO_printf(Perl_debug_log,
546 ", trying anchored starting at offset %ld...\n",
547 (long)(s1 + 1 - i_strpos)));
549 PL_regeol = strend; /* Used in HOP() */
554 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
555 (long)(s - i_strpos)));
565 t = s - prog->check_offset_max;
567 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
568 && (!(prog->reganch & ROPT_UTF8)
569 || (PL_bostr = strpos, /* Used in regcopmaybe() */
570 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
573 /* Fixed substring is found far enough so that the match
574 cannot start at strpos. */
576 if (ml_anch && t[-1] != '\n') {
577 /* Eventually fbm_*() should handle this, but often
578 anchored_offset is not 0, so this check will not be wasted. */
579 /* XXXX In the code below we prefer to look for "^" even in
580 presence of anchored substrings. And we search even
581 beyond the found float position. These pessimizations
582 are historical artefacts only. */
584 while (t < strend - prog->minlen) {
586 if (t < s - prog->check_offset_min) {
587 if (prog->anchored_substr) {
588 /* We definitely contradict the found anchored
589 substr. Due to the above check we do not
590 contradict "check" substr.
591 Thus we can arrive here only if check substr
592 is float. Redo checking for "other"=="fixed".
595 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
596 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
597 goto do_other_anchored;
600 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
601 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
604 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
605 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
611 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
612 PL_colors[0],PL_colors[1]));
617 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
621 /* The found string does not prohibit matching at beg-of-str
622 - no optimization of calling REx engine can be performed,
623 unless it was an MBOL and we are not after MBOL. */
625 /* Even in this situation we may use MBOL flag if strpos is offset
626 wrt the start of the string. */
628 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') {
632 DEBUG_r( if (ml_anch)
633 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
634 PL_colors[0],PL_colors[1]);
637 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
638 && --BmUSEFUL(prog->check_substr) < 0
639 && prog->check_substr == prog->float_substr) { /* boo */
640 /* If flags & SOMETHING - do not do it many times on the same match */
641 SvREFCNT_dec(prog->check_substr);
642 prog->check_substr = Nullsv; /* disable */
643 prog->float_substr = Nullsv; /* clear */
645 prog->reganch &= ~RE_USE_INTUIT;
651 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
652 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
655 fail_finish: /* Substring not found */
656 BmUSEFUL(prog->check_substr) += 5; /* hooray */
658 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
659 PL_colors[4],PL_colors[5]));
664 - regexec_flags - match a regexp against a string
667 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
668 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
669 /* strend: pointer to null at end of string */
670 /* strbeg: real beginning of string */
671 /* minend: end of match must be >=minend after stringarg. */
672 /* data: May be used for some additional optimizations. */
673 /* nosave: For optimizations. */
678 register char *startpos = stringarg;
680 I32 minlen; /* must match at least this many chars */
681 I32 dontbother = 0; /* how many characters not to try at end */
682 I32 start_shift = 0; /* Offset of the start to find
683 constant substr. */ /* CC */
684 I32 end_shift = 0; /* Same for the end. */ /* CC */
685 I32 scream_pos = -1; /* Internal iterator of scream. */
687 SV* oreplsv = GvSV(PL_replgv);
693 PL_regnarrate = PL_debug & 512;
697 if (prog == NULL || startpos == NULL) {
698 Perl_croak(aTHX_ "NULL regexp parameter");
702 minlen = prog->minlen;
703 if (strend - startpos < minlen) goto phooey;
705 if (startpos == strbeg) /* is ^ valid at stringarg? */
708 PL_regprev = (U32)stringarg[-1];
709 if (!PL_multiline && PL_regprev == '\n')
710 PL_regprev = '\0'; /* force ^ to NOT match */
713 /* Check validity of program. */
714 if (UCHARAT(prog->program) != REG_MAGIC) {
715 Perl_croak(aTHX_ "corrupted regexp program");
722 if (prog->reganch & ROPT_UTF8)
723 PL_reg_flags |= RF_utf8;
725 /* Mark beginning of line for ^ and lookbehind. */
726 PL_regbol = startpos;
730 /* Mark end of line for $ (and such) */
733 /* see how far we have to get to not match where we matched before */
734 PL_regtill = startpos+minend;
736 /* We start without call_cc context. */
739 /* If there is a "must appear" string, look for it. */
742 if (prog->reganch & ROPT_GPOS_SEEN) {
745 if (!(flags & REXEC_IGNOREPOS) && sv && SvTYPE(sv) >= SVt_PVMG
746 && SvMAGIC(sv) && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0)
747 PL_reg_ganch = strbeg + mg->mg_len;
749 PL_reg_ganch = startpos;
750 if (prog->reganch & ROPT_ANCH_GPOS) {
751 if (s > PL_reg_ganch)
757 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
758 re_scream_pos_data d;
760 d.scream_olds = &scream_olds;
761 d.scream_pos = &scream_pos;
762 s = re_intuit_start(prog, sv, s, strend, flags, &d);
764 goto phooey; /* not present */
767 DEBUG_r( if (!PL_colorset) reginitcolors() );
768 DEBUG_r(PerlIO_printf(Perl_debug_log,
769 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
770 PL_colors[4],PL_colors[5],PL_colors[0],
773 (strlen(prog->precomp) > 60 ? "..." : ""),
775 (strend - startpos > 60 ? 60 : strend - startpos),
776 startpos, PL_colors[1],
777 (strend - startpos > 60 ? "..." : ""))
780 /* Simplest case: anchored match need be tried only once. */
781 /* [unless only anchor is BOL and multiline is set] */
782 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
783 if (s == startpos && regtry(prog, startpos))
785 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
786 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
791 dontbother = minlen - 1;
792 end = HOPc(strend, -dontbother) - 1;
793 /* for multiline we only have to try after newlines */
794 if (prog->check_substr) {
803 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
811 if (*s++ == '\n') { /* don't need PL_utf8skip here */
819 } else if (prog->reganch & ROPT_ANCH_GPOS) {
820 if (regtry(prog, PL_reg_ganch))
825 /* Messy cases: unanchored match. */
826 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
827 /* we have /x+whatever/ */
828 /* it must be a one character string (XXXX Except UTF?) */
829 char ch = SvPVX(prog->anchored_substr)[0];
833 if (regtry(prog, s)) goto got_it;
835 while (s < strend && *s == ch)
844 if (regtry(prog, s)) goto got_it;
846 while (s < strend && *s == ch)
854 else if (prog->anchored_substr != Nullsv
855 || (prog->float_substr != Nullsv
856 && prog->float_max_offset < strend - s)) {
857 SV *must = prog->anchored_substr
858 ? prog->anchored_substr : prog->float_substr;
860 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
862 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
863 I32 delta = back_max - back_min;
864 char *last = HOPc(strend, /* Cannot start after this */
865 -(I32)(CHR_SVLEN(must)
866 - (SvTAIL(must) != 0) + back_min));
867 char *last1; /* Last position checked before */
872 last1 = s - 1; /* bogus */
874 /* XXXX check_substr already used to find `s', can optimize if
875 check_substr==must. */
877 dontbother = end_shift;
878 strend = HOPc(strend, -dontbother);
879 while ( (s <= last) &&
880 ((flags & REXEC_SCREAM)
881 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
882 end_shift, &scream_pos, 0))
883 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
884 (unsigned char*)strend, must,
885 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
886 if (HOPc(s, -back_max) > last1) {
887 last1 = HOPc(s, -back_min);
888 s = HOPc(s, -back_max);
891 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
893 last1 = HOPc(s, -back_min);
913 else if (c = prog->regstclass) {
914 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
918 dontbother = minlen - 1;
919 strend = HOPc(strend, -dontbother); /* don't bother with what can't match */
921 /* We know what class it must start with. */
926 if (REGINCLASSUTF8(c, (U8*)s)) {
927 if (tmp && regtry(prog, s))
940 if (REGINCLASS(cc, *s)) {
941 if (tmp && regtry(prog, s))
952 PL_reg_flags |= RF_tainted;
959 tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev;
960 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
962 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
969 if ((minlen || tmp) && regtry(prog,s))
973 PL_reg_flags |= RF_tainted;
978 strend = reghop_c(strend, -1);
980 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev;
981 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
983 if (tmp == !(OP(c) == BOUND ?
984 swash_fetch(PL_utf8_alnum, (U8*)s) :
985 isALNUM_LC_utf8((U8*)s)))
993 if ((minlen || tmp) && regtry(prog,s))
997 PL_reg_flags |= RF_tainted;
1004 tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev;
1005 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1006 while (s < strend) {
1007 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1009 else if (regtry(prog, s))
1013 if ((minlen || !tmp) && regtry(prog,s))
1017 PL_reg_flags |= RF_tainted;
1022 strend = reghop_c(strend, -1);
1024 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev;
1025 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
1026 while (s < strend) {
1027 if (tmp == !(OP(c) == NBOUND ?
1028 swash_fetch(PL_utf8_alnum, (U8*)s) :
1029 isALNUM_LC_utf8((U8*)s)))
1031 else if (regtry(prog, s))
1035 if ((minlen || !tmp) && regtry(prog,s))
1039 while (s < strend) {
1041 if (tmp && regtry(prog, s))
1052 while (s < strend) {
1053 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1054 if (tmp && regtry(prog, s))
1065 PL_reg_flags |= RF_tainted;
1066 while (s < strend) {
1067 if (isALNUM_LC(*s)) {
1068 if (tmp && regtry(prog, s))
1079 PL_reg_flags |= RF_tainted;
1080 while (s < strend) {
1081 if (isALNUM_LC_utf8((U8*)s)) {
1082 if (tmp && regtry(prog, s))
1093 while (s < strend) {
1095 if (tmp && regtry(prog, s))
1106 while (s < strend) {
1107 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1108 if (tmp && regtry(prog, s))
1119 PL_reg_flags |= RF_tainted;
1120 while (s < strend) {
1121 if (!isALNUM_LC(*s)) {
1122 if (tmp && regtry(prog, s))
1133 PL_reg_flags |= RF_tainted;
1134 while (s < strend) {
1135 if (!isALNUM_LC_utf8((U8*)s)) {
1136 if (tmp && regtry(prog, s))
1147 while (s < strend) {
1149 if (tmp && regtry(prog, s))
1160 while (s < strend) {
1161 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1162 if (tmp && regtry(prog, s))
1173 PL_reg_flags |= RF_tainted;
1174 while (s < strend) {
1175 if (isSPACE_LC(*s)) {
1176 if (tmp && regtry(prog, s))
1187 PL_reg_flags |= RF_tainted;
1188 while (s < strend) {
1189 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1190 if (tmp && regtry(prog, s))
1201 while (s < strend) {
1203 if (tmp && regtry(prog, s))
1214 while (s < strend) {
1215 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1216 if (tmp && regtry(prog, s))
1227 PL_reg_flags |= RF_tainted;
1228 while (s < strend) {
1229 if (!isSPACE_LC(*s)) {
1230 if (tmp && regtry(prog, s))
1241 PL_reg_flags |= RF_tainted;
1242 while (s < strend) {
1243 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1244 if (tmp && regtry(prog, s))
1255 while (s < strend) {
1257 if (tmp && regtry(prog, s))
1268 while (s < strend) {
1269 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1270 if (tmp && regtry(prog, s))
1281 PL_reg_flags |= RF_tainted;
1282 while (s < strend) {
1283 if (isDIGIT_LC(*s)) {
1284 if (tmp && regtry(prog, s))
1295 PL_reg_flags |= RF_tainted;
1296 while (s < strend) {
1297 if (isDIGIT_LC_utf8((U8*)s)) {
1298 if (tmp && regtry(prog, s))
1309 while (s < strend) {
1311 if (tmp && regtry(prog, s))
1322 while (s < strend) {
1323 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1324 if (tmp && regtry(prog, s))
1335 PL_reg_flags |= RF_tainted;
1336 while (s < strend) {
1337 if (!isDIGIT_LC(*s)) {
1338 if (tmp && regtry(prog, s))
1349 PL_reg_flags |= RF_tainted;
1350 while (s < strend) {
1351 if (!isDIGIT_LC_utf8((U8*)s)) {
1352 if (tmp && regtry(prog, s))
1366 if (prog->float_substr != Nullsv) { /* Trim the end. */
1368 I32 oldpos = scream_pos;
1370 if (flags & REXEC_SCREAM) {
1371 last = screaminstr(sv, prog->float_substr, s - strbeg,
1372 end_shift, &scream_pos, 1); /* last one */
1374 last = scream_olds; /* Only one occurence. */
1378 char *little = SvPV(prog->float_substr, len);
1380 if (SvTAIL(prog->float_substr)) {
1381 if (memEQ(strend - len + 1, little, len - 1))
1382 last = strend - len + 1;
1383 else if (!PL_multiline)
1384 last = memEQ(strend - len, little, len)
1385 ? strend - len : Nullch;
1391 last = rninstr(s, strend, little, little + len);
1393 last = strend; /* matching `$' */
1396 if (last == NULL) goto phooey; /* Should not happen! */
1397 dontbother = strend - last + prog->float_min_offset;
1399 if (minlen && (dontbother < minlen))
1400 dontbother = minlen - 1;
1401 strend -= dontbother; /* this one's always in bytes! */
1402 /* We don't know much -- general case. */
1405 if (regtry(prog, s))
1414 if (regtry(prog, s))
1416 } while (s++ < strend);
1424 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1426 if (PL_reg_eval_set) {
1427 /* Preserve the current value of $^R */
1428 if (oreplsv != GvSV(PL_replgv))
1429 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1430 restored, the value remains
1432 restore_pos(aTHXo_ 0);
1435 /* make sure $`, $&, $', and $digit will work later */
1436 if ( !(flags & REXEC_NOT_FIRST) ) {
1437 if (RX_MATCH_COPIED(prog)) {
1438 Safefree(prog->subbeg);
1439 RX_MATCH_COPIED_off(prog);
1441 if (flags & REXEC_COPY_STR) {
1442 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1444 s = savepvn(strbeg, i);
1447 RX_MATCH_COPIED_on(prog);
1450 prog->subbeg = strbeg;
1451 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1458 if (PL_reg_eval_set)
1459 restore_pos(aTHXo_ 0);
1464 - regtry - try match at specific point
1466 STATIC I32 /* 0 failure, 1 success */
1467 S_regtry(pTHX_ regexp *prog, char *startpos)
1475 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1478 PL_reg_eval_set = RS_init;
1480 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %i\n",
1481 PL_stack_sp - PL_stack_base);
1483 SAVEINT(cxstack[cxstack_ix].blk_oldsp);
1484 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1485 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1487 /* Apparently this is not needed, judging by wantarray. */
1488 /* SAVEINT(cxstack[cxstack_ix].blk_gimme);
1489 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1492 /* Make $_ available to executed code. */
1493 if (PL_reg_sv != DEFSV) {
1494 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1499 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1500 && (mg = mg_find(PL_reg_sv, 'g')))) {
1501 /* prepare for quick setting of pos */
1502 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1503 mg = mg_find(PL_reg_sv, 'g');
1507 PL_reg_oldpos = mg->mg_len;
1508 SAVEDESTRUCTOR(restore_pos, 0);
1511 New(22,PL_reg_curpm, 1, PMOP);
1512 PL_reg_curpm->op_pmregexp = prog;
1513 PL_reg_oldcurpm = PL_curpm;
1514 PL_curpm = PL_reg_curpm;
1515 if (RX_MATCH_COPIED(prog)) {
1516 /* Here is a serious problem: we cannot rewrite subbeg,
1517 since it may be needed if this match fails. Thus
1518 $` inside (?{}) could fail... */
1519 PL_reg_oldsaved = prog->subbeg;
1520 PL_reg_oldsavedlen = prog->sublen;
1521 RX_MATCH_COPIED_off(prog);
1524 PL_reg_oldsaved = Nullch;
1525 prog->subbeg = PL_bostr;
1526 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1528 prog->startp[0] = startpos - PL_bostr;
1529 PL_reginput = startpos;
1530 PL_regstartp = prog->startp;
1531 PL_regendp = prog->endp;
1532 PL_reglastparen = &prog->lastparen;
1533 prog->lastparen = 0;
1535 DEBUG_r(PL_reg_starttry = startpos);
1536 if (PL_reg_start_tmpl <= prog->nparens) {
1537 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1538 if(PL_reg_start_tmp)
1539 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1541 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1544 /* XXXX What this code is doing here?!!! There should be no need
1545 to do this again and again, PL_reglastparen should take care of
1549 if (prog->nparens) {
1550 for (i = prog->nparens; i >= 1; i--) {
1556 if (regmatch(prog->program + 1)) {
1557 prog->endp[0] = PL_reginput - PL_bostr;
1565 - regmatch - main matching routine
1567 * Conceptually the strategy is simple: check to see whether the current
1568 * node matches, call self recursively to see whether the rest matches,
1569 * and then act accordingly. In practice we make some effort to avoid
1570 * recursion, in particular by going through "ordinary" nodes (that don't
1571 * need to know whether the rest of the match failed) by a loop instead of
1574 /* [lwall] I've hoisted the register declarations to the outer block in order to
1575 * maybe save a little bit of pushing and popping on the stack. It also takes
1576 * advantage of machines that use a register save mask on subroutine entry.
1578 STATIC I32 /* 0 failure, 1 success */
1579 S_regmatch(pTHX_ regnode *prog)
1582 register regnode *scan; /* Current node. */
1583 regnode *next; /* Next node. */
1584 regnode *inner; /* Next node in internal branch. */
1585 register I32 nextchr; /* renamed nextchr - nextchar colides with
1586 function of same name */
1587 register I32 n; /* no or next */
1588 register I32 ln; /* len or last */
1589 register char *s; /* operand or save */
1590 register char *locinput = PL_reginput;
1591 register I32 c1, c2, paren; /* case fold search, parenth */
1592 int minmod = 0, sw = 0, logical = 0;
1597 /* Note that nextchr is a byte even in UTF */
1598 nextchr = UCHARAT(locinput);
1600 while (scan != NULL) {
1601 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1603 # define sayYES goto yes
1604 # define sayNO goto no
1605 # define sayYES_FINAL goto yes_final
1606 # define sayYES_LOUD goto yes_loud
1607 # define sayNO_FINAL goto no_final
1608 # define sayNO_SILENT goto do_no
1609 # define saySAME(x) if (x) goto yes; else goto no
1610 # define REPORT_CODE_OFF 24
1612 # define sayYES return 1
1613 # define sayNO return 0
1614 # define sayYES_FINAL return 1
1615 # define sayYES_LOUD return 1
1616 # define sayNO_FINAL return 0
1617 # define sayNO_SILENT return 0
1618 # define saySAME(x) return x
1621 SV *prop = sv_newmortal();
1622 int docolor = *PL_colors[0];
1623 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1624 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1625 /* The part of the string before starttry has one color
1626 (pref0_len chars), between starttry and current
1627 position another one (pref_len - pref0_len chars),
1628 after the current position the third one.
1629 We assume that pref0_len <= pref_len, otherwise we
1630 decrease pref0_len. */
1631 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1632 ? (5 + taill) - l : locinput - PL_bostr);
1633 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1635 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1636 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1637 ? (5 + taill) - pref_len : PL_regeol - locinput);
1640 if (pref0_len > pref_len)
1641 pref0_len = pref_len;
1642 regprop(prop, scan);
1643 PerlIO_printf(Perl_debug_log,
1644 "%4i <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3d:%*s%s\n",
1645 locinput - PL_bostr,
1646 PL_colors[4], pref0_len,
1647 locinput - pref_len, PL_colors[5],
1648 PL_colors[2], pref_len - pref0_len,
1649 locinput - pref_len + pref0_len, PL_colors[3],
1650 (docolor ? "" : "> <"),
1651 PL_colors[0], l, locinput, PL_colors[1],
1652 15 - l - pref_len + 1,
1654 scan - PL_regprogram, PL_regindent*2, "",
1658 next = scan + NEXT_OFF(scan);
1664 if (locinput == PL_bostr
1665 ? PL_regprev == '\n'
1667 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1669 /* regtill = regbol; */
1674 if (locinput == PL_bostr
1675 ? PL_regprev == '\n'
1676 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1682 if (locinput == PL_regbol && PL_regprev == '\n')
1686 if (locinput == PL_reg_ganch)
1696 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1701 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1703 if (PL_regeol - locinput > 1)
1707 if (PL_regeol != locinput)
1711 if (nextchr & 0x80) {
1712 locinput += PL_utf8skip[nextchr];
1713 if (locinput > PL_regeol)
1715 nextchr = UCHARAT(locinput);
1718 if (!nextchr && locinput >= PL_regeol)
1720 nextchr = UCHARAT(++locinput);
1723 if (!nextchr && locinput >= PL_regeol)
1725 nextchr = UCHARAT(++locinput);
1728 if (nextchr & 0x80) {
1729 locinput += PL_utf8skip[nextchr];
1730 if (locinput > PL_regeol)
1732 nextchr = UCHARAT(locinput);
1735 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1737 nextchr = UCHARAT(++locinput);
1740 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1742 nextchr = UCHARAT(++locinput);
1747 /* Inline the first character, for speed. */
1748 if (UCHARAT(s) != nextchr)
1750 if (PL_regeol - locinput < ln)
1752 if (ln > 1 && memNE(s, locinput, ln))
1755 nextchr = UCHARAT(locinput);
1758 PL_reg_flags |= RF_tainted;
1767 c1 = OP(scan) == EXACTF;
1771 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1772 toLOWER_utf8((U8*)l) :
1773 toLOWER_LC_utf8((U8*)l)))
1781 nextchr = UCHARAT(locinput);
1785 /* Inline the first character, for speed. */
1786 if (UCHARAT(s) != nextchr &&
1787 UCHARAT(s) != ((OP(scan) == EXACTF)
1788 ? PL_fold : PL_fold_locale)[nextchr])
1790 if (PL_regeol - locinput < ln)
1792 if (ln > 1 && (OP(scan) == EXACTF
1793 ? ibcmp(s, locinput, ln)
1794 : ibcmp_locale(s, locinput, ln)))
1797 nextchr = UCHARAT(locinput);
1801 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1803 if (locinput >= PL_regeol)
1805 locinput += PL_utf8skip[nextchr];
1806 nextchr = UCHARAT(locinput);
1811 nextchr = UCHARAT(locinput);
1812 if (!REGINCLASS(s, nextchr))
1814 if (!nextchr && locinput >= PL_regeol)
1816 nextchr = UCHARAT(++locinput);
1819 PL_reg_flags |= RF_tainted;
1824 if (!(OP(scan) == ALNUM
1825 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1827 nextchr = UCHARAT(++locinput);
1830 PL_reg_flags |= RF_tainted;
1835 if (nextchr & 0x80) {
1836 if (!(OP(scan) == ALNUMUTF8
1837 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1838 : isALNUM_LC_utf8((U8*)locinput)))
1842 locinput += PL_utf8skip[nextchr];
1843 nextchr = UCHARAT(locinput);
1846 if (!(OP(scan) == ALNUMUTF8
1847 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1849 nextchr = UCHARAT(++locinput);
1852 PL_reg_flags |= RF_tainted;
1855 if (!nextchr && locinput >= PL_regeol)
1857 if (OP(scan) == NALNUM
1858 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1860 nextchr = UCHARAT(++locinput);
1863 PL_reg_flags |= RF_tainted;
1866 if (!nextchr && locinput >= PL_regeol)
1868 if (nextchr & 0x80) {
1869 if (OP(scan) == NALNUMUTF8
1870 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1871 : isALNUM_LC_utf8((U8*)locinput))
1875 locinput += PL_utf8skip[nextchr];
1876 nextchr = UCHARAT(locinput);
1879 if (OP(scan) == NALNUMUTF8
1880 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1882 nextchr = UCHARAT(++locinput);
1886 PL_reg_flags |= RF_tainted;
1890 /* was last char in word? */
1891 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
1892 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
1894 n = isALNUM(nextchr);
1897 ln = isALNUM_LC(ln);
1898 n = isALNUM_LC(nextchr);
1900 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
1905 PL_reg_flags |= RF_tainted;
1909 /* was last char in word? */
1910 ln = (locinput != PL_regbol)
1911 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
1912 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
1913 ln = isALNUM_uni(ln);
1914 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
1917 ln = isALNUM_LC_uni(ln);
1918 n = isALNUM_LC_utf8((U8*)locinput);
1920 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
1924 PL_reg_flags |= RF_tainted;
1927 if (!nextchr && locinput >= PL_regeol)
1929 if (!(OP(scan) == SPACE
1930 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
1932 nextchr = UCHARAT(++locinput);
1935 PL_reg_flags |= RF_tainted;
1938 if (!nextchr && locinput >= PL_regeol)
1940 if (nextchr & 0x80) {
1941 if (!(OP(scan) == SPACEUTF8
1942 ? swash_fetch(PL_utf8_space,(U8*)locinput)
1943 : isSPACE_LC_utf8((U8*)locinput)))
1947 locinput += PL_utf8skip[nextchr];
1948 nextchr = UCHARAT(locinput);
1951 if (!(OP(scan) == SPACEUTF8
1952 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
1954 nextchr = UCHARAT(++locinput);
1957 PL_reg_flags |= RF_tainted;
1962 if (OP(scan) == SPACE
1963 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
1965 nextchr = UCHARAT(++locinput);
1968 PL_reg_flags |= RF_tainted;
1973 if (nextchr & 0x80) {
1974 if (OP(scan) == NSPACEUTF8
1975 ? swash_fetch(PL_utf8_space,(U8*)locinput)
1976 : isSPACE_LC_utf8((U8*)locinput))
1980 locinput += PL_utf8skip[nextchr];
1981 nextchr = UCHARAT(locinput);
1984 if (OP(scan) == NSPACEUTF8
1985 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
1987 nextchr = UCHARAT(++locinput);
1990 PL_reg_flags |= RF_tainted;
1993 if (!nextchr && locinput >= PL_regeol)
1995 if (!(OP(scan) == DIGIT
1996 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
1998 nextchr = UCHARAT(++locinput);
2001 PL_reg_flags |= RF_tainted;
2006 if (nextchr & 0x80) {
2007 if (OP(scan) == NDIGITUTF8
2008 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2009 : isDIGIT_LC_utf8((U8*)locinput))
2013 locinput += PL_utf8skip[nextchr];
2014 nextchr = UCHARAT(locinput);
2017 if (!isDIGIT(nextchr))
2019 nextchr = UCHARAT(++locinput);
2022 PL_reg_flags |= RF_tainted;
2027 if (OP(scan) == DIGIT
2028 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2030 nextchr = UCHARAT(++locinput);
2033 PL_reg_flags |= RF_tainted;
2036 if (!nextchr && locinput >= PL_regeol)
2038 if (nextchr & 0x80) {
2039 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2041 locinput += PL_utf8skip[nextchr];
2042 nextchr = UCHARAT(locinput);
2045 if (isDIGIT(nextchr))
2047 nextchr = UCHARAT(++locinput);
2050 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2052 locinput += PL_utf8skip[nextchr];
2053 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2054 locinput += UTF8SKIP(locinput);
2055 if (locinput > PL_regeol)
2057 nextchr = UCHARAT(locinput);
2060 PL_reg_flags |= RF_tainted;
2064 n = ARG(scan); /* which paren pair */
2065 ln = PL_regstartp[n];
2066 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2067 if (*PL_reglastparen < n || ln == -1)
2068 sayNO; /* Do not match unless seen CLOSEn. */
2069 if (ln == PL_regendp[n])
2073 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2075 char *e = PL_bostr + PL_regendp[n];
2077 * Note that we can't do the "other character" lookup trick as
2078 * in the 8-bit case (no pun intended) because in Unicode we
2079 * have to map both upper and title case to lower case.
2081 if (OP(scan) == REFF) {
2085 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2095 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2102 nextchr = UCHARAT(locinput);
2106 /* Inline the first character, for speed. */
2107 if (UCHARAT(s) != nextchr &&
2109 (UCHARAT(s) != ((OP(scan) == REFF
2110 ? PL_fold : PL_fold_locale)[nextchr]))))
2112 ln = PL_regendp[n] - ln;
2113 if (locinput + ln > PL_regeol)
2115 if (ln > 1 && (OP(scan) == REF
2116 ? memNE(s, locinput, ln)
2118 ? ibcmp(s, locinput, ln)
2119 : ibcmp_locale(s, locinput, ln))))
2122 nextchr = UCHARAT(locinput);
2133 OP_4tree *oop = PL_op;
2134 COP *ocurcop = PL_curcop;
2135 SV **ocurpad = PL_curpad;
2139 PL_op = (OP_4tree*)PL_regdata->data[n];
2140 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%x\n", PL_op) );
2141 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2142 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2144 CALLRUNOPS(aTHX); /* Scalar context. */
2150 PL_curpad = ocurpad;
2151 PL_curcop = ocurcop;
2153 if (logical == 2) { /* Postponed subexpression. */
2155 MAGIC *mg = Null(MAGIC*);
2157 CHECKPOINT cp, lastcp;
2159 if(SvROK(ret) || SvRMAGICAL(ret)) {
2160 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2163 mg = mg_find(sv, 'r');
2166 re = (regexp *)mg->mg_obj;
2167 (void)ReREFCNT_inc(re);
2171 char *t = SvPV(ret, len);
2173 char *oprecomp = PL_regprecomp;
2174 I32 osize = PL_regsize;
2175 I32 onpar = PL_regnpar;
2178 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2180 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2181 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2182 PL_regprecomp = oprecomp;
2187 PerlIO_printf(Perl_debug_log,
2188 "Entering embedded `%s%.60s%s%s'\n",
2192 (strlen(re->precomp) > 60 ? "..." : ""))
2195 state.prev = PL_reg_call_cc;
2196 state.cc = PL_regcc;
2197 state.re = PL_reg_re;
2201 cp = regcppush(0); /* Save *all* the positions. */
2204 state.ss = PL_savestack_ix;
2205 *PL_reglastparen = 0;
2206 PL_reg_call_cc = &state;
2207 PL_reginput = locinput;
2209 /* XXXX This is too dramatic a measure... */
2212 if (regmatch(re->program + 1)) {
2213 /* Even though we succeeded, we need to restore
2214 global variables, since we may be wrapped inside
2215 SUSPEND, thus the match may be not finished yet. */
2217 /* XXXX Do this only if SUSPENDed? */
2218 PL_reg_call_cc = state.prev;
2219 PL_regcc = state.cc;
2220 PL_reg_re = state.re;
2221 cache_re(PL_reg_re);
2223 /* XXXX This is too dramatic a measure... */
2226 /* These are needed even if not SUSPEND. */
2234 PL_reg_call_cc = state.prev;
2235 PL_regcc = state.cc;
2236 PL_reg_re = state.re;
2237 cache_re(PL_reg_re);
2239 /* XXXX This is too dramatic a measure... */
2248 sv_setsv(save_scalar(PL_replgv), ret);
2252 n = ARG(scan); /* which paren pair */
2253 PL_reg_start_tmp[n] = locinput;
2258 n = ARG(scan); /* which paren pair */
2259 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2260 PL_regendp[n] = locinput - PL_bostr;
2261 if (n > *PL_reglastparen)
2262 *PL_reglastparen = n;
2265 n = ARG(scan); /* which paren pair */
2266 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2269 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2271 next = NEXTOPER(NEXTOPER(scan));
2273 next = scan + ARG(scan);
2274 if (OP(next) == IFTHEN) /* Fake one. */
2275 next = NEXTOPER(NEXTOPER(next));
2279 logical = scan->flags;
2281 /*******************************************************************
2282 PL_regcc contains infoblock about the innermost (...)* loop, and
2283 a pointer to the next outer infoblock.
2285 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2287 1) After matching X, regnode for CURLYX is processed;
2289 2) This regnode creates infoblock on the stack, and calls
2290 regmatch() recursively with the starting point at WHILEM node;
2292 3) Each hit of WHILEM node tries to match A and Z (in the order
2293 depending on the current iteration, min/max of {min,max} and
2294 greediness). The information about where are nodes for "A"
2295 and "Z" is read from the infoblock, as is info on how many times "A"
2296 was already matched, and greediness.
2298 4) After A matches, the same WHILEM node is hit again.
2300 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2301 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2302 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2303 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2304 of the external loop.
2306 Currently present infoblocks form a tree with a stem formed by PL_curcc
2307 and whatever it mentions via ->next, and additional attached trees
2308 corresponding to temporarily unset infoblocks as in "5" above.
2310 In the following picture infoblocks for outer loop of
2311 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2312 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2313 infoblocks are drawn below the "reset" infoblock.
2315 In fact in the picture below we do not show failed matches for Z and T
2316 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2317 more obvious *why* one needs to *temporary* unset infoblocks.]
2319 Matched REx position InfoBlocks Comment
2323 Y A)*?Z)*?T x <- O <- I
2324 YA )*?Z)*?T x <- O <- I
2325 YA A)*?Z)*?T x <- O <- I
2326 YAA )*?Z)*?T x <- O <- I
2327 YAA Z)*?T x <- O # Temporary unset I
2330 YAAZ Y(A)*?Z)*?T x <- O
2333 YAAZY (A)*?Z)*?T x <- O
2336 YAAZY A)*?Z)*?T x <- O <- I
2339 YAAZYA )*?Z)*?T x <- O <- I
2342 YAAZYA Z)*?T x <- O # Temporary unset I
2348 YAAZYAZ T x # Temporary unset O
2355 *******************************************************************/
2358 CHECKPOINT cp = PL_savestack_ix;
2360 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2362 cc.oldcc = PL_regcc;
2364 cc.parenfloor = *PL_reglastparen;
2366 cc.min = ARG1(scan);
2367 cc.max = ARG2(scan);
2368 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2372 PL_reginput = locinput;
2373 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2375 PL_regcc = cc.oldcc;
2381 * This is really hard to understand, because after we match
2382 * what we're trying to match, we must make sure the rest of
2383 * the REx is going to match for sure, and to do that we have
2384 * to go back UP the parse tree by recursing ever deeper. And
2385 * if it fails, we have to reset our parent's current state
2386 * that we can try again after backing off.
2389 CHECKPOINT cp, lastcp;
2390 CURCUR* cc = PL_regcc;
2391 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2393 n = cc->cur + 1; /* how many we know we matched */
2394 PL_reginput = locinput;
2397 PerlIO_printf(Perl_debug_log,
2398 "%*s %ld out of %ld..%ld cc=%lx\n",
2399 REPORT_CODE_OFF+PL_regindent*2, "",
2400 (long)n, (long)cc->min,
2401 (long)cc->max, (long)cc)
2404 /* If degenerate scan matches "", assume scan done. */
2406 if (locinput == cc->lastloc && n >= cc->min) {
2407 PL_regcc = cc->oldcc;
2411 PerlIO_printf(Perl_debug_log,
2412 "%*s empty match detected, try continuation...\n",
2413 REPORT_CODE_OFF+PL_regindent*2, "")
2415 if (regmatch(cc->next))
2423 /* First just match a string of min scans. */
2427 cc->lastloc = locinput;
2428 if (regmatch(cc->scan))
2431 cc->lastloc = lastloc;
2436 /* Check whether we already were at this position.
2437 Postpone detection until we know the match is not
2438 *that* much linear. */
2439 if (!PL_reg_maxiter) {
2440 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2441 PL_reg_leftiter = PL_reg_maxiter;
2443 if (PL_reg_leftiter-- == 0) {
2444 I32 size = (PL_reg_maxiter + 7)/8;
2445 if (PL_reg_poscache) {
2446 if (PL_reg_poscache_size < size) {
2447 Renew(PL_reg_poscache, size, char);
2448 PL_reg_poscache_size = size;
2450 Zero(PL_reg_poscache, size, char);
2453 PL_reg_poscache_size = size;
2454 Newz(29, PL_reg_poscache, size, char);
2457 PerlIO_printf(Perl_debug_log,
2458 "%sDetected a super-linear match, switching on caching%s...\n",
2459 PL_colors[4], PL_colors[5])
2462 if (PL_reg_leftiter < 0) {
2463 I32 o = locinput - PL_bostr, b;
2465 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2468 if (PL_reg_poscache[o] & (1<<b)) {
2470 PerlIO_printf(Perl_debug_log,
2471 "%*s already tried at this position...\n",
2472 REPORT_CODE_OFF+PL_regindent*2, "")
2476 PL_reg_poscache[o] |= (1<<b);
2480 /* Prefer next over scan for minimal matching. */
2483 PL_regcc = cc->oldcc;
2486 cp = regcppush(cc->parenfloor);
2488 if (regmatch(cc->next)) {
2490 sayYES; /* All done. */
2498 if (n >= cc->max) { /* Maximum greed exceeded? */
2499 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2500 && !(PL_reg_flags & RF_warned)) {
2501 PL_reg_flags |= RF_warned;
2502 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2503 "Complex regular subexpression recursion",
2510 PerlIO_printf(Perl_debug_log,
2511 "%*s trying longer...\n",
2512 REPORT_CODE_OFF+PL_regindent*2, "")
2514 /* Try scanning more and see if it helps. */
2515 PL_reginput = locinput;
2517 cc->lastloc = locinput;
2518 cp = regcppush(cc->parenfloor);
2520 if (regmatch(cc->scan)) {
2527 cc->lastloc = lastloc;
2531 /* Prefer scan over next for maximal matching. */
2533 if (n < cc->max) { /* More greed allowed? */
2534 cp = regcppush(cc->parenfloor);
2536 cc->lastloc = locinput;
2538 if (regmatch(cc->scan)) {
2543 regcppop(); /* Restore some previous $<digit>s? */
2544 PL_reginput = locinput;
2546 PerlIO_printf(Perl_debug_log,
2547 "%*s failed, try continuation...\n",
2548 REPORT_CODE_OFF+PL_regindent*2, "")
2551 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2552 && !(PL_reg_flags & RF_warned)) {
2553 PL_reg_flags |= RF_warned;
2554 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2555 "Complex regular subexpression recursion",
2559 /* Failed deeper matches of scan, so see if this one works. */
2560 PL_regcc = cc->oldcc;
2563 if (regmatch(cc->next))
2569 cc->lastloc = lastloc;
2574 next = scan + ARG(scan);
2577 inner = NEXTOPER(NEXTOPER(scan));
2580 inner = NEXTOPER(scan);
2585 if (OP(next) != c1) /* No choice. */
2586 next = inner; /* Avoid recursion. */
2588 int lastparen = *PL_reglastparen;
2592 PL_reginput = locinput;
2593 if (regmatch(inner))
2596 for (n = *PL_reglastparen; n > lastparen; n--)
2598 *PL_reglastparen = n;
2601 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2605 inner = NEXTOPER(scan);
2606 if (c1 == BRANCHJ) {
2607 inner = NEXTOPER(inner);
2609 } while (scan != NULL && OP(scan) == c1);
2623 /* We suppose that the next guy does not need
2624 backtracking: in particular, it is of constant length,
2625 and has no parenths to influence future backrefs. */
2626 ln = ARG1(scan); /* min to match */
2627 n = ARG2(scan); /* max to match */
2628 paren = scan->flags;
2630 if (paren > PL_regsize)
2632 if (paren > *PL_reglastparen)
2633 *PL_reglastparen = paren;
2635 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2637 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2638 PL_reginput = locinput;
2641 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2643 if (ln && l == 0 && n >= ln
2644 /* In fact, this is tricky. If paren, then the
2645 fact that we did/didnot match may influence
2646 future execution. */
2647 && !(paren && ln == 0))
2649 locinput = PL_reginput;
2650 if (PL_regkind[(U8)OP(next)] == EXACT) {
2651 c1 = (U8)*STRING(next);
2652 if (OP(next) == EXACTF)
2654 else if (OP(next) == EXACTFL)
2655 c2 = PL_fold_locale[c1];
2662 /* This may be improved if l == 0. */
2663 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2664 /* If it could work, try it. */
2666 UCHARAT(PL_reginput) == c1 ||
2667 UCHARAT(PL_reginput) == c2)
2671 PL_regstartp[paren] =
2672 HOPc(PL_reginput, -l) - PL_bostr;
2673 PL_regendp[paren] = PL_reginput - PL_bostr;
2676 PL_regendp[paren] = -1;
2682 /* Couldn't or didn't -- move forward. */
2683 PL_reginput = locinput;
2684 if (regrepeat_hard(scan, 1, &l)) {
2686 locinput = PL_reginput;
2693 n = regrepeat_hard(scan, n, &l);
2694 if (n != 0 && l == 0
2695 /* In fact, this is tricky. If paren, then the
2696 fact that we did/didnot match may influence
2697 future execution. */
2698 && !(paren && ln == 0))
2700 locinput = PL_reginput;
2702 PerlIO_printf(Perl_debug_log,
2703 "%*s matched %ld times, len=%ld...\n",
2704 REPORT_CODE_OFF+PL_regindent*2, "", n, l)
2707 if (PL_regkind[(U8)OP(next)] == EXACT) {
2708 c1 = (U8)*STRING(next);
2709 if (OP(next) == EXACTF)
2711 else if (OP(next) == EXACTFL)
2712 c2 = PL_fold_locale[c1];
2721 /* If it could work, try it. */
2723 UCHARAT(PL_reginput) == c1 ||
2724 UCHARAT(PL_reginput) == c2)
2727 PerlIO_printf(Perl_debug_log,
2728 "%*s trying tail with n=%ld...\n",
2729 REPORT_CODE_OFF+PL_regindent*2, "", n)
2733 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2734 PL_regendp[paren] = PL_reginput - PL_bostr;
2737 PL_regendp[paren] = -1;
2743 /* Couldn't or didn't -- back up. */
2745 locinput = HOPc(locinput, -l);
2746 PL_reginput = locinput;
2753 paren = scan->flags; /* Which paren to set */
2754 if (paren > PL_regsize)
2756 if (paren > *PL_reglastparen)
2757 *PL_reglastparen = paren;
2758 ln = ARG1(scan); /* min to match */
2759 n = ARG2(scan); /* max to match */
2760 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2764 ln = ARG1(scan); /* min to match */
2765 n = ARG2(scan); /* max to match */
2766 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2771 scan = NEXTOPER(scan);
2777 scan = NEXTOPER(scan);
2781 * Lookahead to avoid useless match attempts
2782 * when we know what character comes next.
2784 if (PL_regkind[(U8)OP(next)] == EXACT) {
2785 c1 = (U8)*STRING(next);
2786 if (OP(next) == EXACTF)
2788 else if (OP(next) == EXACTFL)
2789 c2 = PL_fold_locale[c1];
2795 PL_reginput = locinput;
2799 if (ln && regrepeat(scan, ln) < ln)
2801 locinput = PL_reginput;
2804 char *e = locinput + n - ln; /* Should not check after this */
2805 char *old = locinput;
2807 if (e >= PL_regeol || (n == REG_INFTY))
2810 /* Find place 'next' could work */
2812 while (locinput <= e && *locinput != c1)
2815 while (locinput <= e
2822 /* PL_reginput == old now */
2823 if (locinput != old) {
2824 ln = 1; /* Did some */
2825 if (regrepeat(scan, locinput - old) <
2829 /* PL_reginput == locinput now */
2832 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2833 PL_regendp[paren] = locinput - PL_bostr;
2836 PL_regendp[paren] = -1;
2840 PL_reginput = locinput; /* Could be reset... */
2842 /* Couldn't or didn't -- move forward. */
2847 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
2848 /* If it could work, try it. */
2850 UCHARAT(PL_reginput) == c1 ||
2851 UCHARAT(PL_reginput) == c2)
2855 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2856 PL_regendp[paren] = PL_reginput - PL_bostr;
2859 PL_regendp[paren] = -1;
2865 /* Couldn't or didn't -- move forward. */
2866 PL_reginput = locinput;
2867 if (regrepeat(scan, 1)) {
2869 locinput = PL_reginput;
2877 n = regrepeat(scan, n);
2878 locinput = PL_reginput;
2879 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
2880 (!PL_multiline || OP(next) == SEOL))
2881 ln = n; /* why back off? */
2885 /* If it could work, try it. */
2887 UCHARAT(PL_reginput) == c1 ||
2888 UCHARAT(PL_reginput) == c2)
2892 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2893 PL_regendp[paren] = PL_reginput - PL_bostr;
2896 PL_regendp[paren] = -1;
2902 /* Couldn't or didn't -- back up. */
2904 PL_reginput = locinput = HOPc(locinput, -1);
2909 /* If it could work, try it. */
2911 UCHARAT(PL_reginput) == c1 ||
2912 UCHARAT(PL_reginput) == c2)
2918 /* Couldn't or didn't -- back up. */
2920 PL_reginput = locinput = HOPc(locinput, -1);
2927 if (PL_reg_call_cc) {
2928 re_cc_state *cur_call_cc = PL_reg_call_cc;
2929 CURCUR *cctmp = PL_regcc;
2930 regexp *re = PL_reg_re;
2931 CHECKPOINT cp, lastcp;
2933 cp = regcppush(0); /* Save *all* the positions. */
2935 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
2937 PL_reginput = locinput; /* Make position available to
2939 cache_re(PL_reg_call_cc->re);
2940 PL_regcc = PL_reg_call_cc->cc;
2941 PL_reg_call_cc = PL_reg_call_cc->prev;
2942 if (regmatch(cur_call_cc->node)) {
2943 PL_reg_call_cc = cur_call_cc;
2949 PL_reg_call_cc = cur_call_cc;
2955 PerlIO_printf(Perl_debug_log,
2956 "%*s continuation failed...\n",
2957 REPORT_CODE_OFF+PL_regindent*2, "")
2961 if (locinput < PL_regtill) {
2962 DEBUG_r(PerlIO_printf(Perl_debug_log,
2963 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
2965 (long)(locinput - PL_reg_starttry),
2966 (long)(PL_regtill - PL_reg_starttry),
2968 sayNO_FINAL; /* Cannot match: too short. */
2970 PL_reginput = locinput; /* put where regtry can find it */
2971 sayYES_FINAL; /* Success! */
2973 PL_reginput = locinput; /* put where regtry can find it */
2974 sayYES_LOUD; /* Success! */
2977 PL_reginput = locinput;
2982 if (UTF) { /* XXXX This is absolutely
2983 broken, we read before
2985 s = HOPMAYBEc(locinput, -scan->flags);
2991 if (locinput < PL_bostr + scan->flags)
2993 PL_reginput = locinput - scan->flags;
2998 PL_reginput = locinput;
3003 if (UTF) { /* XXXX This is absolutely
3004 broken, we read before
3006 s = HOPMAYBEc(locinput, -scan->flags);
3007 if (!s || s < PL_bostr)
3012 if (locinput < PL_bostr + scan->flags)
3014 PL_reginput = locinput - scan->flags;
3019 PL_reginput = locinput;
3022 inner = NEXTOPER(NEXTOPER(scan));
3023 if (regmatch(inner) != n) {
3038 if (OP(scan) == SUSPEND) {
3039 locinput = PL_reginput;
3040 nextchr = UCHARAT(locinput);
3045 next = scan + ARG(scan);
3050 PerlIO_printf(Perl_error_log, "%lx %d\n",
3051 (unsigned long)scan, OP(scan));
3052 Perl_croak(aTHX_ "regexp memory corruption");
3058 * We get here only if there's trouble -- normally "case END" is
3059 * the terminating point.
3061 Perl_croak(aTHX_ "corrupted regexp pointers");
3067 PerlIO_printf(Perl_debug_log,
3068 "%*s %scould match...%s\n",
3069 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3073 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3074 PL_colors[4],PL_colors[5]));
3083 PerlIO_printf(Perl_debug_log,
3084 "%*s %sfailed...%s\n",
3085 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3097 - regrepeat - repeatedly match something simple, report how many
3100 * [This routine now assumes that it will only match on things of length 1.
3101 * That was true before, but now we assume scan - reginput is the count,
3102 * rather than incrementing count on every character. [Er, except utf8.]]
3105 S_regrepeat(pTHX_ regnode *p, I32 max)
3108 register char *scan;
3109 register char *opnd;
3111 register char *loceol = PL_regeol;
3112 register I32 hardcount = 0;
3115 if (max != REG_INFTY && max < loceol - scan)
3116 loceol = scan + max;
3119 while (scan < loceol && *scan != '\n')
3127 while (scan < loceol && *scan != '\n') {
3128 scan += UTF8SKIP(scan);
3134 while (scan < loceol) {
3135 scan += UTF8SKIP(scan);
3139 case EXACT: /* length of string is 1 */
3141 while (scan < loceol && UCHARAT(scan) == c)
3144 case EXACTF: /* length of string is 1 */
3146 while (scan < loceol &&
3147 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3150 case EXACTFL: /* length of string is 1 */
3151 PL_reg_flags |= RF_tainted;
3153 while (scan < loceol &&
3154 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3159 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3160 scan += UTF8SKIP(scan);
3166 while (scan < loceol && REGINCLASS(opnd, *scan))
3170 while (scan < loceol && isALNUM(*scan))
3175 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3176 scan += UTF8SKIP(scan);
3181 PL_reg_flags |= RF_tainted;
3182 while (scan < loceol && isALNUM_LC(*scan))
3186 PL_reg_flags |= RF_tainted;
3188 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3189 scan += UTF8SKIP(scan);
3195 while (scan < loceol && !isALNUM(*scan))
3200 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3201 scan += UTF8SKIP(scan);
3206 PL_reg_flags |= RF_tainted;
3207 while (scan < loceol && !isALNUM_LC(*scan))
3211 PL_reg_flags |= RF_tainted;
3213 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3214 scan += UTF8SKIP(scan);
3219 while (scan < loceol && isSPACE(*scan))
3224 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3225 scan += UTF8SKIP(scan);
3230 PL_reg_flags |= RF_tainted;
3231 while (scan < loceol && isSPACE_LC(*scan))
3235 PL_reg_flags |= RF_tainted;
3237 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3238 scan += UTF8SKIP(scan);
3243 while (scan < loceol && !isSPACE(*scan))
3248 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3249 scan += UTF8SKIP(scan);
3254 PL_reg_flags |= RF_tainted;
3255 while (scan < loceol && !isSPACE_LC(*scan))
3259 PL_reg_flags |= RF_tainted;
3261 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3262 scan += UTF8SKIP(scan);
3267 while (scan < loceol && isDIGIT(*scan))
3272 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3273 scan += UTF8SKIP(scan);
3279 while (scan < loceol && !isDIGIT(*scan))
3284 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3285 scan += UTF8SKIP(scan);
3289 default: /* Called on something of 0 width. */
3290 break; /* So match right here or not at all. */
3296 c = scan - PL_reginput;
3301 SV *prop = sv_newmortal();
3304 PerlIO_printf(Perl_debug_log,
3305 "%*s %s can match %ld times out of %ld...\n",
3306 REPORT_CODE_OFF+1, "", SvPVX(prop),c,max);
3313 - regrepeat_hard - repeatedly match something, report total lenth and length
3315 * The repeater is supposed to have constant length.
3319 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3322 register char *scan;
3323 register char *start;
3324 register char *loceol = PL_regeol;
3326 I32 count = 0, res = 1;
3331 start = PL_reginput;
3333 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3336 while (start < PL_reginput) {
3338 start += UTF8SKIP(start);
3349 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3351 *lp = l = PL_reginput - start;
3352 if (max != REG_INFTY && l*max < loceol - scan)
3353 loceol = scan + l*max;
3366 - reginclass - determine if a character falls into a character class
3370 S_reginclass(pTHX_ register char *p, register I32 c)
3373 char flags = ANYOF_FLAGS(p);
3377 if (ANYOF_BITMAP_TEST(p, c))
3379 else if (flags & ANYOF_FOLD) {
3381 if (flags & ANYOF_LOCALE) {
3382 PL_reg_flags |= RF_tainted;
3383 cf = PL_fold_locale[c];
3387 if (ANYOF_BITMAP_TEST(p, cf))
3391 if (!match && (flags & ANYOF_CLASS)) {
3392 PL_reg_flags |= RF_tainted;
3394 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3395 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3396 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3397 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3398 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3399 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3400 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3401 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3402 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3403 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3404 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3405 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3406 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3407 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3408 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3409 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3410 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3411 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3412 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3413 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3414 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3415 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3416 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3417 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3418 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3419 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3420 ) /* How's that for a conditional? */
3426 return (flags & ANYOF_INVERT) ? !match : match;
3430 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3433 char flags = ARG1(f);
3435 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3437 if (swash_fetch(sv, p))
3439 else if (flags & ANYOF_FOLD) {
3442 if (flags & ANYOF_LOCALE) {
3443 PL_reg_flags |= RF_tainted;
3444 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3447 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3448 if (swash_fetch(sv, tmpbuf))
3452 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3454 return (flags & ANYOF_INVERT) ? !match : match;
3458 S_reghop(pTHX_ U8 *s, I32 off)
3462 while (off-- && s < (U8*)PL_regeol)
3467 if (s > (U8*)PL_bostr) {
3470 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3472 } /* XXX could check well-formedness here */
3480 S_reghopmaybe(pTHX_ U8* s, I32 off)
3484 while (off-- && s < (U8*)PL_regeol)
3491 if (s > (U8*)PL_bostr) {
3494 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3496 } /* XXX could check well-formedness here */
3513 restore_pos(pTHXo_ void *arg)
3516 if (PL_reg_eval_set) {
3517 if (PL_reg_oldsaved) {
3518 PL_reg_re->subbeg = PL_reg_oldsaved;
3519 PL_reg_re->sublen = PL_reg_oldsavedlen;
3520 RX_MATCH_COPIED_on(PL_reg_re);
3522 PL_reg_magic->mg_len = PL_reg_oldpos;
3523 PL_reg_eval_set = 0;
3524 PL_curpm = PL_reg_oldcurpm;