5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%i\n", \
150 PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%i..%i\n", \
155 lastcp, PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%d..\\%d to undef\n",
190 *PL_reglastparen + 1, PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* A failure to find a constant substring means that there is no need to make
279 an expensive call to REx engine, thus we celebrate a failure. Similarly,
280 finding a substring too deep into the string means that less calls to
281 regtry() should be needed.
283 REx compiler's optimizer found 4 possible hints:
284 a) Anchored substring;
286 c) Whether we are anchored (beginning-of-line or \G);
287 d) First node (of those at offset 0) which may distingush positions;
288 We use 'a', 'b', multiline-part of 'c', and try to find a position in the
289 string which does not contradict any of them.
293 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
294 char *strend, U32 flags, re_scream_pos_data *data)
296 register I32 start_shift;
297 /* Should be nonnegative! */
298 register I32 end_shift;
304 register char *other_last = Nullch;
306 char *i_strpos = strpos;
309 DEBUG_r( if (!PL_colorset) reginitcolors() );
310 DEBUG_r(PerlIO_printf(Perl_debug_log,
311 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
312 PL_colors[4],PL_colors[5],PL_colors[0],
315 (strlen(prog->precomp) > 60 ? "..." : ""),
317 (int)(strend - strpos > 60 ? 60 : strend - strpos),
318 strpos, PL_colors[1],
319 (strend - strpos > 60 ? "..." : ""))
322 if (prog->minlen > strend - strpos) {
323 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
326 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
327 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
328 || ( (prog->reganch & ROPT_ANCH_BOL)
329 && !PL_multiline ) ); /* Check after \n? */
331 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
332 /* Substring at constant offset from beg-of-str... */
335 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
336 && (sv && (strpos + SvCUR(sv) != strend)) ) {
337 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
340 PL_regeol = strend; /* Used in HOP() */
341 s = HOPc(strpos, prog->check_offset_min);
342 if (SvTAIL(prog->check_substr)) {
343 slen = SvCUR(prog->check_substr); /* >= 1 */
345 if ( strend - s > slen || strend - s < slen - 1
346 || (strend - s == slen && strend[-1] != '\n')) {
347 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
350 /* Now should match s[0..slen-2] */
352 if (slen && (*SvPVX(prog->check_substr) != *s
354 && memNE(SvPVX(prog->check_substr), s, slen)))) {
356 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
360 else if (*SvPVX(prog->check_substr) != *s
361 || ((slen = SvCUR(prog->check_substr)) > 1
362 && memNE(SvPVX(prog->check_substr), s, slen)))
364 goto success_at_start;
366 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
368 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
369 /* Should be nonnegative! */
370 end_shift = prog->minlen - start_shift -
371 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
373 I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr)
374 - (SvTAIL(prog->check_substr) != 0);
375 I32 eshift = strend - s - end;
377 if (end_shift < eshift)
381 else { /* Can match at random position */
384 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
385 /* Should be nonnegative! */
386 end_shift = prog->minlen - start_shift -
387 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
390 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
392 Perl_croak(aTHX_ "panic: end_shift");
395 check = prog->check_substr;
397 /* Find a possible match in the region s..strend by looking for
398 the "check" substring in the region corrected by start/end_shift. */
399 if (flags & REXEC_SCREAM) {
400 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
401 I32 p = -1; /* Internal iterator of scream. */
402 I32 *pp = data ? data->scream_pos : &p;
404 if (PL_screamfirst[BmRARE(check)] >= 0
405 || ( BmRARE(check) == '\n'
406 && (BmPREVIOUS(check) == SvCUR(check) - 1)
408 s = screaminstr(sv, check,
409 start_shift + (s - strbeg), end_shift, pp, 0);
413 *data->scream_olds = s;
416 s = fbm_instr((unsigned char*)s + start_shift,
417 (unsigned char*)strend - end_shift,
418 check, PL_multiline ? FBMrf_MULTILINE : 0);
420 /* Update the count-of-usability, remove useless subpatterns,
423 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
424 (s ? "Found" : "Did not find"),
425 ((check == prog->anchored_substr) ? "anchored" : "floating"),
427 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
429 PL_colors[1], (SvTAIL(check) ? "$" : ""),
430 (s ? " at offset " : "...\n") ) );
435 /* Finish the diagnostic message */
436 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
438 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
439 Start with the other substr.
440 XXXX no SCREAM optimization yet - and a very coarse implementation
441 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
442 *always* match. Probably should be marked during compile...
443 Probably it is right to do no SCREAM here...
446 if (prog->float_substr && prog->anchored_substr) {
447 /* Take into account the "other" substring. */
448 /* XXXX May be hopelessly wrong for UTF... */
450 other_last = strpos - 1;
451 if (check == prog->float_substr) {
454 char *last = s - start_shift, *last1, *last2;
458 t = s - prog->check_offset_max;
459 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
460 && (!(prog->reganch & ROPT_UTF8)
461 || (PL_bostr = strpos, /* Used in regcopmaybe() */
462 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
467 t += prog->anchored_offset;
471 last2 = last1 = strend - prog->minlen;
474 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
475 /* On end-of-str: see comment below. */
476 s = fbm_instr((unsigned char*)t,
477 (unsigned char*)last1 + prog->anchored_offset
478 + SvCUR(prog->anchored_substr)
479 - (SvTAIL(prog->anchored_substr)!=0),
480 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
481 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
482 (s ? "Found" : "Contradicts"),
484 (int)(SvCUR(prog->anchored_substr)
485 - (SvTAIL(prog->anchored_substr)!=0)),
486 SvPVX(prog->anchored_substr),
487 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
489 if (last1 >= last2) {
490 DEBUG_r(PerlIO_printf(Perl_debug_log,
491 ", giving up...\n"));
494 DEBUG_r(PerlIO_printf(Perl_debug_log,
495 ", trying floating at offset %ld...\n",
496 (long)(s1 + 1 - i_strpos)));
497 PL_regeol = strend; /* Used in HOP() */
498 other_last = last1 + prog->anchored_offset;
503 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
504 (long)(s - i_strpos)));
505 t = s - prog->anchored_offset;
514 else { /* Take into account the floating substring. */
519 last1 = last = strend - prog->minlen + prog->float_min_offset;
520 if (last - t > prog->float_max_offset)
521 last = t + prog->float_max_offset;
522 s = t + prog->float_min_offset;
525 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
526 /* fbm_instr() takes into account exact value of end-of-str
527 if the check is SvTAIL(ed). Since false positives are OK,
528 and end-of-str is not later than strend we are OK. */
529 s = fbm_instr((unsigned char*)s,
530 (unsigned char*)last + SvCUR(prog->float_substr)
531 - (SvTAIL(prog->float_substr)!=0),
532 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
533 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
534 (s ? "Found" : "Contradicts"),
536 (int)(SvCUR(prog->float_substr)
537 - (SvTAIL(prog->float_substr)!=0)),
538 SvPVX(prog->float_substr),
539 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
542 DEBUG_r(PerlIO_printf(Perl_debug_log,
543 ", giving up...\n"));
546 DEBUG_r(PerlIO_printf(Perl_debug_log,
547 ", trying anchored starting at offset %ld...\n",
548 (long)(s1 + 1 - i_strpos)));
550 PL_regeol = strend; /* Used in HOP() */
555 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
556 (long)(s - i_strpos)));
566 t = s - prog->check_offset_max;
568 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
569 && (!(prog->reganch & ROPT_UTF8)
570 || (PL_bostr = strpos, /* Used in regcopmaybe() */
571 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
574 /* Fixed substring is found far enough so that the match
575 cannot start at strpos. */
577 if (ml_anch && t[-1] != '\n') {
578 /* Eventually fbm_*() should handle this, but often
579 anchored_offset is not 0, so this check will not be wasted. */
580 /* XXXX In the code below we prefer to look for "^" even in
581 presence of anchored substrings. And we search even
582 beyond the found float position. These pessimizations
583 are historical artefacts only. */
585 while (t < strend - prog->minlen) {
587 if (t < s - prog->check_offset_min) {
588 if (prog->anchored_substr) {
589 /* We definitely contradict the found anchored
590 substr. Due to the above check we do not
591 contradict "check" substr.
592 Thus we can arrive here only if check substr
593 is float. Redo checking for "other"=="fixed".
596 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
597 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
598 goto do_other_anchored;
601 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
602 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
605 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
606 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
612 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
613 PL_colors[0],PL_colors[1]));
618 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
622 /* The found string does not prohibit matching at beg-of-str
623 - no optimization of calling REx engine can be performed,
624 unless it was an MBOL and we are not after MBOL. */
626 /* Even in this situation we may use MBOL flag if strpos is offset
627 wrt the start of the string. */
629 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') {
633 DEBUG_r( if (ml_anch)
634 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
635 PL_colors[0],PL_colors[1]);
638 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
639 && --BmUSEFUL(prog->check_substr) < 0
640 && prog->check_substr == prog->float_substr) { /* boo */
641 /* If flags & SOMETHING - do not do it many times on the same match */
642 SvREFCNT_dec(prog->check_substr);
643 prog->check_substr = Nullsv; /* disable */
644 prog->float_substr = Nullsv; /* clear */
646 prog->reganch &= ~RE_USE_INTUIT;
652 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
653 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
656 fail_finish: /* Substring not found */
657 BmUSEFUL(prog->check_substr) += 5; /* hooray */
659 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
660 PL_colors[4],PL_colors[5]));
665 - regexec_flags - match a regexp against a string
668 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
669 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
670 /* strend: pointer to null at end of string */
671 /* strbeg: real beginning of string */
672 /* minend: end of match must be >=minend after stringarg. */
673 /* data: May be used for some additional optimizations. */
674 /* nosave: For optimizations. */
679 register char *startpos = stringarg;
681 I32 minlen; /* must match at least this many chars */
682 I32 dontbother = 0; /* how many characters not to try at end */
683 I32 start_shift = 0; /* Offset of the start to find
684 constant substr. */ /* CC */
685 I32 end_shift = 0; /* Same for the end. */ /* CC */
686 I32 scream_pos = -1; /* Internal iterator of scream. */
688 SV* oreplsv = GvSV(PL_replgv);
694 PL_regnarrate = PL_debug & 512;
698 if (prog == NULL || startpos == NULL) {
699 Perl_croak(aTHX_ "NULL regexp parameter");
703 minlen = prog->minlen;
704 if (strend - startpos < minlen) goto phooey;
706 if (startpos == strbeg) /* is ^ valid at stringarg? */
709 PL_regprev = (U32)stringarg[-1];
710 if (!PL_multiline && PL_regprev == '\n')
711 PL_regprev = '\0'; /* force ^ to NOT match */
714 /* Check validity of program. */
715 if (UCHARAT(prog->program) != REG_MAGIC) {
716 Perl_croak(aTHX_ "corrupted regexp program");
723 if (prog->reganch & ROPT_UTF8)
724 PL_reg_flags |= RF_utf8;
726 /* Mark beginning of line for ^ and lookbehind. */
727 PL_regbol = startpos;
731 /* Mark end of line for $ (and such) */
734 /* see how far we have to get to not match where we matched before */
735 PL_regtill = startpos+minend;
737 /* We start without call_cc context. */
740 /* If there is a "must appear" string, look for it. */
743 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
746 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
747 PL_reg_ganch = startpos;
748 else if (sv && SvTYPE(sv) >= SVt_PVMG
750 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
751 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
752 if (prog->reganch & ROPT_ANCH_GPOS) {
753 if (s > PL_reg_ganch)
758 else /* pos() not defined */
759 PL_reg_ganch = strbeg;
762 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
763 re_scream_pos_data d;
765 d.scream_olds = &scream_olds;
766 d.scream_pos = &scream_pos;
767 s = re_intuit_start(prog, sv, s, strend, flags, &d);
769 goto phooey; /* not present */
772 DEBUG_r( if (!PL_colorset) reginitcolors() );
773 DEBUG_r(PerlIO_printf(Perl_debug_log,
774 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
775 PL_colors[4],PL_colors[5],PL_colors[0],
778 (strlen(prog->precomp) > 60 ? "..." : ""),
780 (int)(strend - startpos > 60 ? 60 : strend - startpos),
781 startpos, PL_colors[1],
782 (strend - startpos > 60 ? "..." : ""))
785 /* Simplest case: anchored match need be tried only once. */
786 /* [unless only anchor is BOL and multiline is set] */
787 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
788 if (s == startpos && regtry(prog, startpos))
790 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
791 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
796 dontbother = minlen - 1;
797 end = HOPc(strend, -dontbother) - 1;
798 /* for multiline we only have to try after newlines */
799 if (prog->check_substr) {
808 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
816 if (*s++ == '\n') { /* don't need PL_utf8skip here */
824 } else if (prog->reganch & ROPT_ANCH_GPOS) {
825 if (regtry(prog, PL_reg_ganch))
830 /* Messy cases: unanchored match. */
831 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
832 /* we have /x+whatever/ */
833 /* it must be a one character string (XXXX Except UTF?) */
834 char ch = SvPVX(prog->anchored_substr)[0];
838 if (regtry(prog, s)) goto got_it;
840 while (s < strend && *s == ch)
849 if (regtry(prog, s)) goto got_it;
851 while (s < strend && *s == ch)
859 else if (prog->anchored_substr != Nullsv
860 || (prog->float_substr != Nullsv
861 && prog->float_max_offset < strend - s)) {
862 SV *must = prog->anchored_substr
863 ? prog->anchored_substr : prog->float_substr;
865 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
867 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
868 I32 delta = back_max - back_min;
869 char *last = HOPc(strend, /* Cannot start after this */
870 -(I32)(CHR_SVLEN(must)
871 - (SvTAIL(must) != 0) + back_min));
872 char *last1; /* Last position checked before */
877 last1 = s - 1; /* bogus */
879 /* XXXX check_substr already used to find `s', can optimize if
880 check_substr==must. */
882 dontbother = end_shift;
883 strend = HOPc(strend, -dontbother);
884 while ( (s <= last) &&
885 ((flags & REXEC_SCREAM)
886 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
887 end_shift, &scream_pos, 0))
888 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
889 (unsigned char*)strend, must,
890 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
891 if (HOPc(s, -back_max) > last1) {
892 last1 = HOPc(s, -back_min);
893 s = HOPc(s, -back_max);
896 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
898 last1 = HOPc(s, -back_min);
918 else if (c = prog->regstclass) {
919 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
927 dontbother = minlen - 1;
928 strend = HOPc(strend, -dontbother); /* don't bother with what can't match */
930 /* We know what class it must start with. */
934 if (REGINCLASSUTF8(c, (U8*)s)) {
935 if (tmp && regtry(prog, s))
947 if (REGINCLASS(c, *s)) {
948 if (tmp && regtry(prog, s))
968 c2 = PL_fold_locale[c1];
972 /* Here it is NOT UTF! */
976 && (ln == 1 || (OP(c) == EXACTF
978 : ibcmp_locale(s, m, ln)))
985 if ( (*s == c1 || *s == c2)
986 && (ln == 1 || (OP(c) == EXACTF
988 : ibcmp_locale(s, m, ln)))
996 PL_reg_flags |= RF_tainted;
1003 tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev;
1004 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1005 while (s < strend) {
1006 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1008 if (regtry(prog, s))
1013 if ((minlen || tmp) && regtry(prog,s))
1017 PL_reg_flags |= RF_tainted;
1022 strend = reghop_c(strend, -1);
1024 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev;
1025 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
1026 while (s < strend) {
1027 if (tmp == !(OP(c) == BOUND ?
1028 swash_fetch(PL_utf8_alnum, (U8*)s) :
1029 isALNUM_LC_utf8((U8*)s)))
1032 if (regtry(prog, s))
1037 if ((minlen || tmp) && regtry(prog,s))
1041 PL_reg_flags |= RF_tainted;
1048 tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev;
1049 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1050 while (s < strend) {
1051 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1053 else if (regtry(prog, s))
1057 if ((minlen || !tmp) && regtry(prog,s))
1061 PL_reg_flags |= RF_tainted;
1066 strend = reghop_c(strend, -1);
1068 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev;
1069 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
1070 while (s < strend) {
1071 if (tmp == !(OP(c) == NBOUND ?
1072 swash_fetch(PL_utf8_alnum, (U8*)s) :
1073 isALNUM_LC_utf8((U8*)s)))
1075 else if (regtry(prog, s))
1079 if ((minlen || !tmp) && regtry(prog,s))
1083 while (s < strend) {
1085 if (tmp && regtry(prog, s))
1096 while (s < strend) {
1097 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1098 if (tmp && regtry(prog, s))
1109 PL_reg_flags |= RF_tainted;
1110 while (s < strend) {
1111 if (isALNUM_LC(*s)) {
1112 if (tmp && regtry(prog, s))
1123 PL_reg_flags |= RF_tainted;
1124 while (s < strend) {
1125 if (isALNUM_LC_utf8((U8*)s)) {
1126 if (tmp && regtry(prog, s))
1137 while (s < strend) {
1139 if (tmp && regtry(prog, s))
1150 while (s < strend) {
1151 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1152 if (tmp && regtry(prog, s))
1163 PL_reg_flags |= RF_tainted;
1164 while (s < strend) {
1165 if (!isALNUM_LC(*s)) {
1166 if (tmp && regtry(prog, s))
1177 PL_reg_flags |= RF_tainted;
1178 while (s < strend) {
1179 if (!isALNUM_LC_utf8((U8*)s)) {
1180 if (tmp && regtry(prog, s))
1191 while (s < strend) {
1193 if (tmp && regtry(prog, s))
1204 while (s < strend) {
1205 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1206 if (tmp && regtry(prog, s))
1217 PL_reg_flags |= RF_tainted;
1218 while (s < strend) {
1219 if (isSPACE_LC(*s)) {
1220 if (tmp && regtry(prog, s))
1231 PL_reg_flags |= RF_tainted;
1232 while (s < strend) {
1233 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1234 if (tmp && regtry(prog, s))
1245 while (s < strend) {
1247 if (tmp && regtry(prog, s))
1258 while (s < strend) {
1259 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1260 if (tmp && regtry(prog, s))
1271 PL_reg_flags |= RF_tainted;
1272 while (s < strend) {
1273 if (!isSPACE_LC(*s)) {
1274 if (tmp && regtry(prog, s))
1285 PL_reg_flags |= RF_tainted;
1286 while (s < strend) {
1287 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1288 if (tmp && regtry(prog, s))
1299 while (s < strend) {
1301 if (tmp && regtry(prog, s))
1312 while (s < strend) {
1313 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1314 if (tmp && regtry(prog, s))
1325 PL_reg_flags |= RF_tainted;
1326 while (s < strend) {
1327 if (isDIGIT_LC(*s)) {
1328 if (tmp && regtry(prog, s))
1339 PL_reg_flags |= RF_tainted;
1340 while (s < strend) {
1341 if (isDIGIT_LC_utf8((U8*)s)) {
1342 if (tmp && regtry(prog, s))
1353 while (s < strend) {
1355 if (tmp && regtry(prog, s))
1366 while (s < strend) {
1367 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1368 if (tmp && regtry(prog, s))
1379 PL_reg_flags |= RF_tainted;
1380 while (s < strend) {
1381 if (!isDIGIT_LC(*s)) {
1382 if (tmp && regtry(prog, s))
1393 PL_reg_flags |= RF_tainted;
1394 while (s < strend) {
1395 if (!isDIGIT_LC_utf8((U8*)s)) {
1396 if (tmp && regtry(prog, s))
1407 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1413 if (prog->float_substr != Nullsv) { /* Trim the end. */
1415 I32 oldpos = scream_pos;
1417 if (flags & REXEC_SCREAM) {
1418 last = screaminstr(sv, prog->float_substr, s - strbeg,
1419 end_shift, &scream_pos, 1); /* last one */
1421 last = scream_olds; /* Only one occurence. */
1425 char *little = SvPV(prog->float_substr, len);
1427 if (SvTAIL(prog->float_substr)) {
1428 if (memEQ(strend - len + 1, little, len - 1))
1429 last = strend - len + 1;
1430 else if (!PL_multiline)
1431 last = memEQ(strend - len, little, len)
1432 ? strend - len : Nullch;
1438 last = rninstr(s, strend, little, little + len);
1440 last = strend; /* matching `$' */
1443 if (last == NULL) goto phooey; /* Should not happen! */
1444 dontbother = strend - last + prog->float_min_offset;
1446 if (minlen && (dontbother < minlen))
1447 dontbother = minlen - 1;
1448 strend -= dontbother; /* this one's always in bytes! */
1449 /* We don't know much -- general case. */
1452 if (regtry(prog, s))
1461 if (regtry(prog, s))
1463 } while (s++ < strend);
1471 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1473 if (PL_reg_eval_set) {
1474 /* Preserve the current value of $^R */
1475 if (oreplsv != GvSV(PL_replgv))
1476 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1477 restored, the value remains
1479 restore_pos(aTHXo_ 0);
1482 /* make sure $`, $&, $', and $digit will work later */
1483 if ( !(flags & REXEC_NOT_FIRST) ) {
1484 if (RX_MATCH_COPIED(prog)) {
1485 Safefree(prog->subbeg);
1486 RX_MATCH_COPIED_off(prog);
1488 if (flags & REXEC_COPY_STR) {
1489 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1491 s = savepvn(strbeg, i);
1494 RX_MATCH_COPIED_on(prog);
1497 prog->subbeg = strbeg;
1498 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1505 if (PL_reg_eval_set)
1506 restore_pos(aTHXo_ 0);
1511 - regtry - try match at specific point
1513 STATIC I32 /* 0 failure, 1 success */
1514 S_regtry(pTHX_ regexp *prog, char *startpos)
1522 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1525 PL_reg_eval_set = RS_init;
1527 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1528 (IV)(PL_stack_sp - PL_stack_base));
1530 SAVEINT(cxstack[cxstack_ix].blk_oldsp);
1531 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1532 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1534 /* Apparently this is not needed, judging by wantarray. */
1535 /* SAVEINT(cxstack[cxstack_ix].blk_gimme);
1536 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1539 /* Make $_ available to executed code. */
1540 if (PL_reg_sv != DEFSV) {
1541 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1546 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1547 && (mg = mg_find(PL_reg_sv, 'g')))) {
1548 /* prepare for quick setting of pos */
1549 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1550 mg = mg_find(PL_reg_sv, 'g');
1554 PL_reg_oldpos = mg->mg_len;
1555 SAVEDESTRUCTOR_X(restore_pos, 0);
1558 New(22,PL_reg_curpm, 1, PMOP);
1559 PL_reg_curpm->op_pmregexp = prog;
1560 PL_reg_oldcurpm = PL_curpm;
1561 PL_curpm = PL_reg_curpm;
1562 if (RX_MATCH_COPIED(prog)) {
1563 /* Here is a serious problem: we cannot rewrite subbeg,
1564 since it may be needed if this match fails. Thus
1565 $` inside (?{}) could fail... */
1566 PL_reg_oldsaved = prog->subbeg;
1567 PL_reg_oldsavedlen = prog->sublen;
1568 RX_MATCH_COPIED_off(prog);
1571 PL_reg_oldsaved = Nullch;
1572 prog->subbeg = PL_bostr;
1573 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1575 prog->startp[0] = startpos - PL_bostr;
1576 PL_reginput = startpos;
1577 PL_regstartp = prog->startp;
1578 PL_regendp = prog->endp;
1579 PL_reglastparen = &prog->lastparen;
1580 prog->lastparen = 0;
1582 DEBUG_r(PL_reg_starttry = startpos);
1583 if (PL_reg_start_tmpl <= prog->nparens) {
1584 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1585 if(PL_reg_start_tmp)
1586 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1588 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1591 /* XXXX What this code is doing here?!!! There should be no need
1592 to do this again and again, PL_reglastparen should take care of
1596 if (prog->nparens) {
1597 for (i = prog->nparens; i >= 1; i--) {
1603 if (regmatch(prog->program + 1)) {
1604 prog->endp[0] = PL_reginput - PL_bostr;
1612 - regmatch - main matching routine
1614 * Conceptually the strategy is simple: check to see whether the current
1615 * node matches, call self recursively to see whether the rest matches,
1616 * and then act accordingly. In practice we make some effort to avoid
1617 * recursion, in particular by going through "ordinary" nodes (that don't
1618 * need to know whether the rest of the match failed) by a loop instead of
1621 /* [lwall] I've hoisted the register declarations to the outer block in order to
1622 * maybe save a little bit of pushing and popping on the stack. It also takes
1623 * advantage of machines that use a register save mask on subroutine entry.
1625 STATIC I32 /* 0 failure, 1 success */
1626 S_regmatch(pTHX_ regnode *prog)
1629 register regnode *scan; /* Current node. */
1630 regnode *next; /* Next node. */
1631 regnode *inner; /* Next node in internal branch. */
1632 register I32 nextchr; /* renamed nextchr - nextchar colides with
1633 function of same name */
1634 register I32 n; /* no or next */
1635 register I32 ln; /* len or last */
1636 register char *s; /* operand or save */
1637 register char *locinput = PL_reginput;
1638 register I32 c1, c2, paren; /* case fold search, parenth */
1639 int minmod = 0, sw = 0, logical = 0;
1644 /* Note that nextchr is a byte even in UTF */
1645 nextchr = UCHARAT(locinput);
1647 while (scan != NULL) {
1648 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1650 # define sayYES goto yes
1651 # define sayNO goto no
1652 # define sayYES_FINAL goto yes_final
1653 # define sayYES_LOUD goto yes_loud
1654 # define sayNO_FINAL goto no_final
1655 # define sayNO_SILENT goto do_no
1656 # define saySAME(x) if (x) goto yes; else goto no
1657 # define REPORT_CODE_OFF 24
1659 # define sayYES return 1
1660 # define sayNO return 0
1661 # define sayYES_FINAL return 1
1662 # define sayYES_LOUD return 1
1663 # define sayNO_FINAL return 0
1664 # define sayNO_SILENT return 0
1665 # define saySAME(x) return x
1668 SV *prop = sv_newmortal();
1669 int docolor = *PL_colors[0];
1670 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1671 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1672 /* The part of the string before starttry has one color
1673 (pref0_len chars), between starttry and current
1674 position another one (pref_len - pref0_len chars),
1675 after the current position the third one.
1676 We assume that pref0_len <= pref_len, otherwise we
1677 decrease pref0_len. */
1678 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1679 ? (5 + taill) - l : locinput - PL_bostr);
1680 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1682 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1683 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1684 ? (5 + taill) - pref_len : PL_regeol - locinput);
1687 if (pref0_len > pref_len)
1688 pref0_len = pref_len;
1689 regprop(prop, scan);
1690 PerlIO_printf(Perl_debug_log,
1691 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1692 (IV)(locinput - PL_bostr),
1693 PL_colors[4], pref0_len,
1694 locinput - pref_len, PL_colors[5],
1695 PL_colors[2], pref_len - pref0_len,
1696 locinput - pref_len + pref0_len, PL_colors[3],
1697 (docolor ? "" : "> <"),
1698 PL_colors[0], l, locinput, PL_colors[1],
1699 15 - l - pref_len + 1,
1701 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1705 next = scan + NEXT_OFF(scan);
1711 if (locinput == PL_bostr
1712 ? PL_regprev == '\n'
1714 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1716 /* regtill = regbol; */
1721 if (locinput == PL_bostr
1722 ? PL_regprev == '\n'
1723 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1729 if (locinput == PL_regbol && PL_regprev == '\n')
1733 if (locinput == PL_reg_ganch)
1743 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1748 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1750 if (PL_regeol - locinput > 1)
1754 if (PL_regeol != locinput)
1758 if (nextchr & 0x80) {
1759 locinput += PL_utf8skip[nextchr];
1760 if (locinput > PL_regeol)
1762 nextchr = UCHARAT(locinput);
1765 if (!nextchr && locinput >= PL_regeol)
1767 nextchr = UCHARAT(++locinput);
1770 if (!nextchr && locinput >= PL_regeol)
1772 nextchr = UCHARAT(++locinput);
1775 if (nextchr & 0x80) {
1776 locinput += PL_utf8skip[nextchr];
1777 if (locinput > PL_regeol)
1779 nextchr = UCHARAT(locinput);
1782 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1784 nextchr = UCHARAT(++locinput);
1787 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1789 nextchr = UCHARAT(++locinput);
1794 /* Inline the first character, for speed. */
1795 if (UCHARAT(s) != nextchr)
1797 if (PL_regeol - locinput < ln)
1799 if (ln > 1 && memNE(s, locinput, ln))
1802 nextchr = UCHARAT(locinput);
1805 PL_reg_flags |= RF_tainted;
1814 c1 = OP(scan) == EXACTF;
1818 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1819 toLOWER_utf8((U8*)l) :
1820 toLOWER_LC_utf8((U8*)l)))
1828 nextchr = UCHARAT(locinput);
1832 /* Inline the first character, for speed. */
1833 if (UCHARAT(s) != nextchr &&
1834 UCHARAT(s) != ((OP(scan) == EXACTF)
1835 ? PL_fold : PL_fold_locale)[nextchr])
1837 if (PL_regeol - locinput < ln)
1839 if (ln > 1 && (OP(scan) == EXACTF
1840 ? ibcmp(s, locinput, ln)
1841 : ibcmp_locale(s, locinput, ln)))
1844 nextchr = UCHARAT(locinput);
1847 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1849 if (locinput >= PL_regeol)
1851 locinput += PL_utf8skip[nextchr];
1852 nextchr = UCHARAT(locinput);
1856 nextchr = UCHARAT(locinput);
1857 if (!REGINCLASS(scan, nextchr))
1859 if (!nextchr && locinput >= PL_regeol)
1861 nextchr = UCHARAT(++locinput);
1864 PL_reg_flags |= RF_tainted;
1869 if (!(OP(scan) == ALNUM
1870 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1872 nextchr = UCHARAT(++locinput);
1875 PL_reg_flags |= RF_tainted;
1880 if (nextchr & 0x80) {
1881 if (!(OP(scan) == ALNUMUTF8
1882 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1883 : isALNUM_LC_utf8((U8*)locinput)))
1887 locinput += PL_utf8skip[nextchr];
1888 nextchr = UCHARAT(locinput);
1891 if (!(OP(scan) == ALNUMUTF8
1892 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1894 nextchr = UCHARAT(++locinput);
1897 PL_reg_flags |= RF_tainted;
1900 if (!nextchr && locinput >= PL_regeol)
1902 if (OP(scan) == NALNUM
1903 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1905 nextchr = UCHARAT(++locinput);
1908 PL_reg_flags |= RF_tainted;
1911 if (!nextchr && locinput >= PL_regeol)
1913 if (nextchr & 0x80) {
1914 if (OP(scan) == NALNUMUTF8
1915 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1916 : isALNUM_LC_utf8((U8*)locinput))
1920 locinput += PL_utf8skip[nextchr];
1921 nextchr = UCHARAT(locinput);
1924 if (OP(scan) == NALNUMUTF8
1925 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1927 nextchr = UCHARAT(++locinput);
1931 PL_reg_flags |= RF_tainted;
1935 /* was last char in word? */
1936 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
1937 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
1939 n = isALNUM(nextchr);
1942 ln = isALNUM_LC(ln);
1943 n = isALNUM_LC(nextchr);
1945 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
1950 PL_reg_flags |= RF_tainted;
1954 /* was last char in word? */
1955 ln = (locinput != PL_regbol)
1956 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
1957 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
1958 ln = isALNUM_uni(ln);
1959 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
1962 ln = isALNUM_LC_uni(ln);
1963 n = isALNUM_LC_utf8((U8*)locinput);
1965 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
1969 PL_reg_flags |= RF_tainted;
1972 if (!nextchr && locinput >= PL_regeol)
1974 if (!(OP(scan) == SPACE
1975 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
1977 nextchr = UCHARAT(++locinput);
1980 PL_reg_flags |= RF_tainted;
1983 if (!nextchr && locinput >= PL_regeol)
1985 if (nextchr & 0x80) {
1986 if (!(OP(scan) == SPACEUTF8
1987 ? swash_fetch(PL_utf8_space,(U8*)locinput)
1988 : isSPACE_LC_utf8((U8*)locinput)))
1992 locinput += PL_utf8skip[nextchr];
1993 nextchr = UCHARAT(locinput);
1996 if (!(OP(scan) == SPACEUTF8
1997 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
1999 nextchr = UCHARAT(++locinput);
2002 PL_reg_flags |= RF_tainted;
2007 if (OP(scan) == SPACE
2008 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2010 nextchr = UCHARAT(++locinput);
2013 PL_reg_flags |= RF_tainted;
2018 if (nextchr & 0x80) {
2019 if (OP(scan) == NSPACEUTF8
2020 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2021 : isSPACE_LC_utf8((U8*)locinput))
2025 locinput += PL_utf8skip[nextchr];
2026 nextchr = UCHARAT(locinput);
2029 if (OP(scan) == NSPACEUTF8
2030 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2032 nextchr = UCHARAT(++locinput);
2035 PL_reg_flags |= RF_tainted;
2038 if (!nextchr && locinput >= PL_regeol)
2040 if (!(OP(scan) == DIGIT
2041 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2043 nextchr = UCHARAT(++locinput);
2046 PL_reg_flags |= RF_tainted;
2051 if (nextchr & 0x80) {
2052 if (OP(scan) == NDIGITUTF8
2053 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2054 : isDIGIT_LC_utf8((U8*)locinput))
2058 locinput += PL_utf8skip[nextchr];
2059 nextchr = UCHARAT(locinput);
2062 if (!isDIGIT(nextchr))
2064 nextchr = UCHARAT(++locinput);
2067 PL_reg_flags |= RF_tainted;
2072 if (OP(scan) == DIGIT
2073 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2075 nextchr = UCHARAT(++locinput);
2078 PL_reg_flags |= RF_tainted;
2081 if (!nextchr && locinput >= PL_regeol)
2083 if (nextchr & 0x80) {
2084 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2086 locinput += PL_utf8skip[nextchr];
2087 nextchr = UCHARAT(locinput);
2090 if (isDIGIT(nextchr))
2092 nextchr = UCHARAT(++locinput);
2095 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2097 locinput += PL_utf8skip[nextchr];
2098 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2099 locinput += UTF8SKIP(locinput);
2100 if (locinput > PL_regeol)
2102 nextchr = UCHARAT(locinput);
2105 PL_reg_flags |= RF_tainted;
2109 n = ARG(scan); /* which paren pair */
2110 ln = PL_regstartp[n];
2111 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2112 if (*PL_reglastparen < n || ln == -1)
2113 sayNO; /* Do not match unless seen CLOSEn. */
2114 if (ln == PL_regendp[n])
2118 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2120 char *e = PL_bostr + PL_regendp[n];
2122 * Note that we can't do the "other character" lookup trick as
2123 * in the 8-bit case (no pun intended) because in Unicode we
2124 * have to map both upper and title case to lower case.
2126 if (OP(scan) == REFF) {
2130 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2140 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2147 nextchr = UCHARAT(locinput);
2151 /* Inline the first character, for speed. */
2152 if (UCHARAT(s) != nextchr &&
2154 (UCHARAT(s) != ((OP(scan) == REFF
2155 ? PL_fold : PL_fold_locale)[nextchr]))))
2157 ln = PL_regendp[n] - ln;
2158 if (locinput + ln > PL_regeol)
2160 if (ln > 1 && (OP(scan) == REF
2161 ? memNE(s, locinput, ln)
2163 ? ibcmp(s, locinput, ln)
2164 : ibcmp_locale(s, locinput, ln))))
2167 nextchr = UCHARAT(locinput);
2178 OP_4tree *oop = PL_op;
2179 COP *ocurcop = PL_curcop;
2180 SV **ocurpad = PL_curpad;
2184 PL_op = (OP_4tree*)PL_regdata->data[n];
2185 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", (UV)PL_op) );
2186 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2187 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2189 CALLRUNOPS(aTHX); /* Scalar context. */
2195 PL_curpad = ocurpad;
2196 PL_curcop = ocurcop;
2198 if (logical == 2) { /* Postponed subexpression. */
2200 MAGIC *mg = Null(MAGIC*);
2202 CHECKPOINT cp, lastcp;
2204 if(SvROK(ret) || SvRMAGICAL(ret)) {
2205 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2208 mg = mg_find(sv, 'r');
2211 re = (regexp *)mg->mg_obj;
2212 (void)ReREFCNT_inc(re);
2216 char *t = SvPV(ret, len);
2218 char *oprecomp = PL_regprecomp;
2219 I32 osize = PL_regsize;
2220 I32 onpar = PL_regnpar;
2223 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2225 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2226 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2227 PL_regprecomp = oprecomp;
2232 PerlIO_printf(Perl_debug_log,
2233 "Entering embedded `%s%.60s%s%s'\n",
2237 (strlen(re->precomp) > 60 ? "..." : ""))
2240 state.prev = PL_reg_call_cc;
2241 state.cc = PL_regcc;
2242 state.re = PL_reg_re;
2246 cp = regcppush(0); /* Save *all* the positions. */
2249 state.ss = PL_savestack_ix;
2250 *PL_reglastparen = 0;
2251 PL_reg_call_cc = &state;
2252 PL_reginput = locinput;
2254 /* XXXX This is too dramatic a measure... */
2257 if (regmatch(re->program + 1)) {
2258 /* Even though we succeeded, we need to restore
2259 global variables, since we may be wrapped inside
2260 SUSPEND, thus the match may be not finished yet. */
2262 /* XXXX Do this only if SUSPENDed? */
2263 PL_reg_call_cc = state.prev;
2264 PL_regcc = state.cc;
2265 PL_reg_re = state.re;
2266 cache_re(PL_reg_re);
2268 /* XXXX This is too dramatic a measure... */
2271 /* These are needed even if not SUSPEND. */
2279 PL_reg_call_cc = state.prev;
2280 PL_regcc = state.cc;
2281 PL_reg_re = state.re;
2282 cache_re(PL_reg_re);
2284 /* XXXX This is too dramatic a measure... */
2293 sv_setsv(save_scalar(PL_replgv), ret);
2297 n = ARG(scan); /* which paren pair */
2298 PL_reg_start_tmp[n] = locinput;
2303 n = ARG(scan); /* which paren pair */
2304 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2305 PL_regendp[n] = locinput - PL_bostr;
2306 if (n > *PL_reglastparen)
2307 *PL_reglastparen = n;
2310 n = ARG(scan); /* which paren pair */
2311 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2314 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2316 next = NEXTOPER(NEXTOPER(scan));
2318 next = scan + ARG(scan);
2319 if (OP(next) == IFTHEN) /* Fake one. */
2320 next = NEXTOPER(NEXTOPER(next));
2324 logical = scan->flags;
2326 /*******************************************************************
2327 PL_regcc contains infoblock about the innermost (...)* loop, and
2328 a pointer to the next outer infoblock.
2330 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2332 1) After matching X, regnode for CURLYX is processed;
2334 2) This regnode creates infoblock on the stack, and calls
2335 regmatch() recursively with the starting point at WHILEM node;
2337 3) Each hit of WHILEM node tries to match A and Z (in the order
2338 depending on the current iteration, min/max of {min,max} and
2339 greediness). The information about where are nodes for "A"
2340 and "Z" is read from the infoblock, as is info on how many times "A"
2341 was already matched, and greediness.
2343 4) After A matches, the same WHILEM node is hit again.
2345 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2346 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2347 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2348 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2349 of the external loop.
2351 Currently present infoblocks form a tree with a stem formed by PL_curcc
2352 and whatever it mentions via ->next, and additional attached trees
2353 corresponding to temporarily unset infoblocks as in "5" above.
2355 In the following picture infoblocks for outer loop of
2356 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2357 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2358 infoblocks are drawn below the "reset" infoblock.
2360 In fact in the picture below we do not show failed matches for Z and T
2361 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2362 more obvious *why* one needs to *temporary* unset infoblocks.]
2364 Matched REx position InfoBlocks Comment
2368 Y A)*?Z)*?T x <- O <- I
2369 YA )*?Z)*?T x <- O <- I
2370 YA A)*?Z)*?T x <- O <- I
2371 YAA )*?Z)*?T x <- O <- I
2372 YAA Z)*?T x <- O # Temporary unset I
2375 YAAZ Y(A)*?Z)*?T x <- O
2378 YAAZY (A)*?Z)*?T x <- O
2381 YAAZY A)*?Z)*?T x <- O <- I
2384 YAAZYA )*?Z)*?T x <- O <- I
2387 YAAZYA Z)*?T x <- O # Temporary unset I
2393 YAAZYAZ T x # Temporary unset O
2400 *******************************************************************/
2403 CHECKPOINT cp = PL_savestack_ix;
2405 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2407 cc.oldcc = PL_regcc;
2409 cc.parenfloor = *PL_reglastparen;
2411 cc.min = ARG1(scan);
2412 cc.max = ARG2(scan);
2413 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2417 PL_reginput = locinput;
2418 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2420 PL_regcc = cc.oldcc;
2426 * This is really hard to understand, because after we match
2427 * what we're trying to match, we must make sure the rest of
2428 * the REx is going to match for sure, and to do that we have
2429 * to go back UP the parse tree by recursing ever deeper. And
2430 * if it fails, we have to reset our parent's current state
2431 * that we can try again after backing off.
2434 CHECKPOINT cp, lastcp;
2435 CURCUR* cc = PL_regcc;
2436 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2438 n = cc->cur + 1; /* how many we know we matched */
2439 PL_reginput = locinput;
2442 PerlIO_printf(Perl_debug_log,
2443 "%*s %ld out of %ld..%ld cc=%lx\n",
2444 REPORT_CODE_OFF+PL_regindent*2, "",
2445 (long)n, (long)cc->min,
2446 (long)cc->max, (long)cc)
2449 /* If degenerate scan matches "", assume scan done. */
2451 if (locinput == cc->lastloc && n >= cc->min) {
2452 PL_regcc = cc->oldcc;
2456 PerlIO_printf(Perl_debug_log,
2457 "%*s empty match detected, try continuation...\n",
2458 REPORT_CODE_OFF+PL_regindent*2, "")
2460 if (regmatch(cc->next))
2468 /* First just match a string of min scans. */
2472 cc->lastloc = locinput;
2473 if (regmatch(cc->scan))
2476 cc->lastloc = lastloc;
2481 /* Check whether we already were at this position.
2482 Postpone detection until we know the match is not
2483 *that* much linear. */
2484 if (!PL_reg_maxiter) {
2485 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2486 PL_reg_leftiter = PL_reg_maxiter;
2488 if (PL_reg_leftiter-- == 0) {
2489 I32 size = (PL_reg_maxiter + 7)/8;
2490 if (PL_reg_poscache) {
2491 if (PL_reg_poscache_size < size) {
2492 Renew(PL_reg_poscache, size, char);
2493 PL_reg_poscache_size = size;
2495 Zero(PL_reg_poscache, size, char);
2498 PL_reg_poscache_size = size;
2499 Newz(29, PL_reg_poscache, size, char);
2502 PerlIO_printf(Perl_debug_log,
2503 "%sDetected a super-linear match, switching on caching%s...\n",
2504 PL_colors[4], PL_colors[5])
2507 if (PL_reg_leftiter < 0) {
2508 I32 o = locinput - PL_bostr, b;
2510 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2513 if (PL_reg_poscache[o] & (1<<b)) {
2515 PerlIO_printf(Perl_debug_log,
2516 "%*s already tried at this position...\n",
2517 REPORT_CODE_OFF+PL_regindent*2, "")
2521 PL_reg_poscache[o] |= (1<<b);
2525 /* Prefer next over scan for minimal matching. */
2528 PL_regcc = cc->oldcc;
2531 cp = regcppush(cc->parenfloor);
2533 if (regmatch(cc->next)) {
2535 sayYES; /* All done. */
2543 if (n >= cc->max) { /* Maximum greed exceeded? */
2544 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2545 && !(PL_reg_flags & RF_warned)) {
2546 PL_reg_flags |= RF_warned;
2547 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2548 "Complex regular subexpression recursion",
2555 PerlIO_printf(Perl_debug_log,
2556 "%*s trying longer...\n",
2557 REPORT_CODE_OFF+PL_regindent*2, "")
2559 /* Try scanning more and see if it helps. */
2560 PL_reginput = locinput;
2562 cc->lastloc = locinput;
2563 cp = regcppush(cc->parenfloor);
2565 if (regmatch(cc->scan)) {
2572 cc->lastloc = lastloc;
2576 /* Prefer scan over next for maximal matching. */
2578 if (n < cc->max) { /* More greed allowed? */
2579 cp = regcppush(cc->parenfloor);
2581 cc->lastloc = locinput;
2583 if (regmatch(cc->scan)) {
2588 regcppop(); /* Restore some previous $<digit>s? */
2589 PL_reginput = locinput;
2591 PerlIO_printf(Perl_debug_log,
2592 "%*s failed, try continuation...\n",
2593 REPORT_CODE_OFF+PL_regindent*2, "")
2596 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2597 && !(PL_reg_flags & RF_warned)) {
2598 PL_reg_flags |= RF_warned;
2599 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2600 "Complex regular subexpression recursion",
2604 /* Failed deeper matches of scan, so see if this one works. */
2605 PL_regcc = cc->oldcc;
2608 if (regmatch(cc->next))
2614 cc->lastloc = lastloc;
2619 next = scan + ARG(scan);
2622 inner = NEXTOPER(NEXTOPER(scan));
2625 inner = NEXTOPER(scan);
2630 if (OP(next) != c1) /* No choice. */
2631 next = inner; /* Avoid recursion. */
2633 int lastparen = *PL_reglastparen;
2637 PL_reginput = locinput;
2638 if (regmatch(inner))
2641 for (n = *PL_reglastparen; n > lastparen; n--)
2643 *PL_reglastparen = n;
2646 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2650 inner = NEXTOPER(scan);
2651 if (c1 == BRANCHJ) {
2652 inner = NEXTOPER(inner);
2654 } while (scan != NULL && OP(scan) == c1);
2668 /* We suppose that the next guy does not need
2669 backtracking: in particular, it is of constant length,
2670 and has no parenths to influence future backrefs. */
2671 ln = ARG1(scan); /* min to match */
2672 n = ARG2(scan); /* max to match */
2673 paren = scan->flags;
2675 if (paren > PL_regsize)
2677 if (paren > *PL_reglastparen)
2678 *PL_reglastparen = paren;
2680 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2682 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2683 PL_reginput = locinput;
2686 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2688 if (ln && l == 0 && n >= ln
2689 /* In fact, this is tricky. If paren, then the
2690 fact that we did/didnot match may influence
2691 future execution. */
2692 && !(paren && ln == 0))
2694 locinput = PL_reginput;
2695 if (PL_regkind[(U8)OP(next)] == EXACT) {
2696 c1 = (U8)*STRING(next);
2697 if (OP(next) == EXACTF)
2699 else if (OP(next) == EXACTFL)
2700 c2 = PL_fold_locale[c1];
2707 /* This may be improved if l == 0. */
2708 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2709 /* If it could work, try it. */
2711 UCHARAT(PL_reginput) == c1 ||
2712 UCHARAT(PL_reginput) == c2)
2716 PL_regstartp[paren] =
2717 HOPc(PL_reginput, -l) - PL_bostr;
2718 PL_regendp[paren] = PL_reginput - PL_bostr;
2721 PL_regendp[paren] = -1;
2727 /* Couldn't or didn't -- move forward. */
2728 PL_reginput = locinput;
2729 if (regrepeat_hard(scan, 1, &l)) {
2731 locinput = PL_reginput;
2738 n = regrepeat_hard(scan, n, &l);
2739 if (n != 0 && l == 0
2740 /* In fact, this is tricky. If paren, then the
2741 fact that we did/didnot match may influence
2742 future execution. */
2743 && !(paren && ln == 0))
2745 locinput = PL_reginput;
2747 PerlIO_printf(Perl_debug_log,
2748 "%*s matched %d times, len=%"IVdf"...\n",
2749 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2753 if (PL_regkind[(U8)OP(next)] == EXACT) {
2754 c1 = (U8)*STRING(next);
2755 if (OP(next) == EXACTF)
2757 else if (OP(next) == EXACTFL)
2758 c2 = PL_fold_locale[c1];
2767 /* If it could work, try it. */
2769 UCHARAT(PL_reginput) == c1 ||
2770 UCHARAT(PL_reginput) == c2)
2773 PerlIO_printf(Perl_debug_log,
2774 "%*s trying tail with n=%"IVdf"...\n",
2775 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2779 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2780 PL_regendp[paren] = PL_reginput - PL_bostr;
2783 PL_regendp[paren] = -1;
2789 /* Couldn't or didn't -- back up. */
2791 locinput = HOPc(locinput, -l);
2792 PL_reginput = locinput;
2799 paren = scan->flags; /* Which paren to set */
2800 if (paren > PL_regsize)
2802 if (paren > *PL_reglastparen)
2803 *PL_reglastparen = paren;
2804 ln = ARG1(scan); /* min to match */
2805 n = ARG2(scan); /* max to match */
2806 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2810 ln = ARG1(scan); /* min to match */
2811 n = ARG2(scan); /* max to match */
2812 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2817 scan = NEXTOPER(scan);
2823 scan = NEXTOPER(scan);
2827 * Lookahead to avoid useless match attempts
2828 * when we know what character comes next.
2830 if (PL_regkind[(U8)OP(next)] == EXACT) {
2831 c1 = (U8)*STRING(next);
2832 if (OP(next) == EXACTF)
2834 else if (OP(next) == EXACTFL)
2835 c2 = PL_fold_locale[c1];
2841 PL_reginput = locinput;
2845 if (ln && regrepeat(scan, ln) < ln)
2847 locinput = PL_reginput;
2850 char *e = locinput + n - ln; /* Should not check after this */
2851 char *old = locinput;
2853 if (e >= PL_regeol || (n == REG_INFTY))
2856 /* Find place 'next' could work */
2858 while (locinput <= e && *locinput != c1)
2861 while (locinput <= e
2868 /* PL_reginput == old now */
2869 if (locinput != old) {
2870 ln = 1; /* Did some */
2871 if (regrepeat(scan, locinput - old) <
2875 /* PL_reginput == locinput now */
2878 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2879 PL_regendp[paren] = locinput - PL_bostr;
2882 PL_regendp[paren] = -1;
2886 PL_reginput = locinput; /* Could be reset... */
2888 /* Couldn't or didn't -- move forward. */
2893 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
2894 /* If it could work, try it. */
2896 UCHARAT(PL_reginput) == c1 ||
2897 UCHARAT(PL_reginput) == c2)
2901 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2902 PL_regendp[paren] = PL_reginput - PL_bostr;
2905 PL_regendp[paren] = -1;
2911 /* Couldn't or didn't -- move forward. */
2912 PL_reginput = locinput;
2913 if (regrepeat(scan, 1)) {
2915 locinput = PL_reginput;
2923 n = regrepeat(scan, n);
2924 locinput = PL_reginput;
2925 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
2926 (!PL_multiline || OP(next) == SEOL))
2927 ln = n; /* why back off? */
2931 /* If it could work, try it. */
2933 UCHARAT(PL_reginput) == c1 ||
2934 UCHARAT(PL_reginput) == c2)
2938 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2939 PL_regendp[paren] = PL_reginput - PL_bostr;
2942 PL_regendp[paren] = -1;
2948 /* Couldn't or didn't -- back up. */
2950 PL_reginput = locinput = HOPc(locinput, -1);
2955 /* If it could work, try it. */
2957 UCHARAT(PL_reginput) == c1 ||
2958 UCHARAT(PL_reginput) == c2)
2964 /* Couldn't or didn't -- back up. */
2966 PL_reginput = locinput = HOPc(locinput, -1);
2973 if (PL_reg_call_cc) {
2974 re_cc_state *cur_call_cc = PL_reg_call_cc;
2975 CURCUR *cctmp = PL_regcc;
2976 regexp *re = PL_reg_re;
2977 CHECKPOINT cp, lastcp;
2979 cp = regcppush(0); /* Save *all* the positions. */
2981 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
2983 PL_reginput = locinput; /* Make position available to
2985 cache_re(PL_reg_call_cc->re);
2986 PL_regcc = PL_reg_call_cc->cc;
2987 PL_reg_call_cc = PL_reg_call_cc->prev;
2988 if (regmatch(cur_call_cc->node)) {
2989 PL_reg_call_cc = cur_call_cc;
2995 PL_reg_call_cc = cur_call_cc;
3001 PerlIO_printf(Perl_debug_log,
3002 "%*s continuation failed...\n",
3003 REPORT_CODE_OFF+PL_regindent*2, "")
3007 if (locinput < PL_regtill) {
3008 DEBUG_r(PerlIO_printf(Perl_debug_log,
3009 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3011 (long)(locinput - PL_reg_starttry),
3012 (long)(PL_regtill - PL_reg_starttry),
3014 sayNO_FINAL; /* Cannot match: too short. */
3016 PL_reginput = locinput; /* put where regtry can find it */
3017 sayYES_FINAL; /* Success! */
3019 PL_reginput = locinput; /* put where regtry can find it */
3020 sayYES_LOUD; /* Success! */
3023 PL_reginput = locinput;
3028 if (UTF) { /* XXXX This is absolutely
3029 broken, we read before
3031 s = HOPMAYBEc(locinput, -scan->flags);
3037 if (locinput < PL_bostr + scan->flags)
3039 PL_reginput = locinput - scan->flags;
3044 PL_reginput = locinput;
3049 if (UTF) { /* XXXX This is absolutely
3050 broken, we read before
3052 s = HOPMAYBEc(locinput, -scan->flags);
3053 if (!s || s < PL_bostr)
3058 if (locinput < PL_bostr + scan->flags)
3060 PL_reginput = locinput - scan->flags;
3065 PL_reginput = locinput;
3068 inner = NEXTOPER(NEXTOPER(scan));
3069 if (regmatch(inner) != n) {
3084 if (OP(scan) == SUSPEND) {
3085 locinput = PL_reginput;
3086 nextchr = UCHARAT(locinput);
3091 next = scan + ARG(scan);
3096 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3097 (UV)scan, OP(scan));
3098 Perl_croak(aTHX_ "regexp memory corruption");
3104 * We get here only if there's trouble -- normally "case END" is
3105 * the terminating point.
3107 Perl_croak(aTHX_ "corrupted regexp pointers");
3113 PerlIO_printf(Perl_debug_log,
3114 "%*s %scould match...%s\n",
3115 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3119 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3120 PL_colors[4],PL_colors[5]));
3129 PerlIO_printf(Perl_debug_log,
3130 "%*s %sfailed...%s\n",
3131 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3143 - regrepeat - repeatedly match something simple, report how many
3146 * [This routine now assumes that it will only match on things of length 1.
3147 * That was true before, but now we assume scan - reginput is the count,
3148 * rather than incrementing count on every character. [Er, except utf8.]]
3151 S_regrepeat(pTHX_ regnode *p, I32 max)
3154 register char *scan;
3156 register char *loceol = PL_regeol;
3157 register I32 hardcount = 0;
3160 if (max != REG_INFTY && max < loceol - scan)
3161 loceol = scan + max;
3164 while (scan < loceol && *scan != '\n')
3172 while (scan < loceol && *scan != '\n') {
3173 scan += UTF8SKIP(scan);
3179 while (scan < loceol) {
3180 scan += UTF8SKIP(scan);
3184 case EXACT: /* length of string is 1 */
3186 while (scan < loceol && UCHARAT(scan) == c)
3189 case EXACTF: /* length of string is 1 */
3191 while (scan < loceol &&
3192 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3195 case EXACTFL: /* length of string is 1 */
3196 PL_reg_flags |= RF_tainted;
3198 while (scan < loceol &&
3199 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3204 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3205 scan += UTF8SKIP(scan);
3210 while (scan < loceol && REGINCLASS(p, *scan))
3214 while (scan < loceol && isALNUM(*scan))
3219 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3220 scan += UTF8SKIP(scan);
3225 PL_reg_flags |= RF_tainted;
3226 while (scan < loceol && isALNUM_LC(*scan))
3230 PL_reg_flags |= RF_tainted;
3232 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3233 scan += UTF8SKIP(scan);
3239 while (scan < loceol && !isALNUM(*scan))
3244 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3245 scan += UTF8SKIP(scan);
3250 PL_reg_flags |= RF_tainted;
3251 while (scan < loceol && !isALNUM_LC(*scan))
3255 PL_reg_flags |= RF_tainted;
3257 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3258 scan += UTF8SKIP(scan);
3263 while (scan < loceol && isSPACE(*scan))
3268 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3269 scan += UTF8SKIP(scan);
3274 PL_reg_flags |= RF_tainted;
3275 while (scan < loceol && isSPACE_LC(*scan))
3279 PL_reg_flags |= RF_tainted;
3281 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3282 scan += UTF8SKIP(scan);
3287 while (scan < loceol && !isSPACE(*scan))
3292 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3293 scan += UTF8SKIP(scan);
3298 PL_reg_flags |= RF_tainted;
3299 while (scan < loceol && !isSPACE_LC(*scan))
3303 PL_reg_flags |= RF_tainted;
3305 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3306 scan += UTF8SKIP(scan);
3311 while (scan < loceol && isDIGIT(*scan))
3316 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3317 scan += UTF8SKIP(scan);
3323 while (scan < loceol && !isDIGIT(*scan))
3328 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3329 scan += UTF8SKIP(scan);
3333 default: /* Called on something of 0 width. */
3334 break; /* So match right here or not at all. */
3340 c = scan - PL_reginput;
3345 SV *prop = sv_newmortal();
3348 PerlIO_printf(Perl_debug_log,
3349 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3350 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3357 - regrepeat_hard - repeatedly match something, report total lenth and length
3359 * The repeater is supposed to have constant length.
3363 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3366 register char *scan;
3367 register char *start;
3368 register char *loceol = PL_regeol;
3370 I32 count = 0, res = 1;
3375 start = PL_reginput;
3377 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3380 while (start < PL_reginput) {
3382 start += UTF8SKIP(start);
3393 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3395 *lp = l = PL_reginput - start;
3396 if (max != REG_INFTY && l*max < loceol - scan)
3397 loceol = scan + l*max;
3410 - reginclass - determine if a character falls into a character class
3414 S_reginclass(pTHX_ register regnode *p, register I32 c)
3417 char flags = ANYOF_FLAGS(p);
3421 if (ANYOF_BITMAP_TEST(p, c))
3423 else if (flags & ANYOF_FOLD) {
3425 if (flags & ANYOF_LOCALE) {
3426 PL_reg_flags |= RF_tainted;
3427 cf = PL_fold_locale[c];
3431 if (ANYOF_BITMAP_TEST(p, cf))
3435 if (!match && (flags & ANYOF_CLASS)) {
3436 PL_reg_flags |= RF_tainted;
3438 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3439 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3440 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3441 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3442 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3443 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3444 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3445 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3446 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3447 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3448 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3449 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3450 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3451 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3452 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3453 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3454 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3455 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3456 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3457 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3458 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3459 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3460 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3461 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3462 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3463 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3464 ) /* How's that for a conditional? */
3470 return (flags & ANYOF_INVERT) ? !match : match;
3474 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3477 char flags = ARG1(f);
3479 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3481 if (swash_fetch(sv, p))
3483 else if (flags & ANYOF_FOLD) {
3486 if (flags & ANYOF_LOCALE) {
3487 PL_reg_flags |= RF_tainted;
3488 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3491 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3492 if (swash_fetch(sv, tmpbuf))
3496 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3498 return (flags & ANYOF_INVERT) ? !match : match;
3502 S_reghop(pTHX_ U8 *s, I32 off)
3506 while (off-- && s < (U8*)PL_regeol)
3511 if (s > (U8*)PL_bostr) {
3514 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3516 } /* XXX could check well-formedness here */
3524 S_reghopmaybe(pTHX_ U8* s, I32 off)
3528 while (off-- && s < (U8*)PL_regeol)
3535 if (s > (U8*)PL_bostr) {
3538 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3540 } /* XXX could check well-formedness here */
3557 restore_pos(pTHXo_ void *arg)
3560 if (PL_reg_eval_set) {
3561 if (PL_reg_oldsaved) {
3562 PL_reg_re->subbeg = PL_reg_oldsaved;
3563 PL_reg_re->sublen = PL_reg_oldsavedlen;
3564 RX_MATCH_COPIED_on(PL_reg_re);
3566 PL_reg_magic->mg_len = PL_reg_oldpos;
3567 PL_reg_eval_set = 0;
3568 PL_curpm = PL_reg_oldcurpm;