5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
120 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
121 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
122 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
126 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
128 static void restore_pos(pTHXo_ void *arg);
131 S_regcppush(pTHX_ I32 parenfloor)
133 int retval = PL_savestack_ix;
134 #define REGCP_PAREN_ELEMS 4
135 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
138 #define REGCP_OTHER_ELEMS 5
139 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
140 for (p = PL_regsize; p > parenfloor; p--) {
141 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
142 SSPUSHINT(PL_regendp[p]);
143 SSPUSHINT(PL_regstartp[p]);
144 SSPUSHPTR(PL_reg_start_tmp[p]);
147 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
148 SSPUSHINT(PL_regsize);
149 SSPUSHINT(*PL_reglastparen);
150 SSPUSHPTR(PL_reginput);
151 #define REGCP_FRAME_ELEMS 2
152 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
153 * are needed for the regexp context stack bookkeeping. */
154 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
155 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
160 /* These are needed since we do not localize EVAL nodes: */
161 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
162 " Setting an EVAL scope, savestack=%"IVdf"\n", \
163 (IV)PL_savestack_ix)); cp = PL_savestack_ix
165 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
166 PerlIO_printf(Perl_debug_log, \
167 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
168 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
178 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
180 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
181 i = SSPOPINT; /* Parentheses elements to pop. */
182 input = (char *) SSPOPPTR;
183 *PL_reglastparen = SSPOPINT;
184 PL_regsize = SSPOPINT;
186 /* Now restore the parentheses context. */
187 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
188 i > 0; i -= REGCP_PAREN_ELEMS) {
189 paren = (U32)SSPOPINT;
190 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
191 PL_regstartp[paren] = SSPOPINT;
193 if (paren <= *PL_reglastparen)
194 PL_regendp[paren] = tmps;
196 PerlIO_printf(Perl_debug_log,
197 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
198 (UV)paren, (IV)PL_regstartp[paren],
199 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
200 (IV)PL_regendp[paren],
201 (paren > *PL_reglastparen ? "(no)" : ""));
205 if (*PL_reglastparen + 1 <= PL_regnpar) {
206 PerlIO_printf(Perl_debug_log,
207 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
208 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
212 /* It would seem that the similar code in regtry()
213 * already takes care of this, and in fact it is in
214 * a better location to since this code can #if 0-ed out
215 * but the code in regtry() is needed or otherwise tests
216 * requiring null fields (pat.t#187 and split.t#{13,14}
217 * (as of patchlevel 7877) will fail. Then again,
218 * this code seems to be necessary or otherwise
219 * building DynaLoader will fail:
220 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
222 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
223 if (paren > PL_regsize)
224 PL_regstartp[paren] = -1;
225 PL_regendp[paren] = -1;
232 S_regcp_set_to(pTHX_ I32 ss)
234 I32 tmp = PL_savestack_ix;
236 PL_savestack_ix = ss;
238 PL_savestack_ix = tmp;
242 typedef struct re_cc_state
246 struct re_cc_state *prev;
251 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
253 #define TRYPAREN(paren, n, input) { \
256 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
257 PL_regendp[paren] = input - PL_bostr; \
260 PL_regendp[paren] = -1; \
262 if (regmatch(next)) \
265 PL_regendp[paren] = -1; \
270 * pregexec and friends
274 - pregexec - match a regexp against a string
277 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
278 char *strbeg, I32 minend, SV *screamer, U32 nosave)
279 /* strend: pointer to null at end of string */
280 /* strbeg: real beginning of string */
281 /* minend: end of match must be >=minend after stringarg. */
282 /* nosave: For optimizations. */
285 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
286 nosave ? 0 : REXEC_COPY_STR);
290 S_cache_re(pTHX_ regexp *prog)
292 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
294 PL_regprogram = prog->program;
296 PL_regnpar = prog->nparens;
297 PL_regdata = prog->data;
302 * Need to implement the following flags for reg_anch:
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
315 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
317 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
320 /* XXXX We assume that strpos is strbeg unless sv. */
322 /* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
329 /* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
332 regtry() should be needed.
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
339 We use a)b)d) and multiline-part of c), and try to find a position in the
340 string which does not contradict any of them.
343 /* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
348 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
351 register I32 start_shift;
352 /* Should be nonnegative! */
353 register I32 end_shift;
360 register char *other_last = Nullch; /* other substr checked before this */
361 char *check_at; /* check substr found at this pos */
363 char *i_strpos = strpos;
366 DEBUG_r( if (!PL_colorset) reginitcolors() );
367 DEBUG_r(PerlIO_printf(Perl_debug_log,
368 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
369 PL_colors[4],PL_colors[5],PL_colors[0],
372 (strlen(prog->precomp) > 60 ? "..." : ""),
374 (int)(strend - strpos > 60 ? 60 : strend - strpos),
375 strpos, PL_colors[1],
376 (strend - strpos > 60 ? "..." : ""))
379 if (prog->reganch & ROPT_UTF8)
380 PL_reg_flags |= RF_utf8;
382 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
383 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
386 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
388 check = prog->check_substr;
389 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
390 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
391 || ( (prog->reganch & ROPT_ANCH_BOL)
392 && !PL_multiline ) ); /* Check after \n? */
394 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
395 /* Substring at constant offset from beg-of-str... */
398 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
399 /* SvCUR is not set on references: SvRV and SvPVX overlap */
401 && (strpos != strbeg)) {
402 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
405 PL_regeol = strend; /* Used in HOP() */
406 s = HOP3c(strpos, prog->check_offset_min, strend);
408 slen = SvCUR(check); /* >= 1 */
410 if ( strend - s > slen || strend - s < slen - 1
411 || (strend - s == slen && strend[-1] != '\n')) {
412 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
415 /* Now should match s[0..slen-2] */
417 if (slen && (*SvPVX(check) != *s
419 && memNE(SvPVX(check), s, slen)))) {
421 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
425 else if (*SvPVX(check) != *s
426 || ((slen = SvCUR(check)) > 1
427 && memNE(SvPVX(check), s, slen)))
429 goto success_at_start;
431 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
433 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
434 end_shift = prog->minlen - start_shift -
435 CHR_SVLEN(check) + (SvTAIL(check) != 0);
437 I32 end = prog->check_offset_max + CHR_SVLEN(check)
438 - (SvTAIL(check) != 0);
439 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
441 if (end_shift < eshift)
445 else { /* Can match at random position */
448 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
449 /* Should be nonnegative! */
450 end_shift = prog->minlen - start_shift -
451 CHR_SVLEN(check) + (SvTAIL(check) != 0);
454 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
456 Perl_croak(aTHX_ "panic: end_shift");
460 /* Find a possible match in the region s..strend by looking for
461 the "check" substring in the region corrected by start/end_shift. */
462 if (flags & REXEC_SCREAM) {
463 I32 p = -1; /* Internal iterator of scream. */
464 I32 *pp = data ? data->scream_pos : &p;
466 if (PL_screamfirst[BmRARE(check)] >= 0
467 || ( BmRARE(check) == '\n'
468 && (BmPREVIOUS(check) == SvCUR(check) - 1)
470 s = screaminstr(sv, check,
471 start_shift + (s - strbeg), end_shift, pp, 0);
475 *data->scream_olds = s;
477 else if (prog->reganch & ROPT_SANY_SEEN)
478 s = fbm_instr((U8*)(s + start_shift),
479 (U8*)(strend - end_shift),
480 check, PL_multiline ? FBMrf_MULTILINE : 0);
482 s = fbm_instr(HOP3(s, start_shift, strend),
483 HOP3(strend, -end_shift, strbeg),
484 check, PL_multiline ? FBMrf_MULTILINE : 0);
486 /* Update the count-of-usability, remove useless subpatterns,
489 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
490 (s ? "Found" : "Did not find"),
491 ((check == prog->anchored_substr) ? "anchored" : "floating"),
493 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
495 PL_colors[1], (SvTAIL(check) ? "$" : ""),
496 (s ? " at offset " : "...\n") ) );
503 /* Finish the diagnostic message */
504 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
506 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
507 Start with the other substr.
508 XXXX no SCREAM optimization yet - and a very coarse implementation
509 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
510 *always* match. Probably should be marked during compile...
511 Probably it is right to do no SCREAM here...
514 if (prog->float_substr && prog->anchored_substr) {
515 /* Take into account the "other" substring. */
516 /* XXXX May be hopelessly wrong for UTF... */
519 if (check == prog->float_substr) {
522 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
525 t = s - prog->check_offset_max;
526 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
527 && (!(prog->reganch & ROPT_UTF8)
528 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
533 t = HOP3c(t, prog->anchored_offset, strend);
534 if (t < other_last) /* These positions already checked */
536 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
539 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
540 /* On end-of-str: see comment below. */
541 s = fbm_instr((unsigned char*)t,
542 HOP3(HOP3(last1, prog->anchored_offset, strend)
543 + SvCUR(prog->anchored_substr),
544 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
545 prog->anchored_substr,
546 PL_multiline ? FBMrf_MULTILINE : 0);
547 DEBUG_r(PerlIO_printf(Perl_debug_log,
548 "%s anchored substr `%s%.*s%s'%s",
549 (s ? "Found" : "Contradicts"),
551 (int)(SvCUR(prog->anchored_substr)
552 - (SvTAIL(prog->anchored_substr)!=0)),
553 SvPVX(prog->anchored_substr),
554 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
556 if (last1 >= last2) {
557 DEBUG_r(PerlIO_printf(Perl_debug_log,
558 ", giving up...\n"));
561 DEBUG_r(PerlIO_printf(Perl_debug_log,
562 ", trying floating at offset %ld...\n",
563 (long)(HOP3c(s1, 1, strend) - i_strpos)));
564 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
565 s = HOP3c(last, 1, strend);
569 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
570 (long)(s - i_strpos)));
571 t = HOP3c(s, -prog->anchored_offset, strbeg);
572 other_last = HOP3c(s, 1, strend);
580 else { /* Take into account the floating substring. */
584 t = HOP3c(s, -start_shift, strbeg);
586 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
587 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
588 last = HOP3c(t, prog->float_max_offset, strend);
589 s = HOP3c(t, prog->float_min_offset, strend);
592 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
593 /* fbm_instr() takes into account exact value of end-of-str
594 if the check is SvTAIL(ed). Since false positives are OK,
595 and end-of-str is not later than strend we are OK. */
596 s = fbm_instr((unsigned char*)s,
597 (unsigned char*)last + SvCUR(prog->float_substr)
598 - (SvTAIL(prog->float_substr)!=0),
599 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
600 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
601 (s ? "Found" : "Contradicts"),
603 (int)(SvCUR(prog->float_substr)
604 - (SvTAIL(prog->float_substr)!=0)),
605 SvPVX(prog->float_substr),
606 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
609 DEBUG_r(PerlIO_printf(Perl_debug_log,
610 ", giving up...\n"));
613 DEBUG_r(PerlIO_printf(Perl_debug_log,
614 ", trying anchored starting at offset %ld...\n",
615 (long)(s1 + 1 - i_strpos)));
617 s = HOP3c(t, 1, strend);
621 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
622 (long)(s - i_strpos)));
623 other_last = s; /* Fix this later. --Hugo */
632 t = s - prog->check_offset_max;
633 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
634 && (!(prog->reganch & ROPT_UTF8)
635 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
637 /* Fixed substring is found far enough so that the match
638 cannot start at strpos. */
640 if (ml_anch && t[-1] != '\n') {
641 /* Eventually fbm_*() should handle this, but often
642 anchored_offset is not 0, so this check will not be wasted. */
643 /* XXXX In the code below we prefer to look for "^" even in
644 presence of anchored substrings. And we search even
645 beyond the found float position. These pessimizations
646 are historical artefacts only. */
648 while (t < strend - prog->minlen) {
650 if (t < check_at - prog->check_offset_min) {
651 if (prog->anchored_substr) {
652 /* Since we moved from the found position,
653 we definitely contradict the found anchored
654 substr. Due to the above check we do not
655 contradict "check" substr.
656 Thus we can arrive here only if check substr
657 is float. Redo checking for "other"=="fixed".
660 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
661 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
662 goto do_other_anchored;
664 /* We don't contradict the found floating substring. */
665 /* XXXX Why not check for STCLASS? */
667 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
668 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
671 /* Position contradicts check-string */
672 /* XXXX probably better to look for check-string
673 than for "\n", so one should lower the limit for t? */
674 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
675 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
676 other_last = strpos = s = t + 1;
681 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
682 PL_colors[0],PL_colors[1]));
686 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
687 PL_colors[0],PL_colors[1]));
691 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
694 /* The found string does not prohibit matching at strpos,
695 - no optimization of calling REx engine can be performed,
696 unless it was an MBOL and we are not after MBOL,
697 or a future STCLASS check will fail this. */
699 /* Even in this situation we may use MBOL flag if strpos is offset
700 wrt the start of the string. */
701 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
702 && (strpos != strbeg) && strpos[-1] != '\n'
703 /* May be due to an implicit anchor of m{.*foo} */
704 && !(prog->reganch & ROPT_IMPLICIT))
709 DEBUG_r( if (ml_anch)
710 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
711 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
714 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
715 && prog->check_substr /* Could be deleted already */
716 && --BmUSEFUL(prog->check_substr) < 0
717 && prog->check_substr == prog->float_substr)
719 /* If flags & SOMETHING - do not do it many times on the same match */
720 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
721 SvREFCNT_dec(prog->check_substr);
722 prog->check_substr = Nullsv; /* disable */
723 prog->float_substr = Nullsv; /* clear */
724 check = Nullsv; /* abort */
726 /* XXXX This is a remnant of the old implementation. It
727 looks wasteful, since now INTUIT can use many
729 prog->reganch &= ~RE_USE_INTUIT;
736 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
737 if (prog->regstclass) {
738 /* minlen == 0 is possible if regstclass is \b or \B,
739 and the fixed substr is ''$.
740 Since minlen is already taken into account, s+1 is before strend;
741 accidentally, minlen >= 1 guaranties no false positives at s + 1
742 even for \b or \B. But (minlen? 1 : 0) below assumes that
743 regstclass does not come from lookahead... */
744 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
745 This leaves EXACTF only, which is dealt with in find_byclass(). */
746 U8* str = (U8*)STRING(prog->regstclass);
747 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
748 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
750 char *endpos = (prog->anchored_substr || ml_anch)
751 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
752 : (prog->float_substr
753 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
756 char *startpos = strbeg;
759 if (prog->reganch & ROPT_UTF8) {
760 PL_regdata = prog->data;
763 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
768 if (endpos == strend) {
769 DEBUG_r( PerlIO_printf(Perl_debug_log,
770 "Could not match STCLASS...\n") );
773 DEBUG_r( PerlIO_printf(Perl_debug_log,
774 "This position contradicts STCLASS...\n") );
775 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
777 /* Contradict one of substrings */
778 if (prog->anchored_substr) {
779 if (prog->anchored_substr == check) {
780 DEBUG_r( what = "anchored" );
782 s = HOP3c(t, 1, strend);
783 if (s + start_shift + end_shift > strend) {
784 /* XXXX Should be taken into account earlier? */
785 DEBUG_r( PerlIO_printf(Perl_debug_log,
786 "Could not match STCLASS...\n") );
791 DEBUG_r( PerlIO_printf(Perl_debug_log,
792 "Looking for %s substr starting at offset %ld...\n",
793 what, (long)(s + start_shift - i_strpos)) );
796 /* Have both, check_string is floating */
797 if (t + start_shift >= check_at) /* Contradicts floating=check */
798 goto retry_floating_check;
799 /* Recheck anchored substring, but not floating... */
803 DEBUG_r( PerlIO_printf(Perl_debug_log,
804 "Looking for anchored substr starting at offset %ld...\n",
805 (long)(other_last - i_strpos)) );
806 goto do_other_anchored;
808 /* Another way we could have checked stclass at the
809 current position only: */
814 DEBUG_r( PerlIO_printf(Perl_debug_log,
815 "Looking for /%s^%s/m starting at offset %ld...\n",
816 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
819 if (!prog->float_substr) /* Could have been deleted */
821 /* Check is floating subtring. */
822 retry_floating_check:
823 t = check_at - start_shift;
824 DEBUG_r( what = "floating" );
825 goto hop_and_restart;
828 PerlIO_printf(Perl_debug_log,
829 "By STCLASS: moving %ld --> %ld\n",
830 (long)(t - i_strpos), (long)(s - i_strpos));
832 PerlIO_printf(Perl_debug_log,
833 "Does not contradict STCLASS...\n") );
836 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
837 PL_colors[4], (check ? "Guessed" : "Giving up"),
838 PL_colors[5], (long)(s - i_strpos)) );
841 fail_finish: /* Substring not found */
842 if (prog->check_substr) /* could be removed already */
843 BmUSEFUL(prog->check_substr) += 5; /* hooray */
845 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
846 PL_colors[4],PL_colors[5]));
850 /* We know what class REx starts with. Try to find this position... */
852 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
854 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
860 register I32 tmp = 1; /* Scratch variable? */
861 register bool do_utf8 = DO_UTF8(PL_reg_sv);
863 /* We know what class it must start with. */
867 if (reginclass(c, (U8*)s, do_utf8)) {
868 if (tmp && (norun || regtry(prog, s)))
875 s += do_utf8 ? UTF8SKIP(s) : 1;
882 c1 = to_utf8_lower((U8*)m);
883 c2 = to_utf8_upper((U8*)m);
894 c2 = PL_fold_locale[c1];
899 e = s; /* Due to minlen logic of intuit() */
905 if ( utf8_to_uvchr((U8*)s, &len) == c1
912 UV c = utf8_to_uvchr((U8*)s, &len);
913 if ( (c == c1 || c == c2) && regtry(prog, s) )
922 && (ln == 1 || !(OP(c) == EXACTF
924 : ibcmp_locale(s, m, ln)))
925 && (norun || regtry(prog, s)) )
931 if ( (*(U8*)s == c1 || *(U8*)s == c2)
932 && (ln == 1 || !(OP(c) == EXACTF
934 : ibcmp_locale(s, m, ln)))
935 && (norun || regtry(prog, s)) )
942 PL_reg_flags |= RF_tainted;
949 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
951 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
953 tmp = ((OP(c) == BOUND ?
954 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
955 LOAD_UTF8_CHARCLASS(alnum,"a");
957 if (tmp == !(OP(c) == BOUND ?
958 swash_fetch(PL_utf8_alnum, (U8*)s) :
959 isALNUM_LC_utf8((U8*)s)))
962 if ((norun || regtry(prog, s)))
969 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
970 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
973 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
975 if ((norun || regtry(prog, s)))
981 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
985 PL_reg_flags |= RF_tainted;
992 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
994 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
996 tmp = ((OP(c) == NBOUND ?
997 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
998 LOAD_UTF8_CHARCLASS(alnum,"a");
1000 if (tmp == !(OP(c) == NBOUND ?
1001 swash_fetch(PL_utf8_alnum, (U8*)s) :
1002 isALNUM_LC_utf8((U8*)s)))
1004 else if ((norun || regtry(prog, s)))
1010 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
1011 tmp = ((OP(c) == NBOUND ?
1012 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1013 while (s < strend) {
1015 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1017 else if ((norun || regtry(prog, s)))
1022 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1027 LOAD_UTF8_CHARCLASS(alnum,"a");
1028 while (s < strend) {
1029 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1030 if (tmp && (norun || regtry(prog, s)))
1041 while (s < strend) {
1043 if (tmp && (norun || regtry(prog, s)))
1055 PL_reg_flags |= RF_tainted;
1057 while (s < strend) {
1058 if (isALNUM_LC_utf8((U8*)s)) {
1059 if (tmp && (norun || regtry(prog, s)))
1070 while (s < strend) {
1071 if (isALNUM_LC(*s)) {
1072 if (tmp && (norun || regtry(prog, s)))
1085 LOAD_UTF8_CHARCLASS(alnum,"a");
1086 while (s < strend) {
1087 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1088 if (tmp && (norun || regtry(prog, s)))
1099 while (s < strend) {
1101 if (tmp && (norun || regtry(prog, s)))
1113 PL_reg_flags |= RF_tainted;
1115 while (s < strend) {
1116 if (!isALNUM_LC_utf8((U8*)s)) {
1117 if (tmp && (norun || regtry(prog, s)))
1128 while (s < strend) {
1129 if (!isALNUM_LC(*s)) {
1130 if (tmp && (norun || regtry(prog, s)))
1143 LOAD_UTF8_CHARCLASS(space," ");
1144 while (s < strend) {
1145 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1146 if (tmp && (norun || regtry(prog, s)))
1157 while (s < strend) {
1159 if (tmp && (norun || regtry(prog, s)))
1171 PL_reg_flags |= RF_tainted;
1173 while (s < strend) {
1174 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1175 if (tmp && (norun || regtry(prog, s)))
1186 while (s < strend) {
1187 if (isSPACE_LC(*s)) {
1188 if (tmp && (norun || regtry(prog, s)))
1201 LOAD_UTF8_CHARCLASS(space," ");
1202 while (s < strend) {
1203 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1204 if (tmp && (norun || regtry(prog, s)))
1215 while (s < strend) {
1217 if (tmp && (norun || regtry(prog, s)))
1229 PL_reg_flags |= RF_tainted;
1231 while (s < strend) {
1232 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1233 if (tmp && (norun || regtry(prog, s)))
1244 while (s < strend) {
1245 if (!isSPACE_LC(*s)) {
1246 if (tmp && (norun || regtry(prog, s)))
1259 LOAD_UTF8_CHARCLASS(digit,"0");
1260 while (s < strend) {
1261 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1273 while (s < strend) {
1275 if (tmp && (norun || regtry(prog, s)))
1287 PL_reg_flags |= RF_tainted;
1289 while (s < strend) {
1290 if (isDIGIT_LC_utf8((U8*)s)) {
1291 if (tmp && (norun || regtry(prog, s)))
1302 while (s < strend) {
1303 if (isDIGIT_LC(*s)) {
1304 if (tmp && (norun || regtry(prog, s)))
1317 LOAD_UTF8_CHARCLASS(digit,"0");
1318 while (s < strend) {
1319 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1331 while (s < strend) {
1333 if (tmp && (norun || regtry(prog, s)))
1345 PL_reg_flags |= RF_tainted;
1347 while (s < strend) {
1348 if (!isDIGIT_LC_utf8((U8*)s)) {
1349 if (tmp && (norun || regtry(prog, s)))
1360 while (s < strend) {
1361 if (!isDIGIT_LC(*s)) {
1362 if (tmp && (norun || regtry(prog, s)))
1374 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1383 - regexec_flags - match a regexp against a string
1386 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1387 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1388 /* strend: pointer to null at end of string */
1389 /* strbeg: real beginning of string */
1390 /* minend: end of match must be >=minend after stringarg. */
1391 /* data: May be used for some additional optimizations. */
1392 /* nosave: For optimizations. */
1395 register regnode *c;
1396 register char *startpos = stringarg;
1397 I32 minlen; /* must match at least this many chars */
1398 I32 dontbother = 0; /* how many characters not to try at end */
1399 /* I32 start_shift = 0; */ /* Offset of the start to find
1400 constant substr. */ /* CC */
1401 I32 end_shift = 0; /* Same for the end. */ /* CC */
1402 I32 scream_pos = -1; /* Internal iterator of scream. */
1404 SV* oreplsv = GvSV(PL_replgv);
1405 bool do_utf8 = DO_UTF8(sv);
1411 PL_regnarrate = DEBUG_r_TEST;
1414 /* Be paranoid... */
1415 if (prog == NULL || startpos == NULL) {
1416 Perl_croak(aTHX_ "NULL regexp parameter");
1420 minlen = prog->minlen;
1422 if (!(prog->reganch & ROPT_SANY_SEEN))
1423 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1426 if (strend - startpos < minlen) goto phooey;
1429 if (startpos == strbeg) /* is ^ valid at stringarg? */
1432 if (prog->reganch & ROPT_UTF8 && do_utf8) {
1433 U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
1434 PL_regprev = utf8n_to_uvchr(s, (U8*)stringarg - s, NULL, 0);
1437 PL_regprev = (U32)stringarg[-1];
1438 if (!PL_multiline && PL_regprev == '\n')
1439 PL_regprev = '\0'; /* force ^ to NOT match */
1442 /* Check validity of program. */
1443 if (UCHARAT(prog->program) != REG_MAGIC) {
1444 Perl_croak(aTHX_ "corrupted regexp program");
1448 PL_reg_eval_set = 0;
1451 if (prog->reganch & ROPT_UTF8)
1452 PL_reg_flags |= RF_utf8;
1454 /* Mark beginning of line for ^ and lookbehind. */
1455 PL_regbol = startpos;
1459 /* Mark end of line for $ (and such) */
1462 /* see how far we have to get to not match where we matched before */
1463 PL_regtill = startpos+minend;
1465 /* We start without call_cc context. */
1468 /* If there is a "must appear" string, look for it. */
1471 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1474 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1475 PL_reg_ganch = startpos;
1476 else if (sv && SvTYPE(sv) >= SVt_PVMG
1478 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1479 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1480 if (prog->reganch & ROPT_ANCH_GPOS) {
1481 if (s > PL_reg_ganch)
1486 else /* pos() not defined */
1487 PL_reg_ganch = strbeg;
1490 if (do_utf8 == (UTF!=0) &&
1491 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1492 re_scream_pos_data d;
1494 d.scream_olds = &scream_olds;
1495 d.scream_pos = &scream_pos;
1496 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1498 goto phooey; /* not present */
1501 DEBUG_r( if (!PL_colorset) reginitcolors() );
1502 DEBUG_r(PerlIO_printf(Perl_debug_log,
1503 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1504 PL_colors[4],PL_colors[5],PL_colors[0],
1507 (strlen(prog->precomp) > 60 ? "..." : ""),
1509 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1510 startpos, PL_colors[1],
1511 (strend - startpos > 60 ? "..." : ""))
1514 /* Simplest case: anchored match need be tried only once. */
1515 /* [unless only anchor is BOL and multiline is set] */
1516 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1517 if (s == startpos && regtry(prog, startpos))
1519 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1520 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1525 dontbother = minlen - 1;
1526 end = HOP3c(strend, -dontbother, strbeg) - 1;
1527 /* for multiline we only have to try after newlines */
1528 if (prog->check_substr) {
1532 if (regtry(prog, s))
1537 if (prog->reganch & RE_USE_INTUIT) {
1538 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1549 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1550 if (regtry(prog, s))
1557 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1558 if (regtry(prog, PL_reg_ganch))
1563 /* Messy cases: unanchored match. */
1564 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1565 /* we have /x+whatever/ */
1566 /* it must be a one character string (XXXX Except UTF?) */
1567 char ch = SvPVX(prog->anchored_substr)[0];
1573 while (s < strend) {
1575 DEBUG_r( did_match = 1 );
1576 if (regtry(prog, s)) goto got_it;
1578 while (s < strend && *s == ch)
1585 while (s < strend) {
1587 DEBUG_r( did_match = 1 );
1588 if (regtry(prog, s)) goto got_it;
1590 while (s < strend && *s == ch)
1596 DEBUG_r(did_match ||
1597 PerlIO_printf(Perl_debug_log,
1598 "Did not find anchored character...\n"));
1601 else if (do_utf8 == (UTF!=0) &&
1602 (prog->anchored_substr != Nullsv
1603 || (prog->float_substr != Nullsv
1604 && prog->float_max_offset < strend - s))) {
1605 SV *must = prog->anchored_substr
1606 ? prog->anchored_substr : prog->float_substr;
1608 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1610 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1611 char *last = HOP3c(strend, /* Cannot start after this */
1612 -(I32)(CHR_SVLEN(must)
1613 - (SvTAIL(must) != 0) + back_min), strbeg);
1614 char *last1; /* Last position checked before */
1620 last1 = HOPc(s, -1);
1622 last1 = s - 1; /* bogus */
1624 /* XXXX check_substr already used to find `s', can optimize if
1625 check_substr==must. */
1627 dontbother = end_shift;
1628 strend = HOPc(strend, -dontbother);
1629 while ( (s <= last) &&
1630 ((flags & REXEC_SCREAM)
1631 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1632 end_shift, &scream_pos, 0))
1633 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1634 (unsigned char*)strend, must,
1635 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1636 DEBUG_r( did_match = 1 );
1637 if (HOPc(s, -back_max) > last1) {
1638 last1 = HOPc(s, -back_min);
1639 s = HOPc(s, -back_max);
1642 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1644 last1 = HOPc(s, -back_min);
1648 while (s <= last1) {
1649 if (regtry(prog, s))
1655 while (s <= last1) {
1656 if (regtry(prog, s))
1662 DEBUG_r(did_match ||
1663 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1664 ((must == prog->anchored_substr)
1665 ? "anchored" : "floating"),
1667 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1669 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1672 else if ((c = prog->regstclass)) {
1673 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1674 /* don't bother with what can't match */
1675 strend = HOPc(strend, -(minlen - 1));
1677 SV *prop = sv_newmortal();
1679 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1681 if (find_byclass(prog, c, s, strend, startpos, 0))
1683 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1687 if (prog->float_substr != Nullsv) { /* Trim the end. */
1690 if (flags & REXEC_SCREAM) {
1691 last = screaminstr(sv, prog->float_substr, s - strbeg,
1692 end_shift, &scream_pos, 1); /* last one */
1694 last = scream_olds; /* Only one occurrence. */
1698 char *little = SvPV(prog->float_substr, len);
1700 if (SvTAIL(prog->float_substr)) {
1701 if (memEQ(strend - len + 1, little, len - 1))
1702 last = strend - len + 1;
1703 else if (!PL_multiline)
1704 last = memEQ(strend - len, little, len)
1705 ? strend - len : Nullch;
1711 last = rninstr(s, strend, little, little + len);
1713 last = strend; /* matching `$' */
1717 DEBUG_r(PerlIO_printf(Perl_debug_log,
1718 "%sCan't trim the tail, match fails (should not happen)%s\n",
1719 PL_colors[4],PL_colors[5]));
1720 goto phooey; /* Should not happen! */
1722 dontbother = strend - last + prog->float_min_offset;
1724 if (minlen && (dontbother < minlen))
1725 dontbother = minlen - 1;
1726 strend -= dontbother; /* this one's always in bytes! */
1727 /* We don't know much -- general case. */
1730 if (regtry(prog, s))
1739 if (regtry(prog, s))
1741 } while (s++ < strend);
1749 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1751 if (PL_reg_eval_set) {
1752 /* Preserve the current value of $^R */
1753 if (oreplsv != GvSV(PL_replgv))
1754 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1755 restored, the value remains
1757 restore_pos(aTHXo_ 0);
1760 /* make sure $`, $&, $', and $digit will work later */
1761 if ( !(flags & REXEC_NOT_FIRST) ) {
1762 if (RX_MATCH_COPIED(prog)) {
1763 Safefree(prog->subbeg);
1764 RX_MATCH_COPIED_off(prog);
1766 if (flags & REXEC_COPY_STR) {
1767 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1769 s = savepvn(strbeg, i);
1772 RX_MATCH_COPIED_on(prog);
1775 prog->subbeg = strbeg;
1776 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1783 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1784 PL_colors[4],PL_colors[5]));
1785 if (PL_reg_eval_set)
1786 restore_pos(aTHXo_ 0);
1791 - regtry - try match at specific point
1793 STATIC I32 /* 0 failure, 1 success */
1794 S_regtry(pTHX_ regexp *prog, char *startpos)
1802 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1804 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1807 PL_reg_eval_set = RS_init;
1809 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1810 (IV)(PL_stack_sp - PL_stack_base));
1812 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1813 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1814 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1816 /* Apparently this is not needed, judging by wantarray. */
1817 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1818 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1821 /* Make $_ available to executed code. */
1822 if (PL_reg_sv != DEFSV) {
1823 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1828 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1829 && (mg = mg_find(PL_reg_sv, 'g')))) {
1830 /* prepare for quick setting of pos */
1831 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1832 mg = mg_find(PL_reg_sv, 'g');
1836 PL_reg_oldpos = mg->mg_len;
1837 SAVEDESTRUCTOR_X(restore_pos, 0);
1840 Newz(22,PL_reg_curpm, 1, PMOP);
1841 PL_reg_curpm->op_pmregexp = prog;
1842 PL_reg_oldcurpm = PL_curpm;
1843 PL_curpm = PL_reg_curpm;
1844 if (RX_MATCH_COPIED(prog)) {
1845 /* Here is a serious problem: we cannot rewrite subbeg,
1846 since it may be needed if this match fails. Thus
1847 $` inside (?{}) could fail... */
1848 PL_reg_oldsaved = prog->subbeg;
1849 PL_reg_oldsavedlen = prog->sublen;
1850 RX_MATCH_COPIED_off(prog);
1853 PL_reg_oldsaved = Nullch;
1854 prog->subbeg = PL_bostr;
1855 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1857 prog->startp[0] = startpos - PL_bostr;
1858 PL_reginput = startpos;
1859 PL_regstartp = prog->startp;
1860 PL_regendp = prog->endp;
1861 PL_reglastparen = &prog->lastparen;
1862 prog->lastparen = 0;
1864 DEBUG_r(PL_reg_starttry = startpos);
1865 if (PL_reg_start_tmpl <= prog->nparens) {
1866 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1867 if(PL_reg_start_tmp)
1868 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1870 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1873 /* XXXX What this code is doing here?!!! There should be no need
1874 to do this again and again, PL_reglastparen should take care of
1877 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1878 * Actually, the code in regcppop() (which Ilya may be meaning by
1879 * PL_reglastparen), is not needed at all by the test suite
1880 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1881 * enough, for building DynaLoader, or otherwise this
1882 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1883 * will happen. Meanwhile, this code *is* needed for the
1884 * above-mentioned test suite tests to succeed. The common theme
1885 * on those tests seems to be returning null fields from matches.
1890 if (prog->nparens) {
1891 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1898 if (regmatch(prog->program + 1)) {
1899 prog->endp[0] = PL_reginput - PL_bostr;
1902 REGCP_UNWIND(lastcp);
1906 #define RE_UNWIND_BRANCH 1
1907 #define RE_UNWIND_BRANCHJ 2
1911 typedef struct { /* XX: makes sense to enlarge it... */
1915 } re_unwind_generic_t;
1928 } re_unwind_branch_t;
1930 typedef union re_unwind_t {
1932 re_unwind_generic_t generic;
1933 re_unwind_branch_t branch;
1937 - regmatch - main matching routine
1939 * Conceptually the strategy is simple: check to see whether the current
1940 * node matches, call self recursively to see whether the rest matches,
1941 * and then act accordingly. In practice we make some effort to avoid
1942 * recursion, in particular by going through "ordinary" nodes (that don't
1943 * need to know whether the rest of the match failed) by a loop instead of
1946 /* [lwall] I've hoisted the register declarations to the outer block in order to
1947 * maybe save a little bit of pushing and popping on the stack. It also takes
1948 * advantage of machines that use a register save mask on subroutine entry.
1950 STATIC I32 /* 0 failure, 1 success */
1951 S_regmatch(pTHX_ regnode *prog)
1953 register regnode *scan; /* Current node. */
1954 regnode *next; /* Next node. */
1955 regnode *inner; /* Next node in internal branch. */
1956 register I32 nextchr; /* renamed nextchr - nextchar colides with
1957 function of same name */
1958 register I32 n; /* no or next */
1959 register I32 ln; /* len or last */
1960 register char *s; /* operand or save */
1961 register char *locinput = PL_reginput;
1962 register I32 c1, c2, paren; /* case fold search, parenth */
1963 int minmod = 0, sw = 0, logical = 0;
1965 I32 firstcp = PL_savestack_ix;
1966 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1972 /* Note that nextchr is a byte even in UTF */
1973 nextchr = UCHARAT(locinput);
1975 while (scan != NULL) {
1976 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1978 # define sayYES goto yes
1979 # define sayNO goto no
1980 # define sayYES_FINAL goto yes_final
1981 # define sayYES_LOUD goto yes_loud
1982 # define sayNO_FINAL goto no_final
1983 # define sayNO_SILENT goto do_no
1984 # define saySAME(x) if (x) goto yes; else goto no
1985 # define REPORT_CODE_OFF 24
1987 # define sayYES return 1
1988 # define sayNO return 0
1989 # define sayYES_FINAL return 1
1990 # define sayYES_LOUD return 1
1991 # define sayNO_FINAL return 0
1992 # define sayNO_SILENT return 0
1993 # define saySAME(x) return x
1996 SV *prop = sv_newmortal();
1997 int docolor = *PL_colors[0];
1998 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1999 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2000 /* The part of the string before starttry has one color
2001 (pref0_len chars), between starttry and current
2002 position another one (pref_len - pref0_len chars),
2003 after the current position the third one.
2004 We assume that pref0_len <= pref_len, otherwise we
2005 decrease pref0_len. */
2006 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2007 ? (5 + taill) - l : locinput - PL_bostr;
2010 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2012 pref0_len = pref_len - (locinput - PL_reg_starttry);
2013 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2014 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2015 ? (5 + taill) - pref_len : PL_regeol - locinput);
2016 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2020 if (pref0_len > pref_len)
2021 pref0_len = pref_len;
2022 regprop(prop, scan);
2023 PerlIO_printf(Perl_debug_log,
2024 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2025 (IV)(locinput - PL_bostr),
2026 PL_colors[4], pref0_len,
2027 locinput - pref_len, PL_colors[5],
2028 PL_colors[2], pref_len - pref0_len,
2029 locinput - pref_len + pref0_len, PL_colors[3],
2030 (docolor ? "" : "> <"),
2031 PL_colors[0], l, locinput, PL_colors[1],
2032 15 - l - pref_len + 1,
2034 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2038 next = scan + NEXT_OFF(scan);
2044 if (locinput == PL_bostr
2045 ? PL_regprev == '\n'
2047 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2049 /* regtill = regbol; */
2054 if (locinput == PL_bostr
2055 ? PL_regprev == '\n'
2056 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2062 if (locinput == PL_bostr)
2066 if (locinput == PL_reg_ganch)
2076 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2081 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2083 if (PL_regeol - locinput > 1)
2087 if (PL_regeol != locinput)
2091 if (!nextchr && locinput >= PL_regeol)
2093 nextchr = UCHARAT(++locinput);
2096 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2099 locinput += PL_utf8skip[nextchr];
2100 if (locinput > PL_regeol)
2102 nextchr = UCHARAT(locinput);
2105 nextchr = UCHARAT(++locinput);
2110 if (do_utf8 != (UTF!=0)) {
2118 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2127 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2133 nextchr = UCHARAT(locinput);
2136 /* Inline the first character, for speed. */
2137 if (UCHARAT(s) != nextchr)
2139 if (PL_regeol - locinput < ln)
2141 if (ln > 1 && memNE(s, locinput, ln))
2144 nextchr = UCHARAT(locinput);
2147 PL_reg_flags |= RF_tainted;
2157 c1 = OP(scan) == EXACTF;
2159 if (l >= PL_regeol) {
2162 if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2163 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2165 s += UTF ? UTF8SKIP(s) : 1;
2169 nextchr = UCHARAT(locinput);
2173 /* Inline the first character, for speed. */
2174 if (UCHARAT(s) != nextchr &&
2175 UCHARAT(s) != ((OP(scan) == EXACTF)
2176 ? PL_fold : PL_fold_locale)[nextchr])
2178 if (PL_regeol - locinput < ln)
2180 if (ln > 1 && (OP(scan) == EXACTF
2181 ? ibcmp(s, locinput, ln)
2182 : ibcmp_locale(s, locinput, ln)))
2185 nextchr = UCHARAT(locinput);
2189 if (!reginclass(scan, (U8*)locinput, do_utf8))
2191 if (locinput >= PL_regeol)
2193 locinput += PL_utf8skip[nextchr];
2194 nextchr = UCHARAT(locinput);
2198 nextchr = UCHARAT(locinput);
2199 if (!reginclass(scan, (U8*)locinput, do_utf8))
2201 if (!nextchr && locinput >= PL_regeol)
2203 nextchr = UCHARAT(++locinput);
2207 PL_reg_flags |= RF_tainted;
2213 if (!(OP(scan) == ALNUM
2214 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2215 : isALNUM_LC_utf8((U8*)locinput)))
2219 locinput += PL_utf8skip[nextchr];
2220 nextchr = UCHARAT(locinput);
2223 if (!(OP(scan) == ALNUM
2224 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2226 nextchr = UCHARAT(++locinput);
2229 PL_reg_flags |= RF_tainted;
2232 if (!nextchr && locinput >= PL_regeol)
2235 LOAD_UTF8_CHARCLASS(alnum,"a");
2236 if (OP(scan) == NALNUM
2237 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2238 : isALNUM_LC_utf8((U8*)locinput))
2242 locinput += PL_utf8skip[nextchr];
2243 nextchr = UCHARAT(locinput);
2246 if (OP(scan) == NALNUM
2247 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2249 nextchr = UCHARAT(++locinput);
2253 PL_reg_flags |= RF_tainted;
2257 /* was last char in word? */
2259 if (locinput == PL_regbol)
2262 U8 *r = reghop((U8*)locinput, -1);
2264 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2266 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2267 ln = isALNUM_uni(ln);
2268 LOAD_UTF8_CHARCLASS(alnum,"a");
2269 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2272 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2273 n = isALNUM_LC_utf8((U8*)locinput);
2277 ln = (locinput != PL_regbol) ?
2278 UCHARAT(locinput - 1) : PL_regprev;
2279 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2281 n = isALNUM(nextchr);
2284 ln = isALNUM_LC(ln);
2285 n = isALNUM_LC(nextchr);
2288 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2289 OP(scan) == BOUNDL))
2293 PL_reg_flags |= RF_tainted;
2299 if (UTF8_IS_CONTINUED(nextchr)) {
2300 LOAD_UTF8_CHARCLASS(space," ");
2301 if (!(OP(scan) == SPACE
2302 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2303 : isSPACE_LC_utf8((U8*)locinput)))
2307 locinput += PL_utf8skip[nextchr];
2308 nextchr = UCHARAT(locinput);
2311 if (!(OP(scan) == SPACE
2312 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2314 nextchr = UCHARAT(++locinput);
2317 if (!(OP(scan) == SPACE
2318 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2320 nextchr = UCHARAT(++locinput);
2324 PL_reg_flags |= RF_tainted;
2327 if (!nextchr && locinput >= PL_regeol)
2330 LOAD_UTF8_CHARCLASS(space," ");
2331 if (OP(scan) == NSPACE
2332 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2333 : isSPACE_LC_utf8((U8*)locinput))
2337 locinput += PL_utf8skip[nextchr];
2338 nextchr = UCHARAT(locinput);
2341 if (OP(scan) == NSPACE
2342 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2344 nextchr = UCHARAT(++locinput);
2347 PL_reg_flags |= RF_tainted;
2353 LOAD_UTF8_CHARCLASS(digit,"0");
2354 if (!(OP(scan) == DIGIT
2355 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2356 : isDIGIT_LC_utf8((U8*)locinput)))
2360 locinput += PL_utf8skip[nextchr];
2361 nextchr = UCHARAT(locinput);
2364 if (!(OP(scan) == DIGIT
2365 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2367 nextchr = UCHARAT(++locinput);
2370 PL_reg_flags |= RF_tainted;
2373 if (!nextchr && locinput >= PL_regeol)
2376 LOAD_UTF8_CHARCLASS(digit,"0");
2377 if (OP(scan) == NDIGIT
2378 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2379 : isDIGIT_LC_utf8((U8*)locinput))
2383 locinput += PL_utf8skip[nextchr];
2384 nextchr = UCHARAT(locinput);
2387 if (OP(scan) == NDIGIT
2388 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2390 nextchr = UCHARAT(++locinput);
2393 LOAD_UTF8_CHARCLASS(mark,"~");
2394 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2396 locinput += PL_utf8skip[nextchr];
2397 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2398 locinput += UTF8SKIP(locinput);
2399 if (locinput > PL_regeol)
2401 nextchr = UCHARAT(locinput);
2404 PL_reg_flags |= RF_tainted;
2408 n = ARG(scan); /* which paren pair */
2409 ln = PL_regstartp[n];
2410 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2411 if (*PL_reglastparen < n || ln == -1)
2412 sayNO; /* Do not match unless seen CLOSEn. */
2413 if (ln == PL_regendp[n])
2417 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2419 char *e = PL_bostr + PL_regendp[n];
2421 * Note that we can't do the "other character" lookup trick as
2422 * in the 8-bit case (no pun intended) because in Unicode we
2423 * have to map both upper and title case to lower case.
2425 if (OP(scan) == REFF) {
2429 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2439 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2446 nextchr = UCHARAT(locinput);
2450 /* Inline the first character, for speed. */
2451 if (UCHARAT(s) != nextchr &&
2453 (UCHARAT(s) != ((OP(scan) == REFF
2454 ? PL_fold : PL_fold_locale)[nextchr]))))
2456 ln = PL_regendp[n] - ln;
2457 if (locinput + ln > PL_regeol)
2459 if (ln > 1 && (OP(scan) == REF
2460 ? memNE(s, locinput, ln)
2462 ? ibcmp(s, locinput, ln)
2463 : ibcmp_locale(s, locinput, ln))))
2466 nextchr = UCHARAT(locinput);
2477 OP_4tree *oop = PL_op;
2478 COP *ocurcop = PL_curcop;
2479 SV **ocurpad = PL_curpad;
2483 PL_op = (OP_4tree*)PL_regdata->data[n];
2484 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2485 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2486 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2488 CALLRUNOPS(aTHX); /* Scalar context. */
2494 PL_curpad = ocurpad;
2495 PL_curcop = ocurcop;
2497 if (logical == 2) { /* Postponed subexpression. */
2499 MAGIC *mg = Null(MAGIC*);
2501 CHECKPOINT cp, lastcp;
2503 if(SvROK(ret) || SvRMAGICAL(ret)) {
2504 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2507 mg = mg_find(sv, 'r');
2510 re = (regexp *)mg->mg_obj;
2511 (void)ReREFCNT_inc(re);
2515 char *t = SvPV(ret, len);
2517 char *oprecomp = PL_regprecomp;
2518 I32 osize = PL_regsize;
2519 I32 onpar = PL_regnpar;
2522 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2524 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2525 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2526 PL_regprecomp = oprecomp;
2531 PerlIO_printf(Perl_debug_log,
2532 "Entering embedded `%s%.60s%s%s'\n",
2536 (strlen(re->precomp) > 60 ? "..." : ""))
2539 state.prev = PL_reg_call_cc;
2540 state.cc = PL_regcc;
2541 state.re = PL_reg_re;
2545 cp = regcppush(0); /* Save *all* the positions. */
2548 state.ss = PL_savestack_ix;
2549 *PL_reglastparen = 0;
2550 PL_reg_call_cc = &state;
2551 PL_reginput = locinput;
2553 /* XXXX This is too dramatic a measure... */
2556 if (regmatch(re->program + 1)) {
2557 /* Even though we succeeded, we need to restore
2558 global variables, since we may be wrapped inside
2559 SUSPEND, thus the match may be not finished yet. */
2561 /* XXXX Do this only if SUSPENDed? */
2562 PL_reg_call_cc = state.prev;
2563 PL_regcc = state.cc;
2564 PL_reg_re = state.re;
2565 cache_re(PL_reg_re);
2567 /* XXXX This is too dramatic a measure... */
2570 /* These are needed even if not SUSPEND. */
2576 REGCP_UNWIND(lastcp);
2578 PL_reg_call_cc = state.prev;
2579 PL_regcc = state.cc;
2580 PL_reg_re = state.re;
2581 cache_re(PL_reg_re);
2583 /* XXXX This is too dramatic a measure... */
2592 sv_setsv(save_scalar(PL_replgv), ret);
2596 n = ARG(scan); /* which paren pair */
2597 PL_reg_start_tmp[n] = locinput;
2602 n = ARG(scan); /* which paren pair */
2603 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2604 PL_regendp[n] = locinput - PL_bostr;
2605 if (n > *PL_reglastparen)
2606 *PL_reglastparen = n;
2609 n = ARG(scan); /* which paren pair */
2610 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2613 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2615 next = NEXTOPER(NEXTOPER(scan));
2617 next = scan + ARG(scan);
2618 if (OP(next) == IFTHEN) /* Fake one. */
2619 next = NEXTOPER(NEXTOPER(next));
2623 logical = scan->flags;
2625 /*******************************************************************
2626 PL_regcc contains infoblock about the innermost (...)* loop, and
2627 a pointer to the next outer infoblock.
2629 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2631 1) After matching X, regnode for CURLYX is processed;
2633 2) This regnode creates infoblock on the stack, and calls
2634 regmatch() recursively with the starting point at WHILEM node;
2636 3) Each hit of WHILEM node tries to match A and Z (in the order
2637 depending on the current iteration, min/max of {min,max} and
2638 greediness). The information about where are nodes for "A"
2639 and "Z" is read from the infoblock, as is info on how many times "A"
2640 was already matched, and greediness.
2642 4) After A matches, the same WHILEM node is hit again.
2644 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2645 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2646 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2647 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2648 of the external loop.
2650 Currently present infoblocks form a tree with a stem formed by PL_curcc
2651 and whatever it mentions via ->next, and additional attached trees
2652 corresponding to temporarily unset infoblocks as in "5" above.
2654 In the following picture infoblocks for outer loop of
2655 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2656 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2657 infoblocks are drawn below the "reset" infoblock.
2659 In fact in the picture below we do not show failed matches for Z and T
2660 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2661 more obvious *why* one needs to *temporary* unset infoblocks.]
2663 Matched REx position InfoBlocks Comment
2667 Y A)*?Z)*?T x <- O <- I
2668 YA )*?Z)*?T x <- O <- I
2669 YA A)*?Z)*?T x <- O <- I
2670 YAA )*?Z)*?T x <- O <- I
2671 YAA Z)*?T x <- O # Temporary unset I
2674 YAAZ Y(A)*?Z)*?T x <- O
2677 YAAZY (A)*?Z)*?T x <- O
2680 YAAZY A)*?Z)*?T x <- O <- I
2683 YAAZYA )*?Z)*?T x <- O <- I
2686 YAAZYA Z)*?T x <- O # Temporary unset I
2692 YAAZYAZ T x # Temporary unset O
2699 *******************************************************************/
2702 CHECKPOINT cp = PL_savestack_ix;
2703 /* No need to save/restore up to this paren */
2704 I32 parenfloor = scan->flags;
2706 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2708 cc.oldcc = PL_regcc;
2710 /* XXXX Probably it is better to teach regpush to support
2711 parenfloor > PL_regsize... */
2712 if (parenfloor > *PL_reglastparen)
2713 parenfloor = *PL_reglastparen; /* Pessimization... */
2714 cc.parenfloor = parenfloor;
2716 cc.min = ARG1(scan);
2717 cc.max = ARG2(scan);
2718 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2722 PL_reginput = locinput;
2723 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2725 PL_regcc = cc.oldcc;
2731 * This is really hard to understand, because after we match
2732 * what we're trying to match, we must make sure the rest of
2733 * the REx is going to match for sure, and to do that we have
2734 * to go back UP the parse tree by recursing ever deeper. And
2735 * if it fails, we have to reset our parent's current state
2736 * that we can try again after backing off.
2739 CHECKPOINT cp, lastcp;
2740 CURCUR* cc = PL_regcc;
2741 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2743 n = cc->cur + 1; /* how many we know we matched */
2744 PL_reginput = locinput;
2747 PerlIO_printf(Perl_debug_log,
2748 "%*s %ld out of %ld..%ld cc=%lx\n",
2749 REPORT_CODE_OFF+PL_regindent*2, "",
2750 (long)n, (long)cc->min,
2751 (long)cc->max, (long)cc)
2754 /* If degenerate scan matches "", assume scan done. */
2756 if (locinput == cc->lastloc && n >= cc->min) {
2757 PL_regcc = cc->oldcc;
2761 PerlIO_printf(Perl_debug_log,
2762 "%*s empty match detected, try continuation...\n",
2763 REPORT_CODE_OFF+PL_regindent*2, "")
2765 if (regmatch(cc->next))
2773 /* First just match a string of min scans. */
2777 cc->lastloc = locinput;
2778 if (regmatch(cc->scan))
2781 cc->lastloc = lastloc;
2786 /* Check whether we already were at this position.
2787 Postpone detection until we know the match is not
2788 *that* much linear. */
2789 if (!PL_reg_maxiter) {
2790 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2791 PL_reg_leftiter = PL_reg_maxiter;
2793 if (PL_reg_leftiter-- == 0) {
2794 I32 size = (PL_reg_maxiter + 7)/8;
2795 if (PL_reg_poscache) {
2796 if (PL_reg_poscache_size < size) {
2797 Renew(PL_reg_poscache, size, char);
2798 PL_reg_poscache_size = size;
2800 Zero(PL_reg_poscache, size, char);
2803 PL_reg_poscache_size = size;
2804 Newz(29, PL_reg_poscache, size, char);
2807 PerlIO_printf(Perl_debug_log,
2808 "%sDetected a super-linear match, switching on caching%s...\n",
2809 PL_colors[4], PL_colors[5])
2812 if (PL_reg_leftiter < 0) {
2813 I32 o = locinput - PL_bostr, b;
2815 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2818 if (PL_reg_poscache[o] & (1<<b)) {
2820 PerlIO_printf(Perl_debug_log,
2821 "%*s already tried at this position...\n",
2822 REPORT_CODE_OFF+PL_regindent*2, "")
2826 PL_reg_poscache[o] |= (1<<b);
2830 /* Prefer next over scan for minimal matching. */
2833 PL_regcc = cc->oldcc;
2836 cp = regcppush(cc->parenfloor);
2838 if (regmatch(cc->next)) {
2840 sayYES; /* All done. */
2842 REGCP_UNWIND(lastcp);
2848 if (n >= cc->max) { /* Maximum greed exceeded? */
2849 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2850 && !(PL_reg_flags & RF_warned)) {
2851 PL_reg_flags |= RF_warned;
2852 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2853 "Complex regular subexpression recursion",
2860 PerlIO_printf(Perl_debug_log,
2861 "%*s trying longer...\n",
2862 REPORT_CODE_OFF+PL_regindent*2, "")
2864 /* Try scanning more and see if it helps. */
2865 PL_reginput = locinput;
2867 cc->lastloc = locinput;
2868 cp = regcppush(cc->parenfloor);
2870 if (regmatch(cc->scan)) {
2874 REGCP_UNWIND(lastcp);
2877 cc->lastloc = lastloc;
2881 /* Prefer scan over next for maximal matching. */
2883 if (n < cc->max) { /* More greed allowed? */
2884 cp = regcppush(cc->parenfloor);
2886 cc->lastloc = locinput;
2888 if (regmatch(cc->scan)) {
2892 REGCP_UNWIND(lastcp);
2893 regcppop(); /* Restore some previous $<digit>s? */
2894 PL_reginput = locinput;
2896 PerlIO_printf(Perl_debug_log,
2897 "%*s failed, try continuation...\n",
2898 REPORT_CODE_OFF+PL_regindent*2, "")
2901 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2902 && !(PL_reg_flags & RF_warned)) {
2903 PL_reg_flags |= RF_warned;
2904 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2905 "Complex regular subexpression recursion",
2909 /* Failed deeper matches of scan, so see if this one works. */
2910 PL_regcc = cc->oldcc;
2913 if (regmatch(cc->next))
2919 cc->lastloc = lastloc;
2924 next = scan + ARG(scan);
2927 inner = NEXTOPER(NEXTOPER(scan));
2930 inner = NEXTOPER(scan);
2935 if (OP(next) != c1) /* No choice. */
2936 next = inner; /* Avoid recursion. */
2938 I32 lastparen = *PL_reglastparen;
2940 re_unwind_branch_t *uw;
2942 /* Put unwinding data on stack */
2943 unwind1 = SSNEWt(1,re_unwind_branch_t);
2944 uw = SSPTRt(unwind1,re_unwind_branch_t);
2947 uw->type = ((c1 == BRANCH)
2949 : RE_UNWIND_BRANCHJ);
2950 uw->lastparen = lastparen;
2952 uw->locinput = locinput;
2953 uw->nextchr = nextchr;
2955 uw->regindent = ++PL_regindent;
2958 REGCP_SET(uw->lastcp);
2960 /* Now go into the first branch */
2973 /* We suppose that the next guy does not need
2974 backtracking: in particular, it is of constant length,
2975 and has no parenths to influence future backrefs. */
2976 ln = ARG1(scan); /* min to match */
2977 n = ARG2(scan); /* max to match */
2978 paren = scan->flags;
2980 if (paren > PL_regsize)
2982 if (paren > *PL_reglastparen)
2983 *PL_reglastparen = paren;
2985 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2987 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2988 PL_reginput = locinput;
2991 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2993 if (ln && l == 0 && n >= ln
2994 /* In fact, this is tricky. If paren, then the
2995 fact that we did/didnot match may influence
2996 future execution. */
2997 && !(paren && ln == 0))
2999 locinput = PL_reginput;
3000 if (PL_regkind[(U8)OP(next)] == EXACT) {
3001 c1 = (U8)*STRING(next);
3002 if (OP(next) == EXACTF)
3004 else if (OP(next) == EXACTFL)
3005 c2 = PL_fold_locale[c1];
3012 /* This may be improved if l == 0. */
3013 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3014 /* If it could work, try it. */
3016 UCHARAT(PL_reginput) == c1 ||
3017 UCHARAT(PL_reginput) == c2)
3021 PL_regstartp[paren] =
3022 HOPc(PL_reginput, -l) - PL_bostr;
3023 PL_regendp[paren] = PL_reginput - PL_bostr;
3026 PL_regendp[paren] = -1;
3030 REGCP_UNWIND(lastcp);
3032 /* Couldn't or didn't -- move forward. */
3033 PL_reginput = locinput;
3034 if (regrepeat_hard(scan, 1, &l)) {
3036 locinput = PL_reginput;
3043 n = regrepeat_hard(scan, n, &l);
3044 if (n != 0 && l == 0
3045 /* In fact, this is tricky. If paren, then the
3046 fact that we did/didnot match may influence
3047 future execution. */
3048 && !(paren && ln == 0))
3050 locinput = PL_reginput;
3052 PerlIO_printf(Perl_debug_log,
3053 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3054 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3058 if (PL_regkind[(U8)OP(next)] == EXACT) {
3059 c1 = (U8)*STRING(next);
3060 if (OP(next) == EXACTF)
3062 else if (OP(next) == EXACTFL)
3063 c2 = PL_fold_locale[c1];
3072 /* If it could work, try it. */
3074 UCHARAT(PL_reginput) == c1 ||
3075 UCHARAT(PL_reginput) == c2)
3078 PerlIO_printf(Perl_debug_log,
3079 "%*s trying tail with n=%"IVdf"...\n",
3080 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3084 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3085 PL_regendp[paren] = PL_reginput - PL_bostr;
3088 PL_regendp[paren] = -1;
3092 REGCP_UNWIND(lastcp);
3094 /* Couldn't or didn't -- back up. */
3096 locinput = HOPc(locinput, -l);
3097 PL_reginput = locinput;
3104 paren = scan->flags; /* Which paren to set */
3105 if (paren > PL_regsize)
3107 if (paren > *PL_reglastparen)
3108 *PL_reglastparen = paren;
3109 ln = ARG1(scan); /* min to match */
3110 n = ARG2(scan); /* max to match */
3111 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3115 ln = ARG1(scan); /* min to match */
3116 n = ARG2(scan); /* max to match */
3117 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3122 scan = NEXTOPER(scan);
3128 scan = NEXTOPER(scan);
3132 * Lookahead to avoid useless match attempts
3133 * when we know what character comes next.
3135 if (PL_regkind[(U8)OP(next)] == EXACT) {
3136 U8 *s = (U8*)STRING(next);
3139 if (OP(next) == EXACTF)
3141 else if (OP(next) == EXACTFL)
3142 c2 = PL_fold_locale[c1];
3145 if (OP(next) == EXACTF) {
3146 c1 = to_utf8_lower(s);
3147 c2 = to_utf8_upper(s);
3150 c2 = c1 = utf8_to_uvchr(s, NULL);
3156 PL_reginput = locinput;
3160 if (ln && regrepeat(scan, ln) < ln)
3162 locinput = PL_reginput;
3165 char *e; /* Should not check after this */
3166 char *old = locinput;
3168 if (n == REG_INFTY) {
3171 while (UTF8_IS_CONTINUATION(*(U8*)e))
3177 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3181 e = locinput + n - ln;
3187 /* Find place 'next' could work */
3190 while (locinput <= e && *locinput != c1)
3193 while (locinput <= e
3198 count = locinput - old;
3205 utf8_to_uvchr((U8*)locinput, &len) != c1;
3210 for (count = 0; locinput <= e; count++) {
3211 UV c = utf8_to_uvchr((U8*)locinput, &len);
3212 if (c == c1 || c == c2)
3220 /* PL_reginput == old now */
3221 if (locinput != old) {
3222 ln = 1; /* Did some */
3223 if (regrepeat(scan, count) < count)
3226 /* PL_reginput == locinput now */
3227 TRYPAREN(paren, ln, locinput);
3228 PL_reginput = locinput; /* Could be reset... */
3229 REGCP_UNWIND(lastcp);
3230 /* Couldn't or didn't -- move forward. */
3233 locinput += UTF8SKIP(locinput);
3239 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3243 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3245 c = UCHARAT(PL_reginput);
3246 /* If it could work, try it. */
3247 if (c == c1 || c == c2)
3249 TRYPAREN(paren, n, PL_reginput);
3250 REGCP_UNWIND(lastcp);
3253 /* If it could work, try it. */
3254 else if (c1 == -1000)
3256 TRYPAREN(paren, n, PL_reginput);
3257 REGCP_UNWIND(lastcp);
3259 /* Couldn't or didn't -- move forward. */
3260 PL_reginput = locinput;
3261 if (regrepeat(scan, 1)) {
3263 locinput = PL_reginput;
3271 n = regrepeat(scan, n);
3272 locinput = PL_reginput;
3273 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3274 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3275 ln = n; /* why back off? */
3276 /* ...because $ and \Z can match before *and* after
3277 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3278 We should back off by one in this case. */
3279 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3288 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3290 c = UCHARAT(PL_reginput);
3292 /* If it could work, try it. */
3293 if (c1 == -1000 || c == c1 || c == c2)
3295 TRYPAREN(paren, n, PL_reginput);
3296 REGCP_UNWIND(lastcp);
3298 /* Couldn't or didn't -- back up. */
3300 PL_reginput = locinput = HOPc(locinput, -1);
3308 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3310 c = UCHARAT(PL_reginput);
3312 /* If it could work, try it. */
3313 if (c1 == -1000 || c == c1 || c == c2)
3315 TRYPAREN(paren, n, PL_reginput);
3316 REGCP_UNWIND(lastcp);
3318 /* Couldn't or didn't -- back up. */
3320 PL_reginput = locinput = HOPc(locinput, -1);
3327 if (PL_reg_call_cc) {
3328 re_cc_state *cur_call_cc = PL_reg_call_cc;
3329 CURCUR *cctmp = PL_regcc;
3330 regexp *re = PL_reg_re;
3331 CHECKPOINT cp, lastcp;
3333 cp = regcppush(0); /* Save *all* the positions. */
3335 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3337 PL_reginput = locinput; /* Make position available to
3339 cache_re(PL_reg_call_cc->re);
3340 PL_regcc = PL_reg_call_cc->cc;
3341 PL_reg_call_cc = PL_reg_call_cc->prev;
3342 if (regmatch(cur_call_cc->node)) {
3343 PL_reg_call_cc = cur_call_cc;
3347 REGCP_UNWIND(lastcp);
3349 PL_reg_call_cc = cur_call_cc;
3355 PerlIO_printf(Perl_debug_log,
3356 "%*s continuation failed...\n",
3357 REPORT_CODE_OFF+PL_regindent*2, "")
3361 if (locinput < PL_regtill) {
3362 DEBUG_r(PerlIO_printf(Perl_debug_log,
3363 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3365 (long)(locinput - PL_reg_starttry),
3366 (long)(PL_regtill - PL_reg_starttry),
3368 sayNO_FINAL; /* Cannot match: too short. */
3370 PL_reginput = locinput; /* put where regtry can find it */
3371 sayYES_FINAL; /* Success! */
3373 PL_reginput = locinput; /* put where regtry can find it */
3374 sayYES_LOUD; /* Success! */
3377 PL_reginput = locinput;
3382 if (UTF) { /* XXXX This is absolutely
3383 broken, we read before
3385 s = HOPMAYBEc(locinput, -scan->flags);
3391 if (locinput < PL_bostr + scan->flags)
3393 PL_reginput = locinput - scan->flags;
3398 PL_reginput = locinput;
3403 if (UTF) { /* XXXX This is absolutely
3404 broken, we read before
3406 s = HOPMAYBEc(locinput, -scan->flags);
3407 if (!s || s < PL_bostr)
3412 if (locinput < PL_bostr + scan->flags)
3414 PL_reginput = locinput - scan->flags;
3419 PL_reginput = locinput;
3422 inner = NEXTOPER(NEXTOPER(scan));
3423 if (regmatch(inner) != n) {
3438 if (OP(scan) == SUSPEND) {
3439 locinput = PL_reginput;
3440 nextchr = UCHARAT(locinput);
3445 next = scan + ARG(scan);
3450 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3451 PTR2UV(scan), OP(scan));
3452 Perl_croak(aTHX_ "regexp memory corruption");
3459 * We get here only if there's trouble -- normally "case END" is
3460 * the terminating point.
3462 Perl_croak(aTHX_ "corrupted regexp pointers");
3468 PerlIO_printf(Perl_debug_log,
3469 "%*s %scould match...%s\n",
3470 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3474 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3475 PL_colors[4],PL_colors[5]));
3481 #if 0 /* Breaks $^R */
3489 PerlIO_printf(Perl_debug_log,
3490 "%*s %sfailed...%s\n",
3491 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3497 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3500 case RE_UNWIND_BRANCH:
3501 case RE_UNWIND_BRANCHJ:
3503 re_unwind_branch_t *uwb = &(uw->branch);
3504 I32 lastparen = uwb->lastparen;
3506 REGCP_UNWIND(uwb->lastcp);
3507 for (n = *PL_reglastparen; n > lastparen; n--)
3509 *PL_reglastparen = n;
3510 scan = next = uwb->next;
3512 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3513 ? BRANCH : BRANCHJ) ) { /* Failure */
3520 /* Have more choice yet. Reuse the same uwb. */
3522 if ((n = (uwb->type == RE_UNWIND_BRANCH
3523 ? NEXT_OFF(next) : ARG(next))))
3526 next = NULL; /* XXXX Needn't unwinding in this case... */
3528 next = NEXTOPER(scan);
3529 if (uwb->type == RE_UNWIND_BRANCHJ)
3530 next = NEXTOPER(next);
3531 locinput = uwb->locinput;
3532 nextchr = uwb->nextchr;
3534 PL_regindent = uwb->regindent;
3541 Perl_croak(aTHX_ "regexp unwind memory corruption");
3552 - regrepeat - repeatedly match something simple, report how many
3555 * [This routine now assumes that it will only match on things of length 1.
3556 * That was true before, but now we assume scan - reginput is the count,
3557 * rather than incrementing count on every character. [Er, except utf8.]]
3560 S_regrepeat(pTHX_ regnode *p, I32 max)
3562 register char *scan;
3564 register char *loceol = PL_regeol;
3565 register I32 hardcount = 0;
3566 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3569 if (max != REG_INFTY && max < loceol - scan)
3570 loceol = scan + max;
3575 while (scan < loceol && hardcount < max && *scan != '\n') {
3576 scan += UTF8SKIP(scan);
3580 while (scan < loceol && *scan != '\n')
3587 case EXACT: /* length of string is 1 */
3589 while (scan < loceol && UCHARAT(scan) == c)
3592 case EXACTF: /* length of string is 1 */
3594 while (scan < loceol &&
3595 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3598 case EXACTFL: /* length of string is 1 */
3599 PL_reg_flags |= RF_tainted;
3601 while (scan < loceol &&
3602 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3608 while (hardcount < max && scan < loceol &&
3609 reginclass(p, (U8*)scan, do_utf8)) {
3610 scan += UTF8SKIP(scan);
3614 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3621 LOAD_UTF8_CHARCLASS(alnum,"a");
3622 while (hardcount < max && scan < loceol &&
3623 swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3624 scan += UTF8SKIP(scan);
3628 while (scan < loceol && isALNUM(*scan))
3633 PL_reg_flags |= RF_tainted;
3636 while (hardcount < max && scan < loceol &&
3637 isALNUM_LC_utf8((U8*)scan)) {
3638 scan += UTF8SKIP(scan);
3642 while (scan < loceol && isALNUM_LC(*scan))
3649 LOAD_UTF8_CHARCLASS(alnum,"a");
3650 while (hardcount < max && scan < loceol &&
3651 !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3652 scan += UTF8SKIP(scan);
3656 while (scan < loceol && !isALNUM(*scan))
3661 PL_reg_flags |= RF_tainted;
3664 while (hardcount < max && scan < loceol &&
3665 !isALNUM_LC_utf8((U8*)scan)) {
3666 scan += UTF8SKIP(scan);
3670 while (scan < loceol && !isALNUM_LC(*scan))
3677 LOAD_UTF8_CHARCLASS(space," ");
3678 while (hardcount < max && scan < loceol &&
3679 (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3680 scan += UTF8SKIP(scan);
3684 while (scan < loceol && isSPACE(*scan))
3689 PL_reg_flags |= RF_tainted;
3692 while (hardcount < max && scan < loceol &&
3693 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3694 scan += UTF8SKIP(scan);
3698 while (scan < loceol && isSPACE_LC(*scan))
3705 LOAD_UTF8_CHARCLASS(space," ");
3706 while (hardcount < max && scan < loceol &&
3707 !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3708 scan += UTF8SKIP(scan);
3712 while (scan < loceol && !isSPACE(*scan))
3717 PL_reg_flags |= RF_tainted;
3720 while (hardcount < max && scan < loceol &&
3721 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3722 scan += UTF8SKIP(scan);
3726 while (scan < loceol && !isSPACE_LC(*scan))
3733 LOAD_UTF8_CHARCLASS(digit,"0");
3734 while (hardcount < max && scan < loceol &&
3735 swash_fetch(PL_utf8_digit,(U8*)scan)) {
3736 scan += UTF8SKIP(scan);
3740 while (scan < loceol && isDIGIT(*scan))
3747 LOAD_UTF8_CHARCLASS(digit,"0");
3748 while (hardcount < max && scan < loceol &&
3749 !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3750 scan += UTF8SKIP(scan);
3754 while (scan < loceol && !isDIGIT(*scan))
3758 default: /* Called on something of 0 width. */
3759 break; /* So match right here or not at all. */
3765 c = scan - PL_reginput;
3770 SV *prop = sv_newmortal();
3773 PerlIO_printf(Perl_debug_log,
3774 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3775 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3782 - regrepeat_hard - repeatedly match something, report total lenth and length
3784 * The repeater is supposed to have constant length.
3788 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3790 register char *scan;
3791 register char *start;
3792 register char *loceol = PL_regeol;
3794 I32 count = 0, res = 1;
3799 start = PL_reginput;
3800 if (DO_UTF8(PL_reg_sv)) {
3801 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3804 while (start < PL_reginput) {
3806 start += UTF8SKIP(start);
3817 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3819 *lp = l = PL_reginput - start;
3820 if (max != REG_INFTY && l*max < loceol - scan)
3821 loceol = scan + l*max;
3834 - regclass_swash - prepare the utf8 swash
3838 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3843 if (PL_regdata && PL_regdata->count) {
3846 if (PL_regdata->what[n] == 's') {
3847 SV *rv = (SV*)PL_regdata->data[n];
3848 AV *av = (AV*)SvRV((SV*)rv);
3851 si = *av_fetch(av, 0, FALSE);
3852 a = av_fetch(av, 1, FALSE);
3856 else if (si && doinit) {
3857 sw = swash_init("utf8", "", si, 1, 0);
3858 (void)av_store(av, 1, sw);
3870 - reginclass - determine if a character falls into a character class
3874 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3876 char flags = ANYOF_FLAGS(n);
3882 c = utf8_to_uvchr(p, &len);
3886 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3887 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3888 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3891 if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256)
3894 SV *sw = regclass_swash(n, TRUE, 0);
3897 if (swash_fetch(sw, p))
3899 else if (flags & ANYOF_FOLD) {
3900 U8 tmpbuf[UTF8_MAXLEN+1];
3902 if (flags & ANYOF_LOCALE) {
3903 PL_reg_flags |= RF_tainted;
3904 uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3907 uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
3908 if (swash_fetch(sw, tmpbuf))
3914 if (!match && c < 256) {
3915 if (ANYOF_BITMAP_TEST(n, c))
3917 else if (flags & ANYOF_FOLD) {
3920 if (flags & ANYOF_LOCALE) {
3921 PL_reg_flags |= RF_tainted;
3922 f = PL_fold_locale[c];
3926 if (f != c && ANYOF_BITMAP_TEST(n, f))
3930 if (!match && (flags & ANYOF_CLASS)) {
3931 PL_reg_flags |= RF_tainted;
3933 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3937 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3938 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3939 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3940 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3941 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3942 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3943 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3944 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3945 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3946 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3947 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3948 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3949 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3950 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3951 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3952 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3953 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3954 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3955 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3956 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3957 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3958 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3959 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3960 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3961 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3962 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3963 ) /* How's that for a conditional? */
3970 return (flags & ANYOF_INVERT) ? !match : match;
3974 S_reghop(pTHX_ U8 *s, I32 off)
3976 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3980 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3983 while (off-- && s < lim) {
3984 /* XXX could check well-formedness here */
3992 if (UTF8_IS_CONTINUED(*s)) {
3993 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3996 /* XXX could check well-formedness here */
4004 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4006 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4010 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4013 while (off-- && s < lim) {
4014 /* XXX could check well-formedness here */
4024 if (UTF8_IS_CONTINUED(*s)) {
4025 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4028 /* XXX could check well-formedness here */
4044 restore_pos(pTHXo_ void *arg)
4046 if (PL_reg_eval_set) {
4047 if (PL_reg_oldsaved) {
4048 PL_reg_re->subbeg = PL_reg_oldsaved;
4049 PL_reg_re->sublen = PL_reg_oldsavedlen;
4050 RX_MATCH_COPIED_on(PL_reg_re);
4052 PL_reg_magic->mg_len = PL_reg_oldpos;
4053 PL_reg_eval_set = 0;
4054 PL_curpm = PL_reg_oldcurpm;