5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
85 #define RF_tainted 1 /* tainted information used? */
86 #define RF_warned 2 /* warned about big count? */
87 #define RF_evaled 4 /* Did an EVAL with setting? */
88 #define RF_utf8 8 /* String contains multibyte chars? */
90 #define UTF (PL_reg_flags & RF_utf8)
92 #define RS_init 1 /* eval environment created */
93 #define RS_set 2 /* replsv value is set */
103 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
106 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
107 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
108 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
109 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
110 #define HOPc(pos,off) ((char*)HOP(pos,off))
111 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
113 #define HOPBACK(pos, off) ( \
114 (UTF && PL_reg_match_utf8) \
115 ? reghopmaybe((U8*)pos, -off) \
116 : (pos - off >= PL_bostr) \
120 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
122 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
123 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
124 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
125 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
126 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
127 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
129 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
131 static void restore_pos(pTHX_ void *arg);
134 S_regcppush(pTHX_ I32 parenfloor)
136 int retval = PL_savestack_ix;
137 #define REGCP_PAREN_ELEMS 4
138 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
141 if (paren_elems_to_push < 0)
142 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
144 #define REGCP_OTHER_ELEMS 6
145 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
146 for (p = PL_regsize; p > parenfloor; p--) {
147 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
148 SSPUSHINT(PL_regendp[p]);
149 SSPUSHINT(PL_regstartp[p]);
150 SSPUSHPTR(PL_reg_start_tmp[p]);
153 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
154 SSPUSHINT(PL_regsize);
155 SSPUSHINT(*PL_reglastparen);
156 SSPUSHINT(*PL_reglastcloseparen);
157 SSPUSHPTR(PL_reginput);
158 #define REGCP_FRAME_ELEMS 2
159 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
160 * are needed for the regexp context stack bookkeeping. */
161 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
162 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
167 /* These are needed since we do not localize EVAL nodes: */
168 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
169 " Setting an EVAL scope, savestack=%"IVdf"\n", \
170 (IV)PL_savestack_ix)); cp = PL_savestack_ix
172 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
173 PerlIO_printf(Perl_debug_log, \
174 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
175 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
185 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
187 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
188 i = SSPOPINT; /* Parentheses elements to pop. */
189 input = (char *) SSPOPPTR;
190 *PL_reglastcloseparen = SSPOPINT;
191 *PL_reglastparen = SSPOPINT;
192 PL_regsize = SSPOPINT;
194 /* Now restore the parentheses context. */
195 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
196 i > 0; i -= REGCP_PAREN_ELEMS) {
197 paren = (U32)SSPOPINT;
198 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
199 PL_regstartp[paren] = SSPOPINT;
201 if (paren <= *PL_reglastparen)
202 PL_regendp[paren] = tmps;
204 PerlIO_printf(Perl_debug_log,
205 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
206 (UV)paren, (IV)PL_regstartp[paren],
207 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
208 (IV)PL_regendp[paren],
209 (paren > *PL_reglastparen ? "(no)" : ""));
213 if (*PL_reglastparen + 1 <= PL_regnpar) {
214 PerlIO_printf(Perl_debug_log,
215 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
216 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
220 /* It would seem that the similar code in regtry()
221 * already takes care of this, and in fact it is in
222 * a better location to since this code can #if 0-ed out
223 * but the code in regtry() is needed or otherwise tests
224 * requiring null fields (pat.t#187 and split.t#{13,14}
225 * (as of patchlevel 7877) will fail. Then again,
226 * this code seems to be necessary or otherwise
227 * building DynaLoader will fail:
228 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
230 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
231 if (paren > PL_regsize)
232 PL_regstartp[paren] = -1;
233 PL_regendp[paren] = -1;
240 S_regcp_set_to(pTHX_ I32 ss)
242 I32 tmp = PL_savestack_ix;
244 PL_savestack_ix = ss;
246 PL_savestack_ix = tmp;
250 typedef struct re_cc_state
254 struct re_cc_state *prev;
259 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
261 #define TRYPAREN(paren, n, input) { \
264 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
265 PL_regendp[paren] = input - PL_bostr; \
268 PL_regendp[paren] = -1; \
270 if (regmatch(next)) \
273 PL_regendp[paren] = -1; \
278 * pregexec and friends
282 - pregexec - match a regexp against a string
285 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
286 char *strbeg, I32 minend, SV *screamer, U32 nosave)
287 /* strend: pointer to null at end of string */
288 /* strbeg: real beginning of string */
289 /* minend: end of match must be >=minend after stringarg. */
290 /* nosave: For optimizations. */
293 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
294 nosave ? 0 : REXEC_COPY_STR);
298 S_cache_re(pTHX_ regexp *prog)
300 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
302 PL_regprogram = prog->program;
304 PL_regnpar = prog->nparens;
305 PL_regdata = prog->data;
310 * Need to implement the following flags for reg_anch:
312 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
314 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
315 * INTUIT_AUTORITATIVE_ML
316 * INTUIT_ONCE_NOML - Intuit can match in one location only.
319 * Another flag for this function: SECOND_TIME (so that float substrs
320 * with giant delta may be not rechecked).
323 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
325 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
326 Otherwise, only SvCUR(sv) is used to get strbeg. */
328 /* XXXX We assume that strpos is strbeg unless sv. */
330 /* XXXX Some places assume that there is a fixed substring.
331 An update may be needed if optimizer marks as "INTUITable"
332 RExen without fixed substrings. Similarly, it is assumed that
333 lengths of all the strings are no more than minlen, thus they
334 cannot come from lookahead.
335 (Or minlen should take into account lookahead.) */
337 /* A failure to find a constant substring means that there is no need to make
338 an expensive call to REx engine, thus we celebrate a failure. Similarly,
339 finding a substring too deep into the string means that less calls to
340 regtry() should be needed.
342 REx compiler's optimizer found 4 possible hints:
343 a) Anchored substring;
345 c) Whether we are anchored (beginning-of-line or \G);
346 d) First node (of those at offset 0) which may distingush positions;
347 We use a)b)d) and multiline-part of c), and try to find a position in the
348 string which does not contradict any of them.
351 /* Most of decisions we do here should have been done at compile time.
352 The nodes of the REx which we used for the search should have been
353 deleted from the finite automaton. */
356 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
357 char *strend, U32 flags, re_scream_pos_data *data)
359 register I32 start_shift = 0;
360 /* Should be nonnegative! */
361 register I32 end_shift = 0;
367 register char *other_last = Nullch; /* other substr checked before this */
368 char *check_at = Nullch; /* check substr found at this pos */
370 char *i_strpos = strpos;
373 DEBUG_r( if (!PL_colorset) reginitcolors() );
374 DEBUG_r(PerlIO_printf(Perl_debug_log,
375 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
376 PL_colors[4],PL_colors[5],PL_colors[0],
379 (strlen(prog->precomp) > 60 ? "..." : ""),
381 (int)(strend - strpos > 60 ? 60 : strend - strpos),
382 strpos, PL_colors[1],
383 (strend - strpos > 60 ? "..." : ""))
386 if (prog->reganch & ROPT_UTF8)
387 PL_reg_flags |= RF_utf8;
389 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
390 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
393 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
395 check = prog->check_substr;
396 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
397 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
398 || ( (prog->reganch & ROPT_ANCH_BOL)
399 && !PL_multiline ) ); /* Check after \n? */
402 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
403 | ROPT_IMPLICIT)) /* not a real BOL */
404 /* SvCUR is not set on references: SvRV and SvPVX overlap */
406 && (strpos != strbeg)) {
407 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
410 if (prog->check_offset_min == prog->check_offset_max &&
411 !(prog->reganch & ROPT_CANY_SEEN)) {
412 /* Substring at constant offset from beg-of-str... */
415 s = HOP3c(strpos, prog->check_offset_min, strend);
417 slen = SvCUR(check); /* >= 1 */
419 if ( strend - s > slen || strend - s < slen - 1
420 || (strend - s == slen && strend[-1] != '\n')) {
421 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
424 /* Now should match s[0..slen-2] */
426 if (slen && (*SvPVX(check) != *s
428 && memNE(SvPVX(check), s, slen)))) {
430 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
434 else if (*SvPVX(check) != *s
435 || ((slen = SvCUR(check)) > 1
436 && memNE(SvPVX(check), s, slen)))
438 goto success_at_start;
441 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
443 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
444 end_shift = prog->minlen - start_shift -
445 CHR_SVLEN(check) + (SvTAIL(check) != 0);
447 I32 end = prog->check_offset_max + CHR_SVLEN(check)
448 - (SvTAIL(check) != 0);
449 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
451 if (end_shift < eshift)
455 else { /* Can match at random position */
458 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
459 /* Should be nonnegative! */
460 end_shift = prog->minlen - start_shift -
461 CHR_SVLEN(check) + (SvTAIL(check) != 0);
464 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
466 Perl_croak(aTHX_ "panic: end_shift");
470 /* Find a possible match in the region s..strend by looking for
471 the "check" substring in the region corrected by start/end_shift. */
472 if (flags & REXEC_SCREAM) {
473 I32 p = -1; /* Internal iterator of scream. */
474 I32 *pp = data ? data->scream_pos : &p;
476 if (PL_screamfirst[BmRARE(check)] >= 0
477 || ( BmRARE(check) == '\n'
478 && (BmPREVIOUS(check) == SvCUR(check) - 1)
480 s = screaminstr(sv, check,
481 start_shift + (s - strbeg), end_shift, pp, 0);
485 *data->scream_olds = s;
487 else if (prog->reganch & ROPT_CANY_SEEN)
488 s = fbm_instr((U8*)(s + start_shift),
489 (U8*)(strend - end_shift),
490 check, PL_multiline ? FBMrf_MULTILINE : 0);
492 s = fbm_instr(HOP3(s, start_shift, strend),
493 HOP3(strend, -end_shift, strbeg),
494 check, PL_multiline ? FBMrf_MULTILINE : 0);
496 /* Update the count-of-usability, remove useless subpatterns,
499 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
500 (s ? "Found" : "Did not find"),
501 ((check == prog->anchored_substr) ? "anchored" : "floating"),
503 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
505 PL_colors[1], (SvTAIL(check) ? "$" : ""),
506 (s ? " at offset " : "...\n") ) );
513 /* Finish the diagnostic message */
514 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
516 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
517 Start with the other substr.
518 XXXX no SCREAM optimization yet - and a very coarse implementation
519 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
520 *always* match. Probably should be marked during compile...
521 Probably it is right to do no SCREAM here...
524 if (prog->float_substr && prog->anchored_substr) {
525 /* Take into account the "other" substring. */
526 /* XXXX May be hopelessly wrong for UTF... */
529 if (check == prog->float_substr) {
532 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
535 t = s - prog->check_offset_max;
536 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
537 && (!(prog->reganch & ROPT_UTF8)
538 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
543 t = HOP3c(t, prog->anchored_offset, strend);
544 if (t < other_last) /* These positions already checked */
546 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
549 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
550 /* On end-of-str: see comment below. */
551 s = fbm_instr((unsigned char*)t,
552 HOP3(HOP3(last1, prog->anchored_offset, strend)
553 + SvCUR(prog->anchored_substr),
554 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
555 prog->anchored_substr,
556 PL_multiline ? FBMrf_MULTILINE : 0);
557 DEBUG_r(PerlIO_printf(Perl_debug_log,
558 "%s anchored substr `%s%.*s%s'%s",
559 (s ? "Found" : "Contradicts"),
561 (int)(SvCUR(prog->anchored_substr)
562 - (SvTAIL(prog->anchored_substr)!=0)),
563 SvPVX(prog->anchored_substr),
564 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
566 if (last1 >= last2) {
567 DEBUG_r(PerlIO_printf(Perl_debug_log,
568 ", giving up...\n"));
571 DEBUG_r(PerlIO_printf(Perl_debug_log,
572 ", trying floating at offset %ld...\n",
573 (long)(HOP3c(s1, 1, strend) - i_strpos)));
574 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
575 s = HOP3c(last, 1, strend);
579 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
580 (long)(s - i_strpos)));
581 t = HOP3c(s, -prog->anchored_offset, strbeg);
582 other_last = HOP3c(s, 1, strend);
590 else { /* Take into account the floating substring. */
594 t = HOP3c(s, -start_shift, strbeg);
596 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
597 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
598 last = HOP3c(t, prog->float_max_offset, strend);
599 s = HOP3c(t, prog->float_min_offset, strend);
602 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
603 /* fbm_instr() takes into account exact value of end-of-str
604 if the check is SvTAIL(ed). Since false positives are OK,
605 and end-of-str is not later than strend we are OK. */
606 s = fbm_instr((unsigned char*)s,
607 (unsigned char*)last + SvCUR(prog->float_substr)
608 - (SvTAIL(prog->float_substr)!=0),
609 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
610 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
611 (s ? "Found" : "Contradicts"),
613 (int)(SvCUR(prog->float_substr)
614 - (SvTAIL(prog->float_substr)!=0)),
615 SvPVX(prog->float_substr),
616 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
619 DEBUG_r(PerlIO_printf(Perl_debug_log,
620 ", giving up...\n"));
623 DEBUG_r(PerlIO_printf(Perl_debug_log,
624 ", trying anchored starting at offset %ld...\n",
625 (long)(s1 + 1 - i_strpos)));
627 s = HOP3c(t, 1, strend);
631 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
632 (long)(s - i_strpos)));
633 other_last = s; /* Fix this later. --Hugo */
642 t = s - prog->check_offset_max;
643 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
644 && (!(prog->reganch & ROPT_UTF8)
645 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
647 /* Fixed substring is found far enough so that the match
648 cannot start at strpos. */
650 if (ml_anch && t[-1] != '\n') {
651 /* Eventually fbm_*() should handle this, but often
652 anchored_offset is not 0, so this check will not be wasted. */
653 /* XXXX In the code below we prefer to look for "^" even in
654 presence of anchored substrings. And we search even
655 beyond the found float position. These pessimizations
656 are historical artefacts only. */
658 while (t < strend - prog->minlen) {
660 if (t < check_at - prog->check_offset_min) {
661 if (prog->anchored_substr) {
662 /* Since we moved from the found position,
663 we definitely contradict the found anchored
664 substr. Due to the above check we do not
665 contradict "check" substr.
666 Thus we can arrive here only if check substr
667 is float. Redo checking for "other"=="fixed".
670 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
671 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
672 goto do_other_anchored;
674 /* We don't contradict the found floating substring. */
675 /* XXXX Why not check for STCLASS? */
677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
678 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
681 /* Position contradicts check-string */
682 /* XXXX probably better to look for check-string
683 than for "\n", so one should lower the limit for t? */
684 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
685 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
686 other_last = strpos = s = t + 1;
691 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
692 PL_colors[0],PL_colors[1]));
696 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
697 PL_colors[0],PL_colors[1]));
701 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
704 /* The found string does not prohibit matching at strpos,
705 - no optimization of calling REx engine can be performed,
706 unless it was an MBOL and we are not after MBOL,
707 or a future STCLASS check will fail this. */
709 /* Even in this situation we may use MBOL flag if strpos is offset
710 wrt the start of the string. */
711 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
712 && (strpos != strbeg) && strpos[-1] != '\n'
713 /* May be due to an implicit anchor of m{.*foo} */
714 && !(prog->reganch & ROPT_IMPLICIT))
719 DEBUG_r( if (ml_anch)
720 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
721 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
724 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
725 && prog->check_substr /* Could be deleted already */
726 && --BmUSEFUL(prog->check_substr) < 0
727 && prog->check_substr == prog->float_substr)
729 /* If flags & SOMETHING - do not do it many times on the same match */
730 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
731 SvREFCNT_dec(prog->check_substr);
732 prog->check_substr = Nullsv; /* disable */
733 prog->float_substr = Nullsv; /* clear */
734 check = Nullsv; /* abort */
736 /* XXXX This is a remnant of the old implementation. It
737 looks wasteful, since now INTUIT can use many
739 prog->reganch &= ~RE_USE_INTUIT;
746 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
747 if (prog->regstclass) {
748 /* minlen == 0 is possible if regstclass is \b or \B,
749 and the fixed substr is ''$.
750 Since minlen is already taken into account, s+1 is before strend;
751 accidentally, minlen >= 1 guaranties no false positives at s + 1
752 even for \b or \B. But (minlen? 1 : 0) below assumes that
753 regstclass does not come from lookahead... */
754 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
755 This leaves EXACTF only, which is dealt with in find_byclass(). */
756 U8* str = (U8*)STRING(prog->regstclass);
757 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
758 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
760 char *endpos = (prog->anchored_substr || ml_anch)
761 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
762 : (prog->float_substr
763 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
766 char *startpos = strbeg;
769 if (prog->reganch & ROPT_UTF8) {
770 PL_regdata = prog->data;
773 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
778 if (endpos == strend) {
779 DEBUG_r( PerlIO_printf(Perl_debug_log,
780 "Could not match STCLASS...\n") );
783 DEBUG_r( PerlIO_printf(Perl_debug_log,
784 "This position contradicts STCLASS...\n") );
785 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
787 /* Contradict one of substrings */
788 if (prog->anchored_substr) {
789 if (prog->anchored_substr == check) {
790 DEBUG_r( what = "anchored" );
792 s = HOP3c(t, 1, strend);
793 if (s + start_shift + end_shift > strend) {
794 /* XXXX Should be taken into account earlier? */
795 DEBUG_r( PerlIO_printf(Perl_debug_log,
796 "Could not match STCLASS...\n") );
801 DEBUG_r( PerlIO_printf(Perl_debug_log,
802 "Looking for %s substr starting at offset %ld...\n",
803 what, (long)(s + start_shift - i_strpos)) );
806 /* Have both, check_string is floating */
807 if (t + start_shift >= check_at) /* Contradicts floating=check */
808 goto retry_floating_check;
809 /* Recheck anchored substring, but not floating... */
813 DEBUG_r( PerlIO_printf(Perl_debug_log,
814 "Looking for anchored substr starting at offset %ld...\n",
815 (long)(other_last - i_strpos)) );
816 goto do_other_anchored;
818 /* Another way we could have checked stclass at the
819 current position only: */
824 DEBUG_r( PerlIO_printf(Perl_debug_log,
825 "Looking for /%s^%s/m starting at offset %ld...\n",
826 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
829 if (!prog->float_substr) /* Could have been deleted */
831 /* Check is floating subtring. */
832 retry_floating_check:
833 t = check_at - start_shift;
834 DEBUG_r( what = "floating" );
835 goto hop_and_restart;
838 DEBUG_r(PerlIO_printf(Perl_debug_log,
839 "By STCLASS: moving %ld --> %ld\n",
840 (long)(t - i_strpos), (long)(s - i_strpos))
844 DEBUG_r(PerlIO_printf(Perl_debug_log,
845 "Does not contradict STCLASS...\n");
850 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
851 PL_colors[4], (check ? "Guessed" : "Giving up"),
852 PL_colors[5], (long)(s - i_strpos)) );
855 fail_finish: /* Substring not found */
856 if (prog->check_substr) /* could be removed already */
857 BmUSEFUL(prog->check_substr) += 5; /* hooray */
859 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
860 PL_colors[4],PL_colors[5]));
864 /* We know what class REx starts with. Try to find this position... */
866 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
868 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
874 register I32 tmp = 1; /* Scratch variable? */
875 register bool do_utf8 = PL_reg_match_utf8;
877 /* We know what class it must start with. */
881 if (reginclass(c, (U8*)s, do_utf8)) {
882 if (tmp && (norun || regtry(prog, s)))
889 s += do_utf8 ? UTF8SKIP(s) : 1;
894 if (tmp && (norun || regtry(prog, s)))
905 c1 = to_utf8_lower((U8*)m);
906 c2 = to_utf8_upper((U8*)m);
917 c2 = PL_fold_locale[c1];
922 e = s; /* Due to minlen logic of intuit() */
928 if ( utf8_to_uvchr((U8*)s, &len) == c1
935 UV c = utf8_to_uvchr((U8*)s, &len);
936 if ( (c == c1 || c == c2) && regtry(prog, s) )
945 && (ln == 1 || !(OP(c) == EXACTF
947 : ibcmp_locale(s, m, ln)))
948 && (norun || regtry(prog, s)) )
954 if ( (*(U8*)s == c1 || *(U8*)s == c2)
955 && (ln == 1 || !(OP(c) == EXACTF
957 : ibcmp_locale(s, m, ln)))
958 && (norun || regtry(prog, s)) )
965 PL_reg_flags |= RF_tainted;
972 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
975 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
977 tmp = ((OP(c) == BOUND ?
978 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
979 LOAD_UTF8_CHARCLASS(alnum,"a");
981 if (tmp == !(OP(c) == BOUND ?
982 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
983 isALNUM_LC_utf8((U8*)s)))
986 if ((norun || regtry(prog, s)))
993 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
994 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
997 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
999 if ((norun || regtry(prog, s)))
1005 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1009 PL_reg_flags |= RF_tainted;
1016 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1019 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1021 tmp = ((OP(c) == NBOUND ?
1022 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1023 LOAD_UTF8_CHARCLASS(alnum,"a");
1024 while (s < strend) {
1025 if (tmp == !(OP(c) == NBOUND ?
1026 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1027 isALNUM_LC_utf8((U8*)s)))
1029 else if ((norun || regtry(prog, s)))
1035 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1036 tmp = ((OP(c) == NBOUND ?
1037 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1038 while (s < strend) {
1040 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1042 else if ((norun || regtry(prog, s)))
1047 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1052 LOAD_UTF8_CHARCLASS(alnum,"a");
1053 while (s < strend) {
1054 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1055 if (tmp && (norun || regtry(prog, s)))
1066 while (s < strend) {
1068 if (tmp && (norun || regtry(prog, s)))
1080 PL_reg_flags |= RF_tainted;
1082 while (s < strend) {
1083 if (isALNUM_LC_utf8((U8*)s)) {
1084 if (tmp && (norun || regtry(prog, s)))
1095 while (s < strend) {
1096 if (isALNUM_LC(*s)) {
1097 if (tmp && (norun || regtry(prog, s)))
1110 LOAD_UTF8_CHARCLASS(alnum,"a");
1111 while (s < strend) {
1112 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1113 if (tmp && (norun || regtry(prog, s)))
1124 while (s < strend) {
1126 if (tmp && (norun || regtry(prog, s)))
1138 PL_reg_flags |= RF_tainted;
1140 while (s < strend) {
1141 if (!isALNUM_LC_utf8((U8*)s)) {
1142 if (tmp && (norun || regtry(prog, s)))
1153 while (s < strend) {
1154 if (!isALNUM_LC(*s)) {
1155 if (tmp && (norun || regtry(prog, s)))
1168 LOAD_UTF8_CHARCLASS(space," ");
1169 while (s < strend) {
1170 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1171 if (tmp && (norun || regtry(prog, s)))
1182 while (s < strend) {
1184 if (tmp && (norun || regtry(prog, s)))
1196 PL_reg_flags |= RF_tainted;
1198 while (s < strend) {
1199 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1200 if (tmp && (norun || regtry(prog, s)))
1211 while (s < strend) {
1212 if (isSPACE_LC(*s)) {
1213 if (tmp && (norun || regtry(prog, s)))
1226 LOAD_UTF8_CHARCLASS(space," ");
1227 while (s < strend) {
1228 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1229 if (tmp && (norun || regtry(prog, s)))
1240 while (s < strend) {
1242 if (tmp && (norun || regtry(prog, s)))
1254 PL_reg_flags |= RF_tainted;
1256 while (s < strend) {
1257 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1258 if (tmp && (norun || regtry(prog, s)))
1269 while (s < strend) {
1270 if (!isSPACE_LC(*s)) {
1271 if (tmp && (norun || regtry(prog, s)))
1284 LOAD_UTF8_CHARCLASS(digit,"0");
1285 while (s < strend) {
1286 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1287 if (tmp && (norun || regtry(prog, s)))
1298 while (s < strend) {
1300 if (tmp && (norun || regtry(prog, s)))
1312 PL_reg_flags |= RF_tainted;
1314 while (s < strend) {
1315 if (isDIGIT_LC_utf8((U8*)s)) {
1316 if (tmp && (norun || regtry(prog, s)))
1327 while (s < strend) {
1328 if (isDIGIT_LC(*s)) {
1329 if (tmp && (norun || regtry(prog, s)))
1342 LOAD_UTF8_CHARCLASS(digit,"0");
1343 while (s < strend) {
1344 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1345 if (tmp && (norun || regtry(prog, s)))
1356 while (s < strend) {
1358 if (tmp && (norun || regtry(prog, s)))
1370 PL_reg_flags |= RF_tainted;
1372 while (s < strend) {
1373 if (!isDIGIT_LC_utf8((U8*)s)) {
1374 if (tmp && (norun || regtry(prog, s)))
1385 while (s < strend) {
1386 if (!isDIGIT_LC(*s)) {
1387 if (tmp && (norun || regtry(prog, s)))
1399 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1408 - regexec_flags - match a regexp against a string
1411 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1412 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1413 /* strend: pointer to null at end of string */
1414 /* strbeg: real beginning of string */
1415 /* minend: end of match must be >=minend after stringarg. */
1416 /* data: May be used for some additional optimizations. */
1417 /* nosave: For optimizations. */
1420 register regnode *c;
1421 register char *startpos = stringarg;
1422 I32 minlen; /* must match at least this many chars */
1423 I32 dontbother = 0; /* how many characters not to try at end */
1424 /* I32 start_shift = 0; */ /* Offset of the start to find
1425 constant substr. */ /* CC */
1426 I32 end_shift = 0; /* Same for the end. */ /* CC */
1427 I32 scream_pos = -1; /* Internal iterator of scream. */
1429 SV* oreplsv = GvSV(PL_replgv);
1430 bool do_utf8 = DO_UTF8(sv);
1436 PL_regnarrate = DEBUG_r_TEST;
1439 /* Be paranoid... */
1440 if (prog == NULL || startpos == NULL) {
1441 Perl_croak(aTHX_ "NULL regexp parameter");
1445 minlen = prog->minlen;
1446 if (do_utf8 && !(prog->reganch & ROPT_CANY_SEEN)) {
1447 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1450 if (strend - startpos < minlen) goto phooey;
1453 /* Check validity of program. */
1454 if (UCHARAT(prog->program) != REG_MAGIC) {
1455 Perl_croak(aTHX_ "corrupted regexp program");
1459 PL_reg_eval_set = 0;
1462 if (prog->reganch & ROPT_UTF8)
1463 PL_reg_flags |= RF_utf8;
1465 /* Mark beginning of line for ^ and lookbehind. */
1466 PL_regbol = startpos;
1470 /* Mark end of line for $ (and such) */
1473 /* see how far we have to get to not match where we matched before */
1474 PL_regtill = startpos+minend;
1476 /* We start without call_cc context. */
1479 /* If there is a "must appear" string, look for it. */
1482 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1485 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1486 PL_reg_ganch = startpos;
1487 else if (sv && SvTYPE(sv) >= SVt_PVMG
1489 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1490 && mg->mg_len >= 0) {
1491 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1492 if (prog->reganch & ROPT_ANCH_GPOS) {
1493 if (s > PL_reg_ganch)
1498 else /* pos() not defined */
1499 PL_reg_ganch = strbeg;
1502 if (do_utf8 == (UTF!=0) &&
1503 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1504 re_scream_pos_data d;
1506 d.scream_olds = &scream_olds;
1507 d.scream_pos = &scream_pos;
1508 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1510 goto phooey; /* not present */
1513 DEBUG_r( if (!PL_colorset) reginitcolors() );
1514 DEBUG_r(PerlIO_printf(Perl_debug_log,
1515 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1516 PL_colors[4],PL_colors[5],PL_colors[0],
1519 (strlen(prog->precomp) > 60 ? "..." : ""),
1521 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1522 startpos, PL_colors[1],
1523 (strend - startpos > 60 ? "..." : ""))
1526 /* Simplest case: anchored match need be tried only once. */
1527 /* [unless only anchor is BOL and multiline is set] */
1528 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1529 if (s == startpos && regtry(prog, startpos))
1531 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1532 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1537 dontbother = minlen - 1;
1538 end = HOP3c(strend, -dontbother, strbeg) - 1;
1539 /* for multiline we only have to try after newlines */
1540 if (prog->check_substr) {
1544 if (regtry(prog, s))
1549 if (prog->reganch & RE_USE_INTUIT) {
1550 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1561 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1562 if (regtry(prog, s))
1569 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1570 if (regtry(prog, PL_reg_ganch))
1575 /* Messy cases: unanchored match. */
1576 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1577 /* we have /x+whatever/ */
1578 /* it must be a one character string (XXXX Except UTF?) */
1579 char ch = SvPVX(prog->anchored_substr)[0];
1585 while (s < strend) {
1587 DEBUG_r( did_match = 1 );
1588 if (regtry(prog, s)) goto got_it;
1590 while (s < strend && *s == ch)
1597 while (s < strend) {
1599 DEBUG_r( did_match = 1 );
1600 if (regtry(prog, s)) goto got_it;
1602 while (s < strend && *s == ch)
1608 DEBUG_r(if (!did_match)
1609 PerlIO_printf(Perl_debug_log,
1610 "Did not find anchored character...\n")
1614 else if (do_utf8 == (UTF!=0) &&
1615 (prog->anchored_substr != Nullsv
1616 || (prog->float_substr != Nullsv
1617 && prog->float_max_offset < strend - s))) {
1618 SV *must = prog->anchored_substr
1619 ? prog->anchored_substr : prog->float_substr;
1621 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1623 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1624 char *last = HOP3c(strend, /* Cannot start after this */
1625 -(I32)(CHR_SVLEN(must)
1626 - (SvTAIL(must) != 0) + back_min), strbeg);
1627 char *last1; /* Last position checked before */
1633 last1 = HOPc(s, -1);
1635 last1 = s - 1; /* bogus */
1637 /* XXXX check_substr already used to find `s', can optimize if
1638 check_substr==must. */
1640 dontbother = end_shift;
1641 strend = HOPc(strend, -dontbother);
1642 while ( (s <= last) &&
1643 ((flags & REXEC_SCREAM)
1644 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1645 end_shift, &scream_pos, 0))
1646 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1647 (unsigned char*)strend, must,
1648 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1649 DEBUG_r( did_match = 1 );
1650 if (HOPc(s, -back_max) > last1) {
1651 last1 = HOPc(s, -back_min);
1652 s = HOPc(s, -back_max);
1655 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1657 last1 = HOPc(s, -back_min);
1661 while (s <= last1) {
1662 if (regtry(prog, s))
1668 while (s <= last1) {
1669 if (regtry(prog, s))
1675 DEBUG_r(if (!did_match)
1676 PerlIO_printf(Perl_debug_log,
1677 "Did not find %s substr `%s%.*s%s'%s...\n",
1678 ((must == prog->anchored_substr)
1679 ? "anchored" : "floating"),
1681 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1683 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1687 else if ((c = prog->regstclass)) {
1688 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1689 /* don't bother with what can't match */
1690 strend = HOPc(strend, -(minlen - 1));
1692 SV *prop = sv_newmortal();
1694 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1696 if (find_byclass(prog, c, s, strend, startpos, 0))
1698 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1702 if (prog->float_substr != Nullsv) { /* Trim the end. */
1705 if (flags & REXEC_SCREAM) {
1706 last = screaminstr(sv, prog->float_substr, s - strbeg,
1707 end_shift, &scream_pos, 1); /* last one */
1709 last = scream_olds; /* Only one occurrence. */
1713 char *little = SvPV(prog->float_substr, len);
1715 if (SvTAIL(prog->float_substr)) {
1716 if (memEQ(strend - len + 1, little, len - 1))
1717 last = strend - len + 1;
1718 else if (!PL_multiline)
1719 last = memEQ(strend - len, little, len)
1720 ? strend - len : Nullch;
1726 last = rninstr(s, strend, little, little + len);
1728 last = strend; /* matching `$' */
1732 DEBUG_r(PerlIO_printf(Perl_debug_log,
1733 "%sCan't trim the tail, match fails (should not happen)%s\n",
1734 PL_colors[4],PL_colors[5]));
1735 goto phooey; /* Should not happen! */
1737 dontbother = strend - last + prog->float_min_offset;
1739 if (minlen && (dontbother < minlen))
1740 dontbother = minlen - 1;
1741 strend -= dontbother; /* this one's always in bytes! */
1742 /* We don't know much -- general case. */
1745 if (regtry(prog, s))
1754 if (regtry(prog, s))
1756 } while (s++ < strend);
1764 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1766 if (PL_reg_eval_set) {
1767 /* Preserve the current value of $^R */
1768 if (oreplsv != GvSV(PL_replgv))
1769 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1770 restored, the value remains
1772 restore_pos(aTHX_ 0);
1775 /* make sure $`, $&, $', and $digit will work later */
1776 if ( !(flags & REXEC_NOT_FIRST) ) {
1777 if (RX_MATCH_COPIED(prog)) {
1778 Safefree(prog->subbeg);
1779 RX_MATCH_COPIED_off(prog);
1781 if (flags & REXEC_COPY_STR) {
1782 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1784 s = savepvn(strbeg, i);
1787 RX_MATCH_COPIED_on(prog);
1790 prog->subbeg = strbeg;
1791 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1798 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1799 PL_colors[4],PL_colors[5]));
1800 if (PL_reg_eval_set)
1801 restore_pos(aTHX_ 0);
1806 - regtry - try match at specific point
1808 STATIC I32 /* 0 failure, 1 success */
1809 S_regtry(pTHX_ regexp *prog, char *startpos)
1817 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1819 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1822 PL_reg_eval_set = RS_init;
1824 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1825 (IV)(PL_stack_sp - PL_stack_base));
1827 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1828 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1829 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1831 /* Apparently this is not needed, judging by wantarray. */
1832 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1833 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1836 /* Make $_ available to executed code. */
1837 if (PL_reg_sv != DEFSV) {
1838 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
1843 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1844 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1845 /* prepare for quick setting of pos */
1846 sv_magic(PL_reg_sv, (SV*)0,
1847 PERL_MAGIC_regex_global, Nullch, 0);
1848 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
1852 PL_reg_oldpos = mg->mg_len;
1853 SAVEDESTRUCTOR_X(restore_pos, 0);
1855 if (!PL_reg_curpm) {
1856 Newz(22,PL_reg_curpm, 1, PMOP);
1859 SV* repointer = newSViv(0);
1860 /* so we know which PL_regex_padav element is PL_reg_curpm */
1861 SvFLAGS(repointer) |= SVf_BREAK;
1862 av_push(PL_regex_padav,repointer);
1863 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1864 PL_regex_pad = AvARRAY(PL_regex_padav);
1868 PM_SETRE(PL_reg_curpm, prog);
1869 PL_reg_oldcurpm = PL_curpm;
1870 PL_curpm = PL_reg_curpm;
1871 if (RX_MATCH_COPIED(prog)) {
1872 /* Here is a serious problem: we cannot rewrite subbeg,
1873 since it may be needed if this match fails. Thus
1874 $` inside (?{}) could fail... */
1875 PL_reg_oldsaved = prog->subbeg;
1876 PL_reg_oldsavedlen = prog->sublen;
1877 RX_MATCH_COPIED_off(prog);
1880 PL_reg_oldsaved = Nullch;
1881 prog->subbeg = PL_bostr;
1882 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1884 prog->startp[0] = startpos - PL_bostr;
1885 PL_reginput = startpos;
1886 PL_regstartp = prog->startp;
1887 PL_regendp = prog->endp;
1888 PL_reglastparen = &prog->lastparen;
1889 PL_reglastcloseparen = &prog->lastcloseparen;
1890 prog->lastparen = 0;
1892 DEBUG_r(PL_reg_starttry = startpos);
1893 if (PL_reg_start_tmpl <= prog->nparens) {
1894 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1895 if(PL_reg_start_tmp)
1896 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1898 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1901 /* XXXX What this code is doing here?!!! There should be no need
1902 to do this again and again, PL_reglastparen should take care of
1905 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1906 * Actually, the code in regcppop() (which Ilya may be meaning by
1907 * PL_reglastparen), is not needed at all by the test suite
1908 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1909 * enough, for building DynaLoader, or otherwise this
1910 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1911 * will happen. Meanwhile, this code *is* needed for the
1912 * above-mentioned test suite tests to succeed. The common theme
1913 * on those tests seems to be returning null fields from matches.
1918 if (prog->nparens) {
1919 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1926 if (regmatch(prog->program + 1)) {
1927 prog->endp[0] = PL_reginput - PL_bostr;
1930 REGCP_UNWIND(lastcp);
1934 #define RE_UNWIND_BRANCH 1
1935 #define RE_UNWIND_BRANCHJ 2
1939 typedef struct { /* XX: makes sense to enlarge it... */
1943 } re_unwind_generic_t;
1956 } re_unwind_branch_t;
1958 typedef union re_unwind_t {
1960 re_unwind_generic_t generic;
1961 re_unwind_branch_t branch;
1964 #define sayYES goto yes
1965 #define sayNO goto no
1966 #define sayYES_FINAL goto yes_final
1967 #define sayYES_LOUD goto yes_loud
1968 #define sayNO_FINAL goto no_final
1969 #define sayNO_SILENT goto do_no
1970 #define saySAME(x) if (x) goto yes; else goto no
1972 #define REPORT_CODE_OFF 24
1975 - regmatch - main matching routine
1977 * Conceptually the strategy is simple: check to see whether the current
1978 * node matches, call self recursively to see whether the rest matches,
1979 * and then act accordingly. In practice we make some effort to avoid
1980 * recursion, in particular by going through "ordinary" nodes (that don't
1981 * need to know whether the rest of the match failed) by a loop instead of
1984 /* [lwall] I've hoisted the register declarations to the outer block in order to
1985 * maybe save a little bit of pushing and popping on the stack. It also takes
1986 * advantage of machines that use a register save mask on subroutine entry.
1988 STATIC I32 /* 0 failure, 1 success */
1989 S_regmatch(pTHX_ regnode *prog)
1991 register regnode *scan; /* Current node. */
1992 regnode *next; /* Next node. */
1993 regnode *inner; /* Next node in internal branch. */
1994 register I32 nextchr; /* renamed nextchr - nextchar colides with
1995 function of same name */
1996 register I32 n; /* no or next */
1997 register I32 ln = 0; /* len or last */
1998 register char *s = Nullch; /* operand or save */
1999 register char *locinput = PL_reginput;
2000 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2001 int minmod = 0, sw = 0, logical = 0;
2004 I32 firstcp = PL_savestack_ix;
2006 register bool do_utf8 = PL_reg_match_utf8;
2012 /* Note that nextchr is a byte even in UTF */
2013 nextchr = UCHARAT(locinput);
2015 while (scan != NULL) {
2018 SV *prop = sv_newmortal();
2019 int docolor = *PL_colors[0];
2020 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2021 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2022 /* The part of the string before starttry has one color
2023 (pref0_len chars), between starttry and current
2024 position another one (pref_len - pref0_len chars),
2025 after the current position the third one.
2026 We assume that pref0_len <= pref_len, otherwise we
2027 decrease pref0_len. */
2028 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2029 ? (5 + taill) - l : locinput - PL_bostr;
2032 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2034 pref0_len = pref_len - (locinput - PL_reg_starttry);
2035 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2036 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2037 ? (5 + taill) - pref_len : PL_regeol - locinput);
2038 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2042 if (pref0_len > pref_len)
2043 pref0_len = pref_len;
2044 regprop(prop, scan);
2045 PerlIO_printf(Perl_debug_log,
2046 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2047 (IV)(locinput - PL_bostr),
2048 PL_colors[4], pref0_len,
2049 locinput - pref_len, PL_colors[5],
2050 PL_colors[2], pref_len - pref0_len,
2051 locinput - pref_len + pref0_len, PL_colors[3],
2052 (docolor ? "" : "> <"),
2053 PL_colors[0], l, locinput, PL_colors[1],
2054 15 - l - pref_len + 1,
2056 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2060 next = scan + NEXT_OFF(scan);
2066 if (locinput == PL_bostr || (PL_multiline &&
2067 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2069 /* regtill = regbol; */
2074 if (locinput == PL_bostr ||
2075 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2081 if (locinput == PL_bostr)
2085 if (locinput == PL_reg_ganch)
2095 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2100 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2102 if (PL_regeol - locinput > 1)
2106 if (PL_regeol != locinput)
2110 if (!nextchr && locinput >= PL_regeol)
2113 locinput += PL_utf8skip[nextchr];
2114 if (locinput > PL_regeol)
2116 nextchr = UCHARAT(locinput);
2119 nextchr = UCHARAT(++locinput);
2122 if (!nextchr && locinput >= PL_regeol)
2124 nextchr = UCHARAT(++locinput);
2127 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2130 locinput += PL_utf8skip[nextchr];
2131 if (locinput > PL_regeol)
2133 nextchr = UCHARAT(locinput);
2136 nextchr = UCHARAT(++locinput);
2141 if (do_utf8 != (UTF!=0)) {
2149 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2158 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2164 nextchr = UCHARAT(locinput);
2167 /* Inline the first character, for speed. */
2168 if (UCHARAT(s) != nextchr)
2170 if (PL_regeol - locinput < ln)
2172 if (ln > 1 && memNE(s, locinput, ln))
2175 nextchr = UCHARAT(locinput);
2178 PL_reg_flags |= RF_tainted;
2188 c1 = OP(scan) == EXACTF;
2190 if (l >= PL_regeol) {
2193 if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2194 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2196 s += UTF ? UTF8SKIP(s) : 1;
2200 nextchr = UCHARAT(locinput);
2204 /* Inline the first character, for speed. */
2205 if (UCHARAT(s) != nextchr &&
2206 UCHARAT(s) != ((OP(scan) == EXACTF)
2207 ? PL_fold : PL_fold_locale)[nextchr])
2209 if (PL_regeol - locinput < ln)
2211 if (ln > 1 && (OP(scan) == EXACTF
2212 ? ibcmp(s, locinput, ln)
2213 : ibcmp_locale(s, locinput, ln)))
2216 nextchr = UCHARAT(locinput);
2220 if (!reginclass(scan, (U8*)locinput, do_utf8))
2222 if (locinput >= PL_regeol)
2224 locinput += PL_utf8skip[nextchr];
2225 nextchr = UCHARAT(locinput);
2229 nextchr = UCHARAT(locinput);
2230 if (!reginclass(scan, (U8*)locinput, do_utf8))
2232 if (!nextchr && locinput >= PL_regeol)
2234 nextchr = UCHARAT(++locinput);
2238 PL_reg_flags |= RF_tainted;
2244 LOAD_UTF8_CHARCLASS(alnum,"a");
2245 if (!(OP(scan) == ALNUM
2246 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2247 : isALNUM_LC_utf8((U8*)locinput)))
2251 locinput += PL_utf8skip[nextchr];
2252 nextchr = UCHARAT(locinput);
2255 if (!(OP(scan) == ALNUM
2256 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2258 nextchr = UCHARAT(++locinput);
2261 PL_reg_flags |= RF_tainted;
2264 if (!nextchr && locinput >= PL_regeol)
2267 LOAD_UTF8_CHARCLASS(alnum,"a");
2268 if (OP(scan) == NALNUM
2269 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2270 : isALNUM_LC_utf8((U8*)locinput))
2274 locinput += PL_utf8skip[nextchr];
2275 nextchr = UCHARAT(locinput);
2278 if (OP(scan) == NALNUM
2279 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2281 nextchr = UCHARAT(++locinput);
2285 PL_reg_flags |= RF_tainted;
2289 /* was last char in word? */
2291 if (locinput == PL_bostr)
2294 U8 *r = reghop((U8*)locinput, -1);
2296 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2298 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2299 ln = isALNUM_uni(ln);
2300 LOAD_UTF8_CHARCLASS(alnum,"a");
2301 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2304 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2305 n = isALNUM_LC_utf8((U8*)locinput);
2309 ln = (locinput != PL_bostr) ?
2310 UCHARAT(locinput - 1) : '\n';
2311 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2313 n = isALNUM(nextchr);
2316 ln = isALNUM_LC(ln);
2317 n = isALNUM_LC(nextchr);
2320 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2321 OP(scan) == BOUNDL))
2325 PL_reg_flags |= RF_tainted;
2331 if (UTF8_IS_CONTINUED(nextchr)) {
2332 LOAD_UTF8_CHARCLASS(space," ");
2333 if (!(OP(scan) == SPACE
2334 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2335 : isSPACE_LC_utf8((U8*)locinput)))
2339 locinput += PL_utf8skip[nextchr];
2340 nextchr = UCHARAT(locinput);
2343 if (!(OP(scan) == SPACE
2344 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2346 nextchr = UCHARAT(++locinput);
2349 if (!(OP(scan) == SPACE
2350 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2352 nextchr = UCHARAT(++locinput);
2356 PL_reg_flags |= RF_tainted;
2359 if (!nextchr && locinput >= PL_regeol)
2362 LOAD_UTF8_CHARCLASS(space," ");
2363 if (OP(scan) == NSPACE
2364 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2365 : isSPACE_LC_utf8((U8*)locinput))
2369 locinput += PL_utf8skip[nextchr];
2370 nextchr = UCHARAT(locinput);
2373 if (OP(scan) == NSPACE
2374 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2376 nextchr = UCHARAT(++locinput);
2379 PL_reg_flags |= RF_tainted;
2385 LOAD_UTF8_CHARCLASS(digit,"0");
2386 if (!(OP(scan) == DIGIT
2387 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2388 : isDIGIT_LC_utf8((U8*)locinput)))
2392 locinput += PL_utf8skip[nextchr];
2393 nextchr = UCHARAT(locinput);
2396 if (!(OP(scan) == DIGIT
2397 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2399 nextchr = UCHARAT(++locinput);
2402 PL_reg_flags |= RF_tainted;
2405 if (!nextchr && locinput >= PL_regeol)
2408 LOAD_UTF8_CHARCLASS(digit,"0");
2409 if (OP(scan) == NDIGIT
2410 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2411 : isDIGIT_LC_utf8((U8*)locinput))
2415 locinput += PL_utf8skip[nextchr];
2416 nextchr = UCHARAT(locinput);
2419 if (OP(scan) == NDIGIT
2420 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2422 nextchr = UCHARAT(++locinput);
2425 LOAD_UTF8_CHARCLASS(mark,"~");
2426 if (locinput >= PL_regeol ||
2427 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2429 locinput += PL_utf8skip[nextchr];
2430 while (locinput < PL_regeol &&
2431 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2432 locinput += UTF8SKIP(locinput);
2433 if (locinput > PL_regeol)
2435 nextchr = UCHARAT(locinput);
2438 PL_reg_flags |= RF_tainted;
2442 n = ARG(scan); /* which paren pair */
2443 ln = PL_regstartp[n];
2444 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2445 if (*PL_reglastparen < n || ln == -1)
2446 sayNO; /* Do not match unless seen CLOSEn. */
2447 if (ln == PL_regendp[n])
2451 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2453 char *e = PL_bostr + PL_regendp[n];
2455 * Note that we can't do the "other character" lookup trick as
2456 * in the 8-bit case (no pun intended) because in Unicode we
2457 * have to map both upper and title case to lower case.
2459 if (OP(scan) == REFF) {
2463 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2473 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2480 nextchr = UCHARAT(locinput);
2484 /* Inline the first character, for speed. */
2485 if (UCHARAT(s) != nextchr &&
2487 (UCHARAT(s) != ((OP(scan) == REFF
2488 ? PL_fold : PL_fold_locale)[nextchr]))))
2490 ln = PL_regendp[n] - ln;
2491 if (locinput + ln > PL_regeol)
2493 if (ln > 1 && (OP(scan) == REF
2494 ? memNE(s, locinput, ln)
2496 ? ibcmp(s, locinput, ln)
2497 : ibcmp_locale(s, locinput, ln))))
2500 nextchr = UCHARAT(locinput);
2511 OP_4tree *oop = PL_op;
2512 COP *ocurcop = PL_curcop;
2513 SV **ocurpad = PL_curpad;
2517 PL_op = (OP_4tree*)PL_regdata->data[n];
2518 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2519 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2520 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2522 CALLRUNOPS(aTHX); /* Scalar context. */
2528 PL_curpad = ocurpad;
2529 PL_curcop = ocurcop;
2531 if (logical == 2) { /* Postponed subexpression. */
2533 MAGIC *mg = Null(MAGIC*);
2535 CHECKPOINT cp, lastcp;
2537 if(SvROK(ret) || SvRMAGICAL(ret)) {
2538 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2541 mg = mg_find(sv, PERL_MAGIC_qr);
2544 re = (regexp *)mg->mg_obj;
2545 (void)ReREFCNT_inc(re);
2549 char *t = SvPV(ret, len);
2551 char *oprecomp = PL_regprecomp;
2552 I32 osize = PL_regsize;
2553 I32 onpar = PL_regnpar;
2556 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2558 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2559 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2561 PL_regprecomp = oprecomp;
2566 PerlIO_printf(Perl_debug_log,
2567 "Entering embedded `%s%.60s%s%s'\n",
2571 (strlen(re->precomp) > 60 ? "..." : ""))
2574 state.prev = PL_reg_call_cc;
2575 state.cc = PL_regcc;
2576 state.re = PL_reg_re;
2580 cp = regcppush(0); /* Save *all* the positions. */
2583 state.ss = PL_savestack_ix;
2584 *PL_reglastparen = 0;
2585 *PL_reglastcloseparen = 0;
2586 PL_reg_call_cc = &state;
2587 PL_reginput = locinput;
2589 /* XXXX This is too dramatic a measure... */
2592 if (regmatch(re->program + 1)) {
2593 /* Even though we succeeded, we need to restore
2594 global variables, since we may be wrapped inside
2595 SUSPEND, thus the match may be not finished yet. */
2597 /* XXXX Do this only if SUSPENDed? */
2598 PL_reg_call_cc = state.prev;
2599 PL_regcc = state.cc;
2600 PL_reg_re = state.re;
2601 cache_re(PL_reg_re);
2603 /* XXXX This is too dramatic a measure... */
2606 /* These are needed even if not SUSPEND. */
2612 REGCP_UNWIND(lastcp);
2614 PL_reg_call_cc = state.prev;
2615 PL_regcc = state.cc;
2616 PL_reg_re = state.re;
2617 cache_re(PL_reg_re);
2619 /* XXXX This is too dramatic a measure... */
2629 sv_setsv(save_scalar(PL_replgv), ret);
2633 n = ARG(scan); /* which paren pair */
2634 PL_reg_start_tmp[n] = locinput;
2639 n = ARG(scan); /* which paren pair */
2640 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2641 PL_regendp[n] = locinput - PL_bostr;
2642 if (n > *PL_reglastparen)
2643 *PL_reglastparen = n;
2644 *PL_reglastcloseparen = n;
2647 n = ARG(scan); /* which paren pair */
2648 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2651 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2653 next = NEXTOPER(NEXTOPER(scan));
2655 next = scan + ARG(scan);
2656 if (OP(next) == IFTHEN) /* Fake one. */
2657 next = NEXTOPER(NEXTOPER(next));
2661 logical = scan->flags;
2663 /*******************************************************************
2664 PL_regcc contains infoblock about the innermost (...)* loop, and
2665 a pointer to the next outer infoblock.
2667 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2669 1) After matching X, regnode for CURLYX is processed;
2671 2) This regnode creates infoblock on the stack, and calls
2672 regmatch() recursively with the starting point at WHILEM node;
2674 3) Each hit of WHILEM node tries to match A and Z (in the order
2675 depending on the current iteration, min/max of {min,max} and
2676 greediness). The information about where are nodes for "A"
2677 and "Z" is read from the infoblock, as is info on how many times "A"
2678 was already matched, and greediness.
2680 4) After A matches, the same WHILEM node is hit again.
2682 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2683 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2684 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2685 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2686 of the external loop.
2688 Currently present infoblocks form a tree with a stem formed by PL_curcc
2689 and whatever it mentions via ->next, and additional attached trees
2690 corresponding to temporarily unset infoblocks as in "5" above.
2692 In the following picture infoblocks for outer loop of
2693 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2694 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2695 infoblocks are drawn below the "reset" infoblock.
2697 In fact in the picture below we do not show failed matches for Z and T
2698 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2699 more obvious *why* one needs to *temporary* unset infoblocks.]
2701 Matched REx position InfoBlocks Comment
2705 Y A)*?Z)*?T x <- O <- I
2706 YA )*?Z)*?T x <- O <- I
2707 YA A)*?Z)*?T x <- O <- I
2708 YAA )*?Z)*?T x <- O <- I
2709 YAA Z)*?T x <- O # Temporary unset I
2712 YAAZ Y(A)*?Z)*?T x <- O
2715 YAAZY (A)*?Z)*?T x <- O
2718 YAAZY A)*?Z)*?T x <- O <- I
2721 YAAZYA )*?Z)*?T x <- O <- I
2724 YAAZYA Z)*?T x <- O # Temporary unset I
2730 YAAZYAZ T x # Temporary unset O
2737 *******************************************************************/
2740 CHECKPOINT cp = PL_savestack_ix;
2741 /* No need to save/restore up to this paren */
2742 I32 parenfloor = scan->flags;
2744 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2746 cc.oldcc = PL_regcc;
2748 /* XXXX Probably it is better to teach regpush to support
2749 parenfloor > PL_regsize... */
2750 if (parenfloor > *PL_reglastparen)
2751 parenfloor = *PL_reglastparen; /* Pessimization... */
2752 cc.parenfloor = parenfloor;
2754 cc.min = ARG1(scan);
2755 cc.max = ARG2(scan);
2756 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2760 PL_reginput = locinput;
2761 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2763 PL_regcc = cc.oldcc;
2769 * This is really hard to understand, because after we match
2770 * what we're trying to match, we must make sure the rest of
2771 * the REx is going to match for sure, and to do that we have
2772 * to go back UP the parse tree by recursing ever deeper. And
2773 * if it fails, we have to reset our parent's current state
2774 * that we can try again after backing off.
2777 CHECKPOINT cp, lastcp;
2778 CURCUR* cc = PL_regcc;
2779 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2781 n = cc->cur + 1; /* how many we know we matched */
2782 PL_reginput = locinput;
2785 PerlIO_printf(Perl_debug_log,
2786 "%*s %ld out of %ld..%ld cc=%lx\n",
2787 REPORT_CODE_OFF+PL_regindent*2, "",
2788 (long)n, (long)cc->min,
2789 (long)cc->max, (long)cc)
2792 /* If degenerate scan matches "", assume scan done. */
2794 if (locinput == cc->lastloc && n >= cc->min) {
2795 PL_regcc = cc->oldcc;
2799 PerlIO_printf(Perl_debug_log,
2800 "%*s empty match detected, try continuation...\n",
2801 REPORT_CODE_OFF+PL_regindent*2, "")
2803 if (regmatch(cc->next))
2811 /* First just match a string of min scans. */
2815 cc->lastloc = locinput;
2816 if (regmatch(cc->scan))
2819 cc->lastloc = lastloc;
2824 /* Check whether we already were at this position.
2825 Postpone detection until we know the match is not
2826 *that* much linear. */
2827 if (!PL_reg_maxiter) {
2828 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2829 PL_reg_leftiter = PL_reg_maxiter;
2831 if (PL_reg_leftiter-- == 0) {
2832 I32 size = (PL_reg_maxiter + 7)/8;
2833 if (PL_reg_poscache) {
2834 if (PL_reg_poscache_size < size) {
2835 Renew(PL_reg_poscache, size, char);
2836 PL_reg_poscache_size = size;
2838 Zero(PL_reg_poscache, size, char);
2841 PL_reg_poscache_size = size;
2842 Newz(29, PL_reg_poscache, size, char);
2845 PerlIO_printf(Perl_debug_log,
2846 "%sDetected a super-linear match, switching on caching%s...\n",
2847 PL_colors[4], PL_colors[5])
2850 if (PL_reg_leftiter < 0) {
2851 I32 o = locinput - PL_bostr, b;
2853 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2856 if (PL_reg_poscache[o] & (1<<b)) {
2858 PerlIO_printf(Perl_debug_log,
2859 "%*s already tried at this position...\n",
2860 REPORT_CODE_OFF+PL_regindent*2, "")
2864 PL_reg_poscache[o] |= (1<<b);
2868 /* Prefer next over scan for minimal matching. */
2871 PL_regcc = cc->oldcc;
2874 cp = regcppush(cc->parenfloor);
2876 if (regmatch(cc->next)) {
2878 sayYES; /* All done. */
2880 REGCP_UNWIND(lastcp);
2886 if (n >= cc->max) { /* Maximum greed exceeded? */
2887 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2888 && !(PL_reg_flags & RF_warned)) {
2889 PL_reg_flags |= RF_warned;
2890 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2891 "Complex regular subexpression recursion",
2898 PerlIO_printf(Perl_debug_log,
2899 "%*s trying longer...\n",
2900 REPORT_CODE_OFF+PL_regindent*2, "")
2902 /* Try scanning more and see if it helps. */
2903 PL_reginput = locinput;
2905 cc->lastloc = locinput;
2906 cp = regcppush(cc->parenfloor);
2908 if (regmatch(cc->scan)) {
2912 REGCP_UNWIND(lastcp);
2915 cc->lastloc = lastloc;
2919 /* Prefer scan over next for maximal matching. */
2921 if (n < cc->max) { /* More greed allowed? */
2922 cp = regcppush(cc->parenfloor);
2924 cc->lastloc = locinput;
2926 if (regmatch(cc->scan)) {
2930 REGCP_UNWIND(lastcp);
2931 regcppop(); /* Restore some previous $<digit>s? */
2932 PL_reginput = locinput;
2934 PerlIO_printf(Perl_debug_log,
2935 "%*s failed, try continuation...\n",
2936 REPORT_CODE_OFF+PL_regindent*2, "")
2939 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2940 && !(PL_reg_flags & RF_warned)) {
2941 PL_reg_flags |= RF_warned;
2942 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2943 "Complex regular subexpression recursion",
2947 /* Failed deeper matches of scan, so see if this one works. */
2948 PL_regcc = cc->oldcc;
2951 if (regmatch(cc->next))
2957 cc->lastloc = lastloc;
2962 next = scan + ARG(scan);
2965 inner = NEXTOPER(NEXTOPER(scan));
2968 inner = NEXTOPER(scan);
2972 if (OP(next) != c1) /* No choice. */
2973 next = inner; /* Avoid recursion. */
2975 I32 lastparen = *PL_reglastparen;
2977 re_unwind_branch_t *uw;
2979 /* Put unwinding data on stack */
2980 unwind1 = SSNEWt(1,re_unwind_branch_t);
2981 uw = SSPTRt(unwind1,re_unwind_branch_t);
2984 uw->type = ((c1 == BRANCH)
2986 : RE_UNWIND_BRANCHJ);
2987 uw->lastparen = lastparen;
2989 uw->locinput = locinput;
2990 uw->nextchr = nextchr;
2992 uw->regindent = ++PL_regindent;
2995 REGCP_SET(uw->lastcp);
2997 /* Now go into the first branch */
3010 /* We suppose that the next guy does not need
3011 backtracking: in particular, it is of constant length,
3012 and has no parenths to influence future backrefs. */
3013 ln = ARG1(scan); /* min to match */
3014 n = ARG2(scan); /* max to match */
3015 paren = scan->flags;
3017 if (paren > PL_regsize)
3019 if (paren > *PL_reglastparen)
3020 *PL_reglastparen = paren;
3022 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3024 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3025 PL_reginput = locinput;
3028 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3030 /* if we matched something zero-length we don't need to
3031 backtrack - capturing parens are already defined, so
3032 the caveat in the maximal case doesn't apply
3034 XXXX if ln == 0, we can redo this check first time
3035 through the following loop
3038 n = ln; /* don't backtrack */
3039 locinput = PL_reginput;
3040 if (PL_regkind[(U8)OP(next)] == EXACT) {
3041 c1 = (U8)*STRING(next);
3042 if (OP(next) == EXACTF)
3044 else if (OP(next) == EXACTFL)
3045 c2 = PL_fold_locale[c1];
3052 /* This may be improved if l == 0. */
3053 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3054 /* If it could work, try it. */
3056 UCHARAT(PL_reginput) == c1 ||
3057 UCHARAT(PL_reginput) == c2)
3061 PL_regstartp[paren] =
3062 HOPc(PL_reginput, -l) - PL_bostr;
3063 PL_regendp[paren] = PL_reginput - PL_bostr;
3066 PL_regendp[paren] = -1;
3070 REGCP_UNWIND(lastcp);
3072 /* Couldn't or didn't -- move forward. */
3073 PL_reginput = locinput;
3074 if (regrepeat_hard(scan, 1, &l)) {
3076 locinput = PL_reginput;
3083 n = regrepeat_hard(scan, n, &l);
3084 /* if we matched something zero-length we don't need to
3085 backtrack, unless the minimum count is zero and we
3086 are capturing the result - in that case the capture
3087 being defined or not may affect later execution
3089 if (n != 0 && l == 0 && !(paren && ln == 0))
3090 ln = n; /* don't backtrack */
3091 locinput = PL_reginput;
3093 PerlIO_printf(Perl_debug_log,
3094 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3095 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3099 if (PL_regkind[(U8)OP(next)] == EXACT) {
3100 c1 = (U8)*STRING(next);
3101 if (OP(next) == EXACTF)
3103 else if (OP(next) == EXACTFL)
3104 c2 = PL_fold_locale[c1];
3113 /* If it could work, try it. */
3115 UCHARAT(PL_reginput) == c1 ||
3116 UCHARAT(PL_reginput) == c2)
3119 PerlIO_printf(Perl_debug_log,
3120 "%*s trying tail with n=%"IVdf"...\n",
3121 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3125 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3126 PL_regendp[paren] = PL_reginput - PL_bostr;
3129 PL_regendp[paren] = -1;
3133 REGCP_UNWIND(lastcp);
3135 /* Couldn't or didn't -- back up. */
3137 locinput = HOPc(locinput, -l);
3138 PL_reginput = locinput;
3145 paren = scan->flags; /* Which paren to set */
3146 if (paren > PL_regsize)
3148 if (paren > *PL_reglastparen)
3149 *PL_reglastparen = paren;
3150 ln = ARG1(scan); /* min to match */
3151 n = ARG2(scan); /* max to match */
3152 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3156 ln = ARG1(scan); /* min to match */
3157 n = ARG2(scan); /* max to match */
3158 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3163 scan = NEXTOPER(scan);
3169 scan = NEXTOPER(scan);
3173 * Lookahead to avoid useless match attempts
3174 * when we know what character comes next.
3176 if (PL_regkind[(U8)OP(next)] == EXACT) {
3177 U8 *s = (U8*)STRING(next);
3180 if (OP(next) == EXACTF)
3182 else if (OP(next) == EXACTFL)
3183 c2 = PL_fold_locale[c1];
3186 if (OP(next) == EXACTF) {
3187 c1 = to_utf8_lower(s);
3188 c2 = to_utf8_upper(s);
3191 c2 = c1 = utf8_to_uvchr(s, NULL);
3197 PL_reginput = locinput;
3201 if (ln && regrepeat(scan, ln) < ln)
3203 locinput = PL_reginput;
3206 char *e; /* Should not check after this */
3207 char *old = locinput;
3209 if (n == REG_INFTY) {
3212 while (UTF8_IS_CONTINUATION(*(U8*)e))
3218 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3222 e = locinput + n - ln;
3228 /* Find place 'next' could work */
3231 while (locinput <= e && *locinput != c1)
3234 while (locinput <= e
3239 count = locinput - old;
3246 utf8_to_uvchr((U8*)locinput, &len) != c1;
3251 for (count = 0; locinput <= e; count++) {
3252 UV c = utf8_to_uvchr((U8*)locinput, &len);
3253 if (c == c1 || c == c2)
3261 /* PL_reginput == old now */
3262 if (locinput != old) {
3263 ln = 1; /* Did some */
3264 if (regrepeat(scan, count) < count)
3267 /* PL_reginput == locinput now */
3268 TRYPAREN(paren, ln, locinput);
3269 PL_reginput = locinput; /* Could be reset... */
3270 REGCP_UNWIND(lastcp);
3271 /* Couldn't or didn't -- move forward. */
3274 locinput += UTF8SKIP(locinput);
3280 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3284 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3286 c = UCHARAT(PL_reginput);
3287 /* If it could work, try it. */
3288 if (c == c1 || c == c2)
3290 TRYPAREN(paren, n, PL_reginput);
3291 REGCP_UNWIND(lastcp);
3294 /* If it could work, try it. */
3295 else if (c1 == -1000)
3297 TRYPAREN(paren, n, PL_reginput);
3298 REGCP_UNWIND(lastcp);
3300 /* Couldn't or didn't -- move forward. */
3301 PL_reginput = locinput;
3302 if (regrepeat(scan, 1)) {
3304 locinput = PL_reginput;
3312 n = regrepeat(scan, n);
3313 locinput = PL_reginput;
3314 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3315 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3316 ln = n; /* why back off? */
3317 /* ...because $ and \Z can match before *and* after
3318 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3319 We should back off by one in this case. */
3320 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3329 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3331 c = UCHARAT(PL_reginput);
3333 /* If it could work, try it. */
3334 if (c1 == -1000 || c == c1 || c == c2)
3336 TRYPAREN(paren, n, PL_reginput);
3337 REGCP_UNWIND(lastcp);
3339 /* Couldn't or didn't -- back up. */
3341 PL_reginput = locinput = HOPc(locinput, -1);
3349 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3351 c = UCHARAT(PL_reginput);
3353 /* If it could work, try it. */
3354 if (c1 == -1000 || c == c1 || c == c2)
3356 TRYPAREN(paren, n, PL_reginput);
3357 REGCP_UNWIND(lastcp);
3359 /* Couldn't or didn't -- back up. */
3361 PL_reginput = locinput = HOPc(locinput, -1);
3368 if (PL_reg_call_cc) {
3369 re_cc_state *cur_call_cc = PL_reg_call_cc;
3370 CURCUR *cctmp = PL_regcc;
3371 regexp *re = PL_reg_re;
3372 CHECKPOINT cp, lastcp;
3374 cp = regcppush(0); /* Save *all* the positions. */
3376 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3378 PL_reginput = locinput; /* Make position available to
3380 cache_re(PL_reg_call_cc->re);
3381 PL_regcc = PL_reg_call_cc->cc;
3382 PL_reg_call_cc = PL_reg_call_cc->prev;
3383 if (regmatch(cur_call_cc->node)) {
3384 PL_reg_call_cc = cur_call_cc;
3388 REGCP_UNWIND(lastcp);
3390 PL_reg_call_cc = cur_call_cc;
3396 PerlIO_printf(Perl_debug_log,
3397 "%*s continuation failed...\n",
3398 REPORT_CODE_OFF+PL_regindent*2, "")
3402 if (locinput < PL_regtill) {
3403 DEBUG_r(PerlIO_printf(Perl_debug_log,
3404 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3406 (long)(locinput - PL_reg_starttry),
3407 (long)(PL_regtill - PL_reg_starttry),
3409 sayNO_FINAL; /* Cannot match: too short. */
3411 PL_reginput = locinput; /* put where regtry can find it */
3412 sayYES_FINAL; /* Success! */
3414 PL_reginput = locinput; /* put where regtry can find it */
3415 sayYES_LOUD; /* Success! */
3418 PL_reginput = locinput;
3423 s = HOPBACKc(locinput, scan->flags);
3429 PL_reginput = locinput;
3434 s = HOPBACKc(locinput, scan->flags);
3440 PL_reginput = locinput;
3443 inner = NEXTOPER(NEXTOPER(scan));
3444 if (regmatch(inner) != n) {
3459 if (OP(scan) == SUSPEND) {
3460 locinput = PL_reginput;
3461 nextchr = UCHARAT(locinput);
3466 next = scan + ARG(scan);
3471 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3472 PTR2UV(scan), OP(scan));
3473 Perl_croak(aTHX_ "regexp memory corruption");
3480 * We get here only if there's trouble -- normally "case END" is
3481 * the terminating point.
3483 Perl_croak(aTHX_ "corrupted regexp pointers");
3489 PerlIO_printf(Perl_debug_log,
3490 "%*s %scould match...%s\n",
3491 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3495 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3496 PL_colors[4],PL_colors[5]));
3502 #if 0 /* Breaks $^R */
3510 PerlIO_printf(Perl_debug_log,
3511 "%*s %sfailed...%s\n",
3512 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3518 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3521 case RE_UNWIND_BRANCH:
3522 case RE_UNWIND_BRANCHJ:
3524 re_unwind_branch_t *uwb = &(uw->branch);
3525 I32 lastparen = uwb->lastparen;
3527 REGCP_UNWIND(uwb->lastcp);
3528 for (n = *PL_reglastparen; n > lastparen; n--)
3530 *PL_reglastparen = n;
3531 scan = next = uwb->next;
3533 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3534 ? BRANCH : BRANCHJ) ) { /* Failure */
3541 /* Have more choice yet. Reuse the same uwb. */
3543 if ((n = (uwb->type == RE_UNWIND_BRANCH
3544 ? NEXT_OFF(next) : ARG(next))))
3547 next = NULL; /* XXXX Needn't unwinding in this case... */
3549 next = NEXTOPER(scan);
3550 if (uwb->type == RE_UNWIND_BRANCHJ)
3551 next = NEXTOPER(next);
3552 locinput = uwb->locinput;
3553 nextchr = uwb->nextchr;
3555 PL_regindent = uwb->regindent;
3562 Perl_croak(aTHX_ "regexp unwind memory corruption");
3573 - regrepeat - repeatedly match something simple, report how many
3576 * [This routine now assumes that it will only match on things of length 1.
3577 * That was true before, but now we assume scan - reginput is the count,
3578 * rather than incrementing count on every character. [Er, except utf8.]]
3581 S_regrepeat(pTHX_ regnode *p, I32 max)
3583 register char *scan;
3585 register char *loceol = PL_regeol;
3586 register I32 hardcount = 0;
3587 register bool do_utf8 = PL_reg_match_utf8;
3590 if (max != REG_INFTY && max < loceol - scan)
3591 loceol = scan + max;
3596 while (scan < loceol && hardcount < max && *scan != '\n') {
3597 scan += UTF8SKIP(scan);
3601 while (scan < loceol && *scan != '\n')
3611 case EXACT: /* length of string is 1 */
3613 while (scan < loceol && UCHARAT(scan) == c)
3616 case EXACTF: /* length of string is 1 */
3618 while (scan < loceol &&
3619 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3622 case EXACTFL: /* length of string is 1 */
3623 PL_reg_flags |= RF_tainted;
3625 while (scan < loceol &&
3626 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3632 while (hardcount < max && scan < loceol &&
3633 reginclass(p, (U8*)scan, do_utf8)) {
3634 scan += UTF8SKIP(scan);
3638 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3645 LOAD_UTF8_CHARCLASS(alnum,"a");
3646 while (hardcount < max && scan < loceol &&
3647 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3648 scan += UTF8SKIP(scan);
3652 while (scan < loceol && isALNUM(*scan))
3657 PL_reg_flags |= RF_tainted;
3660 while (hardcount < max && scan < loceol &&
3661 isALNUM_LC_utf8((U8*)scan)) {
3662 scan += UTF8SKIP(scan);
3666 while (scan < loceol && isALNUM_LC(*scan))
3673 LOAD_UTF8_CHARCLASS(alnum,"a");
3674 while (hardcount < max && scan < loceol &&
3675 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3676 scan += UTF8SKIP(scan);
3680 while (scan < loceol && !isALNUM(*scan))
3685 PL_reg_flags |= RF_tainted;
3688 while (hardcount < max && scan < loceol &&
3689 !isALNUM_LC_utf8((U8*)scan)) {
3690 scan += UTF8SKIP(scan);
3694 while (scan < loceol && !isALNUM_LC(*scan))
3701 LOAD_UTF8_CHARCLASS(space," ");
3702 while (hardcount < max && scan < loceol &&
3704 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3705 scan += UTF8SKIP(scan);
3709 while (scan < loceol && isSPACE(*scan))
3714 PL_reg_flags |= RF_tainted;
3717 while (hardcount < max && scan < loceol &&
3718 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3719 scan += UTF8SKIP(scan);
3723 while (scan < loceol && isSPACE_LC(*scan))
3730 LOAD_UTF8_CHARCLASS(space," ");
3731 while (hardcount < max && scan < loceol &&
3733 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3734 scan += UTF8SKIP(scan);
3738 while (scan < loceol && !isSPACE(*scan))
3743 PL_reg_flags |= RF_tainted;
3746 while (hardcount < max && scan < loceol &&
3747 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3748 scan += UTF8SKIP(scan);
3752 while (scan < loceol && !isSPACE_LC(*scan))
3759 LOAD_UTF8_CHARCLASS(digit,"0");
3760 while (hardcount < max && scan < loceol &&
3761 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3762 scan += UTF8SKIP(scan);
3766 while (scan < loceol && isDIGIT(*scan))
3773 LOAD_UTF8_CHARCLASS(digit,"0");
3774 while (hardcount < max && scan < loceol &&
3775 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3776 scan += UTF8SKIP(scan);
3780 while (scan < loceol && !isDIGIT(*scan))
3784 default: /* Called on something of 0 width. */
3785 break; /* So match right here or not at all. */
3791 c = scan - PL_reginput;
3796 SV *prop = sv_newmortal();
3799 PerlIO_printf(Perl_debug_log,
3800 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3801 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3808 - regrepeat_hard - repeatedly match something, report total lenth and length
3810 * The repeater is supposed to have constant length.
3814 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3816 register char *scan = Nullch;
3817 register char *start;
3818 register char *loceol = PL_regeol;
3820 I32 count = 0, res = 1;
3825 start = PL_reginput;
3826 if (PL_reg_match_utf8) {
3827 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3830 while (start < PL_reginput) {
3832 start += UTF8SKIP(start);
3843 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3845 *lp = l = PL_reginput - start;
3846 if (max != REG_INFTY && l*max < loceol - scan)
3847 loceol = scan + l*max;
3860 - regclass_swash - prepare the utf8 swash
3864 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3869 if (PL_regdata && PL_regdata->count) {
3872 if (PL_regdata->what[n] == 's') {
3873 SV *rv = (SV*)PL_regdata->data[n];
3874 AV *av = (AV*)SvRV((SV*)rv);
3877 si = *av_fetch(av, 0, FALSE);
3878 a = av_fetch(av, 1, FALSE);
3882 else if (si && doinit) {
3883 sw = swash_init("utf8", "", si, 1, 0);
3884 (void)av_store(av, 1, sw);
3896 - reginclass - determine if a character falls into a character class
3900 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3902 char flags = ANYOF_FLAGS(n);
3907 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
3909 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3910 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3911 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3914 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
3917 SV *sw = regclass_swash(n, TRUE, 0);
3920 if (swash_fetch(sw, p, do_utf8))
3922 else if (flags & ANYOF_FOLD) {
3923 U8 tmpbuf[UTF8_MAXLEN+1];
3925 if (flags & ANYOF_LOCALE) {
3926 PL_reg_flags |= RF_tainted;
3927 uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3930 uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
3931 if (swash_fetch(sw, tmpbuf, do_utf8))
3937 if (!match && c < 256) {
3938 if (ANYOF_BITMAP_TEST(n, c))
3940 else if (flags & ANYOF_FOLD) {
3943 if (flags & ANYOF_LOCALE) {
3944 PL_reg_flags |= RF_tainted;
3945 f = PL_fold_locale[c];
3949 if (f != c && ANYOF_BITMAP_TEST(n, f))
3953 if (!match && (flags & ANYOF_CLASS)) {
3954 PL_reg_flags |= RF_tainted;
3956 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3957 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3958 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3959 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3960 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3961 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3962 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3963 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3964 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3965 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3966 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3967 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3968 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3969 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3970 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3971 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3972 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3973 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3974 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3975 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3976 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3977 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3978 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3979 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3980 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3981 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3982 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3983 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3984 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3985 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3986 ) /* How's that for a conditional? */
3993 return (flags & ANYOF_INVERT) ? !match : match;
3997 S_reghop(pTHX_ U8 *s, I32 off)
3999 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4003 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4006 while (off-- && s < lim) {
4007 /* XXX could check well-formedness here */
4015 if (UTF8_IS_CONTINUED(*s)) {
4016 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4019 /* XXX could check well-formedness here */
4027 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4029 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4033 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4036 while (off-- && s < lim) {
4037 /* XXX could check well-formedness here */
4047 if (UTF8_IS_CONTINUED(*s)) {
4048 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4051 /* XXX could check well-formedness here */
4063 restore_pos(pTHX_ void *arg)
4065 if (PL_reg_eval_set) {
4066 if (PL_reg_oldsaved) {
4067 PL_reg_re->subbeg = PL_reg_oldsaved;
4068 PL_reg_re->sublen = PL_reg_oldsavedlen;
4069 RX_MATCH_COPIED_on(PL_reg_re);
4071 PL_reg_magic->mg_len = PL_reg_oldpos;
4072 PL_reg_eval_set = 0;
4073 PL_curpm = PL_reg_oldcurpm;