5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
120 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
121 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
122 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
126 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
128 static void restore_pos(pTHXo_ void *arg);
131 S_regcppush(pTHX_ I32 parenfloor)
133 int retval = PL_savestack_ix;
134 #define REGCP_PAREN_ELEMS 4
135 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
138 #define REGCP_OTHER_ELEMS 5
139 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
140 for (p = PL_regsize; p > parenfloor; p--) {
141 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
142 SSPUSHINT(PL_regendp[p]);
143 SSPUSHINT(PL_regstartp[p]);
144 SSPUSHPTR(PL_reg_start_tmp[p]);
147 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
148 SSPUSHINT(PL_regsize);
149 SSPUSHINT(*PL_reglastparen);
150 SSPUSHPTR(PL_reginput);
151 #define REGCP_FRAME_ELEMS 2
152 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
153 * are needed for the regexp context stack bookkeeping. */
154 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
155 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
160 /* These are needed since we do not localize EVAL nodes: */
161 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
162 " Setting an EVAL scope, savestack=%"IVdf"\n", \
163 (IV)PL_savestack_ix)); cp = PL_savestack_ix
165 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
166 PerlIO_printf(Perl_debug_log, \
167 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
168 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
178 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
180 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
181 i = SSPOPINT; /* Parentheses elements to pop. */
182 input = (char *) SSPOPPTR;
183 *PL_reglastparen = SSPOPINT;
184 PL_regsize = SSPOPINT;
186 /* Now restore the parentheses context. */
187 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
188 i > 0; i -= REGCP_PAREN_ELEMS) {
189 paren = (U32)SSPOPINT;
190 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
191 PL_regstartp[paren] = SSPOPINT;
193 if (paren <= *PL_reglastparen)
194 PL_regendp[paren] = tmps;
196 PerlIO_printf(Perl_debug_log,
197 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
198 (UV)paren, (IV)PL_regstartp[paren],
199 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
200 (IV)PL_regendp[paren],
201 (paren > *PL_reglastparen ? "(no)" : ""));
205 if (*PL_reglastparen + 1 <= PL_regnpar) {
206 PerlIO_printf(Perl_debug_log,
207 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
208 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
212 /* It would seem that the similar code in regtry()
213 * already takes care of this, and in fact it is in
214 * a better location to since this code can #if 0-ed out
215 * but the code in regtry() is needed or otherwise tests
216 * requiring null fields (pat.t#187 and split.t#{13,14}
217 * (as of patchlevel 7877) will fail. Then again,
218 * this code seems to be necessary or otherwise
219 * building DynaLoader will fail:
220 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
222 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
223 if (paren > PL_regsize)
224 PL_regstartp[paren] = -1;
225 PL_regendp[paren] = -1;
232 S_regcp_set_to(pTHX_ I32 ss)
234 I32 tmp = PL_savestack_ix;
236 PL_savestack_ix = ss;
238 PL_savestack_ix = tmp;
242 typedef struct re_cc_state
246 struct re_cc_state *prev;
251 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
253 #define TRYPAREN(paren, n, input) { \
256 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
257 PL_regendp[paren] = input - PL_bostr; \
260 PL_regendp[paren] = -1; \
262 if (regmatch(next)) \
265 PL_regendp[paren] = -1; \
270 * pregexec and friends
274 - pregexec - match a regexp against a string
277 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
278 char *strbeg, I32 minend, SV *screamer, U32 nosave)
279 /* strend: pointer to null at end of string */
280 /* strbeg: real beginning of string */
281 /* minend: end of match must be >=minend after stringarg. */
282 /* nosave: For optimizations. */
285 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
286 nosave ? 0 : REXEC_COPY_STR);
290 S_cache_re(pTHX_ regexp *prog)
292 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
294 PL_regprogram = prog->program;
296 PL_regnpar = prog->nparens;
297 PL_regdata = prog->data;
302 * Need to implement the following flags for reg_anch:
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
315 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
317 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
320 /* XXXX We assume that strpos is strbeg unless sv. */
322 /* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
329 /* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
332 regtry() should be needed.
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
339 We use a)b)d) and multiline-part of c), and try to find a position in the
340 string which does not contradict any of them.
343 /* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
348 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
351 register I32 start_shift;
352 /* Should be nonnegative! */
353 register I32 end_shift;
360 register char *other_last = Nullch; /* other substr checked before this */
361 char *check_at; /* check substr found at this pos */
363 char *i_strpos = strpos;
366 DEBUG_r( if (!PL_colorset) reginitcolors() );
367 DEBUG_r(PerlIO_printf(Perl_debug_log,
368 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
369 PL_colors[4],PL_colors[5],PL_colors[0],
372 (strlen(prog->precomp) > 60 ? "..." : ""),
374 (int)(strend - strpos > 60 ? 60 : strend - strpos),
375 strpos, PL_colors[1],
376 (strend - strpos > 60 ? "..." : ""))
379 if (prog->reganch & ROPT_UTF8)
380 PL_reg_flags |= RF_utf8;
382 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
383 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
386 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
388 check = prog->check_substr;
389 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
390 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
391 || ( (prog->reganch & ROPT_ANCH_BOL)
392 && !PL_multiline ) ); /* Check after \n? */
395 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
396 | ROPT_IMPLICIT)) /* not a real BOL */
397 /* SvCUR is not set on references: SvRV and SvPVX overlap */
399 && (strpos != strbeg)) {
400 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
403 if (prog->check_offset_min == prog->check_offset_max &&
404 !(prog->reganch & ROPT_SANY_SEEN)) {
405 /* Substring at constant offset from beg-of-str... */
408 s = HOP3c(strpos, prog->check_offset_min, strend);
410 slen = SvCUR(check); /* >= 1 */
412 if ( strend - s > slen || strend - s < slen - 1
413 || (strend - s == slen && strend[-1] != '\n')) {
414 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
417 /* Now should match s[0..slen-2] */
419 if (slen && (*SvPVX(check) != *s
421 && memNE(SvPVX(check), s, slen)))) {
423 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
427 else if (*SvPVX(check) != *s
428 || ((slen = SvCUR(check)) > 1
429 && memNE(SvPVX(check), s, slen)))
431 goto success_at_start;
434 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
436 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
437 end_shift = prog->minlen - start_shift -
438 CHR_SVLEN(check) + (SvTAIL(check) != 0);
440 I32 end = prog->check_offset_max + CHR_SVLEN(check)
441 - (SvTAIL(check) != 0);
442 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
444 if (end_shift < eshift)
448 else { /* Can match at random position */
451 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
452 /* Should be nonnegative! */
453 end_shift = prog->minlen - start_shift -
454 CHR_SVLEN(check) + (SvTAIL(check) != 0);
457 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
459 Perl_croak(aTHX_ "panic: end_shift");
463 /* Find a possible match in the region s..strend by looking for
464 the "check" substring in the region corrected by start/end_shift. */
465 if (flags & REXEC_SCREAM) {
466 I32 p = -1; /* Internal iterator of scream. */
467 I32 *pp = data ? data->scream_pos : &p;
469 if (PL_screamfirst[BmRARE(check)] >= 0
470 || ( BmRARE(check) == '\n'
471 && (BmPREVIOUS(check) == SvCUR(check) - 1)
473 s = screaminstr(sv, check,
474 start_shift + (s - strbeg), end_shift, pp, 0);
478 *data->scream_olds = s;
480 else if (prog->reganch & ROPT_SANY_SEEN)
481 s = fbm_instr((U8*)(s + start_shift),
482 (U8*)(strend - end_shift),
483 check, PL_multiline ? FBMrf_MULTILINE : 0);
485 s = fbm_instr(HOP3(s, start_shift, strend),
486 HOP3(strend, -end_shift, strbeg),
487 check, PL_multiline ? FBMrf_MULTILINE : 0);
489 /* Update the count-of-usability, remove useless subpatterns,
492 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
493 (s ? "Found" : "Did not find"),
494 ((check == prog->anchored_substr) ? "anchored" : "floating"),
496 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
498 PL_colors[1], (SvTAIL(check) ? "$" : ""),
499 (s ? " at offset " : "...\n") ) );
506 /* Finish the diagnostic message */
507 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
509 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
510 Start with the other substr.
511 XXXX no SCREAM optimization yet - and a very coarse implementation
512 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
513 *always* match. Probably should be marked during compile...
514 Probably it is right to do no SCREAM here...
517 if (prog->float_substr && prog->anchored_substr) {
518 /* Take into account the "other" substring. */
519 /* XXXX May be hopelessly wrong for UTF... */
522 if (check == prog->float_substr) {
525 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
528 t = s - prog->check_offset_max;
529 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
530 && (!(prog->reganch & ROPT_UTF8)
531 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
536 t = HOP3c(t, prog->anchored_offset, strend);
537 if (t < other_last) /* These positions already checked */
539 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
542 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
543 /* On end-of-str: see comment below. */
544 s = fbm_instr((unsigned char*)t,
545 HOP3(HOP3(last1, prog->anchored_offset, strend)
546 + SvCUR(prog->anchored_substr),
547 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
548 prog->anchored_substr,
549 PL_multiline ? FBMrf_MULTILINE : 0);
550 DEBUG_r(PerlIO_printf(Perl_debug_log,
551 "%s anchored substr `%s%.*s%s'%s",
552 (s ? "Found" : "Contradicts"),
554 (int)(SvCUR(prog->anchored_substr)
555 - (SvTAIL(prog->anchored_substr)!=0)),
556 SvPVX(prog->anchored_substr),
557 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
559 if (last1 >= last2) {
560 DEBUG_r(PerlIO_printf(Perl_debug_log,
561 ", giving up...\n"));
564 DEBUG_r(PerlIO_printf(Perl_debug_log,
565 ", trying floating at offset %ld...\n",
566 (long)(HOP3c(s1, 1, strend) - i_strpos)));
567 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
568 s = HOP3c(last, 1, strend);
572 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
573 (long)(s - i_strpos)));
574 t = HOP3c(s, -prog->anchored_offset, strbeg);
575 other_last = HOP3c(s, 1, strend);
583 else { /* Take into account the floating substring. */
587 t = HOP3c(s, -start_shift, strbeg);
589 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
590 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
591 last = HOP3c(t, prog->float_max_offset, strend);
592 s = HOP3c(t, prog->float_min_offset, strend);
595 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
596 /* fbm_instr() takes into account exact value of end-of-str
597 if the check is SvTAIL(ed). Since false positives are OK,
598 and end-of-str is not later than strend we are OK. */
599 s = fbm_instr((unsigned char*)s,
600 (unsigned char*)last + SvCUR(prog->float_substr)
601 - (SvTAIL(prog->float_substr)!=0),
602 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
603 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
604 (s ? "Found" : "Contradicts"),
606 (int)(SvCUR(prog->float_substr)
607 - (SvTAIL(prog->float_substr)!=0)),
608 SvPVX(prog->float_substr),
609 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
612 DEBUG_r(PerlIO_printf(Perl_debug_log,
613 ", giving up...\n"));
616 DEBUG_r(PerlIO_printf(Perl_debug_log,
617 ", trying anchored starting at offset %ld...\n",
618 (long)(s1 + 1 - i_strpos)));
620 s = HOP3c(t, 1, strend);
624 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
625 (long)(s - i_strpos)));
626 other_last = s; /* Fix this later. --Hugo */
635 t = s - prog->check_offset_max;
636 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
637 && (!(prog->reganch & ROPT_UTF8)
638 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
640 /* Fixed substring is found far enough so that the match
641 cannot start at strpos. */
643 if (ml_anch && t[-1] != '\n') {
644 /* Eventually fbm_*() should handle this, but often
645 anchored_offset is not 0, so this check will not be wasted. */
646 /* XXXX In the code below we prefer to look for "^" even in
647 presence of anchored substrings. And we search even
648 beyond the found float position. These pessimizations
649 are historical artefacts only. */
651 while (t < strend - prog->minlen) {
653 if (t < check_at - prog->check_offset_min) {
654 if (prog->anchored_substr) {
655 /* Since we moved from the found position,
656 we definitely contradict the found anchored
657 substr. Due to the above check we do not
658 contradict "check" substr.
659 Thus we can arrive here only if check substr
660 is float. Redo checking for "other"=="fixed".
663 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
664 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
665 goto do_other_anchored;
667 /* We don't contradict the found floating substring. */
668 /* XXXX Why not check for STCLASS? */
670 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
671 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
674 /* Position contradicts check-string */
675 /* XXXX probably better to look for check-string
676 than for "\n", so one should lower the limit for t? */
677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
678 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
679 other_last = strpos = s = t + 1;
684 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
685 PL_colors[0],PL_colors[1]));
689 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
690 PL_colors[0],PL_colors[1]));
694 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
697 /* The found string does not prohibit matching at strpos,
698 - no optimization of calling REx engine can be performed,
699 unless it was an MBOL and we are not after MBOL,
700 or a future STCLASS check will fail this. */
702 /* Even in this situation we may use MBOL flag if strpos is offset
703 wrt the start of the string. */
704 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
705 && (strpos != strbeg) && strpos[-1] != '\n'
706 /* May be due to an implicit anchor of m{.*foo} */
707 && !(prog->reganch & ROPT_IMPLICIT))
712 DEBUG_r( if (ml_anch)
713 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
714 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
717 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
718 && prog->check_substr /* Could be deleted already */
719 && --BmUSEFUL(prog->check_substr) < 0
720 && prog->check_substr == prog->float_substr)
722 /* If flags & SOMETHING - do not do it many times on the same match */
723 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
724 SvREFCNT_dec(prog->check_substr);
725 prog->check_substr = Nullsv; /* disable */
726 prog->float_substr = Nullsv; /* clear */
727 check = Nullsv; /* abort */
729 /* XXXX This is a remnant of the old implementation. It
730 looks wasteful, since now INTUIT can use many
732 prog->reganch &= ~RE_USE_INTUIT;
739 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
740 if (prog->regstclass) {
741 /* minlen == 0 is possible if regstclass is \b or \B,
742 and the fixed substr is ''$.
743 Since minlen is already taken into account, s+1 is before strend;
744 accidentally, minlen >= 1 guaranties no false positives at s + 1
745 even for \b or \B. But (minlen? 1 : 0) below assumes that
746 regstclass does not come from lookahead... */
747 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
748 This leaves EXACTF only, which is dealt with in find_byclass(). */
749 U8* str = (U8*)STRING(prog->regstclass);
750 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
751 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
753 char *endpos = (prog->anchored_substr || ml_anch)
754 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
755 : (prog->float_substr
756 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
759 char *startpos = strbeg;
762 if (prog->reganch & ROPT_UTF8) {
763 PL_regdata = prog->data;
766 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
771 if (endpos == strend) {
772 DEBUG_r( PerlIO_printf(Perl_debug_log,
773 "Could not match STCLASS...\n") );
776 DEBUG_r( PerlIO_printf(Perl_debug_log,
777 "This position contradicts STCLASS...\n") );
778 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
780 /* Contradict one of substrings */
781 if (prog->anchored_substr) {
782 if (prog->anchored_substr == check) {
783 DEBUG_r( what = "anchored" );
785 s = HOP3c(t, 1, strend);
786 if (s + start_shift + end_shift > strend) {
787 /* XXXX Should be taken into account earlier? */
788 DEBUG_r( PerlIO_printf(Perl_debug_log,
789 "Could not match STCLASS...\n") );
794 DEBUG_r( PerlIO_printf(Perl_debug_log,
795 "Looking for %s substr starting at offset %ld...\n",
796 what, (long)(s + start_shift - i_strpos)) );
799 /* Have both, check_string is floating */
800 if (t + start_shift >= check_at) /* Contradicts floating=check */
801 goto retry_floating_check;
802 /* Recheck anchored substring, but not floating... */
806 DEBUG_r( PerlIO_printf(Perl_debug_log,
807 "Looking for anchored substr starting at offset %ld...\n",
808 (long)(other_last - i_strpos)) );
809 goto do_other_anchored;
811 /* Another way we could have checked stclass at the
812 current position only: */
817 DEBUG_r( PerlIO_printf(Perl_debug_log,
818 "Looking for /%s^%s/m starting at offset %ld...\n",
819 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
822 if (!prog->float_substr) /* Could have been deleted */
824 /* Check is floating subtring. */
825 retry_floating_check:
826 t = check_at - start_shift;
827 DEBUG_r( what = "floating" );
828 goto hop_and_restart;
831 PerlIO_printf(Perl_debug_log,
832 "By STCLASS: moving %ld --> %ld\n",
833 (long)(t - i_strpos), (long)(s - i_strpos));
835 PerlIO_printf(Perl_debug_log,
836 "Does not contradict STCLASS...\n") );
839 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
840 PL_colors[4], (check ? "Guessed" : "Giving up"),
841 PL_colors[5], (long)(s - i_strpos)) );
844 fail_finish: /* Substring not found */
845 if (prog->check_substr) /* could be removed already */
846 BmUSEFUL(prog->check_substr) += 5; /* hooray */
848 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
849 PL_colors[4],PL_colors[5]));
853 /* We know what class REx starts with. Try to find this position... */
855 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
857 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
863 register I32 tmp = 1; /* Scratch variable? */
864 register bool do_utf8 = DO_UTF8(PL_reg_sv);
866 /* We know what class it must start with. */
870 if (reginclass(c, (U8*)s, do_utf8)) {
871 if (tmp && (norun || regtry(prog, s)))
878 s += do_utf8 ? UTF8SKIP(s) : 1;
885 c1 = to_utf8_lower((U8*)m);
886 c2 = to_utf8_upper((U8*)m);
897 c2 = PL_fold_locale[c1];
902 e = s; /* Due to minlen logic of intuit() */
908 if ( utf8_to_uvchr((U8*)s, &len) == c1
915 UV c = utf8_to_uvchr((U8*)s, &len);
916 if ( (c == c1 || c == c2) && regtry(prog, s) )
925 && (ln == 1 || !(OP(c) == EXACTF
927 : ibcmp_locale(s, m, ln)))
928 && (norun || regtry(prog, s)) )
934 if ( (*(U8*)s == c1 || *(U8*)s == c2)
935 && (ln == 1 || !(OP(c) == EXACTF
937 : ibcmp_locale(s, m, ln)))
938 && (norun || regtry(prog, s)) )
945 PL_reg_flags |= RF_tainted;
952 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
954 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
956 tmp = ((OP(c) == BOUND ?
957 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
958 LOAD_UTF8_CHARCLASS(alnum,"a");
960 if (tmp == !(OP(c) == BOUND ?
961 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
962 isALNUM_LC_utf8((U8*)s)))
965 if ((norun || regtry(prog, s)))
972 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
973 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
976 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
978 if ((norun || regtry(prog, s)))
984 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
988 PL_reg_flags |= RF_tainted;
995 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
997 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
999 tmp = ((OP(c) == NBOUND ?
1000 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1001 LOAD_UTF8_CHARCLASS(alnum,"a");
1002 while (s < strend) {
1003 if (tmp == !(OP(c) == NBOUND ?
1004 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1005 isALNUM_LC_utf8((U8*)s)))
1007 else if ((norun || regtry(prog, s)))
1013 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1014 tmp = ((OP(c) == NBOUND ?
1015 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1016 while (s < strend) {
1018 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1020 else if ((norun || regtry(prog, s)))
1025 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1030 LOAD_UTF8_CHARCLASS(alnum,"a");
1031 while (s < strend) {
1032 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1033 if (tmp && (norun || regtry(prog, s)))
1044 while (s < strend) {
1046 if (tmp && (norun || regtry(prog, s)))
1058 PL_reg_flags |= RF_tainted;
1060 while (s < strend) {
1061 if (isALNUM_LC_utf8((U8*)s)) {
1062 if (tmp && (norun || regtry(prog, s)))
1073 while (s < strend) {
1074 if (isALNUM_LC(*s)) {
1075 if (tmp && (norun || regtry(prog, s)))
1088 LOAD_UTF8_CHARCLASS(alnum,"a");
1089 while (s < strend) {
1090 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1091 if (tmp && (norun || regtry(prog, s)))
1102 while (s < strend) {
1104 if (tmp && (norun || regtry(prog, s)))
1116 PL_reg_flags |= RF_tainted;
1118 while (s < strend) {
1119 if (!isALNUM_LC_utf8((U8*)s)) {
1120 if (tmp && (norun || regtry(prog, s)))
1131 while (s < strend) {
1132 if (!isALNUM_LC(*s)) {
1133 if (tmp && (norun || regtry(prog, s)))
1146 LOAD_UTF8_CHARCLASS(space," ");
1147 while (s < strend) {
1148 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1149 if (tmp && (norun || regtry(prog, s)))
1160 while (s < strend) {
1162 if (tmp && (norun || regtry(prog, s)))
1174 PL_reg_flags |= RF_tainted;
1176 while (s < strend) {
1177 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1178 if (tmp && (norun || regtry(prog, s)))
1189 while (s < strend) {
1190 if (isSPACE_LC(*s)) {
1191 if (tmp && (norun || regtry(prog, s)))
1204 LOAD_UTF8_CHARCLASS(space," ");
1205 while (s < strend) {
1206 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1207 if (tmp && (norun || regtry(prog, s)))
1218 while (s < strend) {
1220 if (tmp && (norun || regtry(prog, s)))
1232 PL_reg_flags |= RF_tainted;
1234 while (s < strend) {
1235 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1236 if (tmp && (norun || regtry(prog, s)))
1247 while (s < strend) {
1248 if (!isSPACE_LC(*s)) {
1249 if (tmp && (norun || regtry(prog, s)))
1262 LOAD_UTF8_CHARCLASS(digit,"0");
1263 while (s < strend) {
1264 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1265 if (tmp && (norun || regtry(prog, s)))
1276 while (s < strend) {
1278 if (tmp && (norun || regtry(prog, s)))
1290 PL_reg_flags |= RF_tainted;
1292 while (s < strend) {
1293 if (isDIGIT_LC_utf8((U8*)s)) {
1294 if (tmp && (norun || regtry(prog, s)))
1305 while (s < strend) {
1306 if (isDIGIT_LC(*s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1320 LOAD_UTF8_CHARCLASS(digit,"0");
1321 while (s < strend) {
1322 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1323 if (tmp && (norun || regtry(prog, s)))
1334 while (s < strend) {
1336 if (tmp && (norun || regtry(prog, s)))
1348 PL_reg_flags |= RF_tainted;
1350 while (s < strend) {
1351 if (!isDIGIT_LC_utf8((U8*)s)) {
1352 if (tmp && (norun || regtry(prog, s)))
1363 while (s < strend) {
1364 if (!isDIGIT_LC(*s)) {
1365 if (tmp && (norun || regtry(prog, s)))
1377 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1386 - regexec_flags - match a regexp against a string
1389 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1390 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1391 /* strend: pointer to null at end of string */
1392 /* strbeg: real beginning of string */
1393 /* minend: end of match must be >=minend after stringarg. */
1394 /* data: May be used for some additional optimizations. */
1395 /* nosave: For optimizations. */
1398 register regnode *c;
1399 register char *startpos = stringarg;
1400 I32 minlen; /* must match at least this many chars */
1401 I32 dontbother = 0; /* how many characters not to try at end */
1402 /* I32 start_shift = 0; */ /* Offset of the start to find
1403 constant substr. */ /* CC */
1404 I32 end_shift = 0; /* Same for the end. */ /* CC */
1405 I32 scream_pos = -1; /* Internal iterator of scream. */
1407 SV* oreplsv = GvSV(PL_replgv);
1408 bool do_utf8 = DO_UTF8(sv);
1414 PL_regnarrate = DEBUG_r_TEST;
1417 /* Be paranoid... */
1418 if (prog == NULL || startpos == NULL) {
1419 Perl_croak(aTHX_ "NULL regexp parameter");
1423 minlen = prog->minlen;
1425 if (!(prog->reganch & ROPT_SANY_SEEN))
1426 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1429 if (strend - startpos < minlen) goto phooey;
1432 /* Check validity of program. */
1433 if (UCHARAT(prog->program) != REG_MAGIC) {
1434 Perl_croak(aTHX_ "corrupted regexp program");
1438 PL_reg_eval_set = 0;
1441 if (prog->reganch & ROPT_UTF8)
1442 PL_reg_flags |= RF_utf8;
1444 /* Mark beginning of line for ^ and lookbehind. */
1445 PL_regbol = startpos;
1449 /* Mark end of line for $ (and such) */
1452 /* see how far we have to get to not match where we matched before */
1453 PL_regtill = startpos+minend;
1455 /* We start without call_cc context. */
1458 /* If there is a "must appear" string, look for it. */
1461 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1464 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1465 PL_reg_ganch = startpos;
1466 else if (sv && SvTYPE(sv) >= SVt_PVMG
1468 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1469 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1470 if (prog->reganch & ROPT_ANCH_GPOS) {
1471 if (s > PL_reg_ganch)
1476 else /* pos() not defined */
1477 PL_reg_ganch = strbeg;
1480 if (do_utf8 == (UTF!=0) &&
1481 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1482 re_scream_pos_data d;
1484 d.scream_olds = &scream_olds;
1485 d.scream_pos = &scream_pos;
1486 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1488 goto phooey; /* not present */
1491 DEBUG_r( if (!PL_colorset) reginitcolors() );
1492 DEBUG_r(PerlIO_printf(Perl_debug_log,
1493 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1494 PL_colors[4],PL_colors[5],PL_colors[0],
1497 (strlen(prog->precomp) > 60 ? "..." : ""),
1499 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1500 startpos, PL_colors[1],
1501 (strend - startpos > 60 ? "..." : ""))
1504 /* Simplest case: anchored match need be tried only once. */
1505 /* [unless only anchor is BOL and multiline is set] */
1506 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1507 if (s == startpos && regtry(prog, startpos))
1509 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1510 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1515 dontbother = minlen - 1;
1516 end = HOP3c(strend, -dontbother, strbeg) - 1;
1517 /* for multiline we only have to try after newlines */
1518 if (prog->check_substr) {
1522 if (regtry(prog, s))
1527 if (prog->reganch & RE_USE_INTUIT) {
1528 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1539 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1540 if (regtry(prog, s))
1547 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1548 if (regtry(prog, PL_reg_ganch))
1553 /* Messy cases: unanchored match. */
1554 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1555 /* we have /x+whatever/ */
1556 /* it must be a one character string (XXXX Except UTF?) */
1557 char ch = SvPVX(prog->anchored_substr)[0];
1563 while (s < strend) {
1565 DEBUG_r( did_match = 1 );
1566 if (regtry(prog, s)) goto got_it;
1568 while (s < strend && *s == ch)
1575 while (s < strend) {
1577 DEBUG_r( did_match = 1 );
1578 if (regtry(prog, s)) goto got_it;
1580 while (s < strend && *s == ch)
1586 DEBUG_r(did_match ||
1587 PerlIO_printf(Perl_debug_log,
1588 "Did not find anchored character...\n"));
1591 else if (do_utf8 == (UTF!=0) &&
1592 (prog->anchored_substr != Nullsv
1593 || (prog->float_substr != Nullsv
1594 && prog->float_max_offset < strend - s))) {
1595 SV *must = prog->anchored_substr
1596 ? prog->anchored_substr : prog->float_substr;
1598 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1600 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1601 char *last = HOP3c(strend, /* Cannot start after this */
1602 -(I32)(CHR_SVLEN(must)
1603 - (SvTAIL(must) != 0) + back_min), strbeg);
1604 char *last1; /* Last position checked before */
1610 last1 = HOPc(s, -1);
1612 last1 = s - 1; /* bogus */
1614 /* XXXX check_substr already used to find `s', can optimize if
1615 check_substr==must. */
1617 dontbother = end_shift;
1618 strend = HOPc(strend, -dontbother);
1619 while ( (s <= last) &&
1620 ((flags & REXEC_SCREAM)
1621 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1622 end_shift, &scream_pos, 0))
1623 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1624 (unsigned char*)strend, must,
1625 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1626 DEBUG_r( did_match = 1 );
1627 if (HOPc(s, -back_max) > last1) {
1628 last1 = HOPc(s, -back_min);
1629 s = HOPc(s, -back_max);
1632 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1634 last1 = HOPc(s, -back_min);
1638 while (s <= last1) {
1639 if (regtry(prog, s))
1645 while (s <= last1) {
1646 if (regtry(prog, s))
1652 DEBUG_r(did_match ||
1653 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1654 ((must == prog->anchored_substr)
1655 ? "anchored" : "floating"),
1657 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1659 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1662 else if ((c = prog->regstclass)) {
1663 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1664 /* don't bother with what can't match */
1665 strend = HOPc(strend, -(minlen - 1));
1667 SV *prop = sv_newmortal();
1669 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1671 if (find_byclass(prog, c, s, strend, startpos, 0))
1673 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1677 if (prog->float_substr != Nullsv) { /* Trim the end. */
1680 if (flags & REXEC_SCREAM) {
1681 last = screaminstr(sv, prog->float_substr, s - strbeg,
1682 end_shift, &scream_pos, 1); /* last one */
1684 last = scream_olds; /* Only one occurrence. */
1688 char *little = SvPV(prog->float_substr, len);
1690 if (SvTAIL(prog->float_substr)) {
1691 if (memEQ(strend - len + 1, little, len - 1))
1692 last = strend - len + 1;
1693 else if (!PL_multiline)
1694 last = memEQ(strend - len, little, len)
1695 ? strend - len : Nullch;
1701 last = rninstr(s, strend, little, little + len);
1703 last = strend; /* matching `$' */
1707 DEBUG_r(PerlIO_printf(Perl_debug_log,
1708 "%sCan't trim the tail, match fails (should not happen)%s\n",
1709 PL_colors[4],PL_colors[5]));
1710 goto phooey; /* Should not happen! */
1712 dontbother = strend - last + prog->float_min_offset;
1714 if (minlen && (dontbother < minlen))
1715 dontbother = minlen - 1;
1716 strend -= dontbother; /* this one's always in bytes! */
1717 /* We don't know much -- general case. */
1720 if (regtry(prog, s))
1729 if (regtry(prog, s))
1731 } while (s++ < strend);
1739 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1741 if (PL_reg_eval_set) {
1742 /* Preserve the current value of $^R */
1743 if (oreplsv != GvSV(PL_replgv))
1744 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1745 restored, the value remains
1747 restore_pos(aTHXo_ 0);
1750 /* make sure $`, $&, $', and $digit will work later */
1751 if ( !(flags & REXEC_NOT_FIRST) ) {
1752 if (RX_MATCH_COPIED(prog)) {
1753 Safefree(prog->subbeg);
1754 RX_MATCH_COPIED_off(prog);
1756 if (flags & REXEC_COPY_STR) {
1757 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1759 s = savepvn(strbeg, i);
1762 RX_MATCH_COPIED_on(prog);
1765 prog->subbeg = strbeg;
1766 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1773 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1774 PL_colors[4],PL_colors[5]));
1775 if (PL_reg_eval_set)
1776 restore_pos(aTHXo_ 0);
1781 - regtry - try match at specific point
1783 STATIC I32 /* 0 failure, 1 success */
1784 S_regtry(pTHX_ regexp *prog, char *startpos)
1792 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1794 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1797 PL_reg_eval_set = RS_init;
1799 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1800 (IV)(PL_stack_sp - PL_stack_base));
1802 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1803 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1804 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1806 /* Apparently this is not needed, judging by wantarray. */
1807 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1808 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1811 /* Make $_ available to executed code. */
1812 if (PL_reg_sv != DEFSV) {
1813 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1818 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1819 && (mg = mg_find(PL_reg_sv, 'g')))) {
1820 /* prepare for quick setting of pos */
1821 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1822 mg = mg_find(PL_reg_sv, 'g');
1826 PL_reg_oldpos = mg->mg_len;
1827 SAVEDESTRUCTOR_X(restore_pos, 0);
1830 Newz(22,PL_reg_curpm, 1, PMOP);
1831 PL_reg_curpm->op_pmregexp = prog;
1832 PL_reg_oldcurpm = PL_curpm;
1833 PL_curpm = PL_reg_curpm;
1834 if (RX_MATCH_COPIED(prog)) {
1835 /* Here is a serious problem: we cannot rewrite subbeg,
1836 since it may be needed if this match fails. Thus
1837 $` inside (?{}) could fail... */
1838 PL_reg_oldsaved = prog->subbeg;
1839 PL_reg_oldsavedlen = prog->sublen;
1840 RX_MATCH_COPIED_off(prog);
1843 PL_reg_oldsaved = Nullch;
1844 prog->subbeg = PL_bostr;
1845 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1847 prog->startp[0] = startpos - PL_bostr;
1848 PL_reginput = startpos;
1849 PL_regstartp = prog->startp;
1850 PL_regendp = prog->endp;
1851 PL_reglastparen = &prog->lastparen;
1852 prog->lastparen = 0;
1854 DEBUG_r(PL_reg_starttry = startpos);
1855 if (PL_reg_start_tmpl <= prog->nparens) {
1856 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1857 if(PL_reg_start_tmp)
1858 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1860 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1863 /* XXXX What this code is doing here?!!! There should be no need
1864 to do this again and again, PL_reglastparen should take care of
1867 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1868 * Actually, the code in regcppop() (which Ilya may be meaning by
1869 * PL_reglastparen), is not needed at all by the test suite
1870 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1871 * enough, for building DynaLoader, or otherwise this
1872 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1873 * will happen. Meanwhile, this code *is* needed for the
1874 * above-mentioned test suite tests to succeed. The common theme
1875 * on those tests seems to be returning null fields from matches.
1880 if (prog->nparens) {
1881 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1888 if (regmatch(prog->program + 1)) {
1889 prog->endp[0] = PL_reginput - PL_bostr;
1892 REGCP_UNWIND(lastcp);
1896 #define RE_UNWIND_BRANCH 1
1897 #define RE_UNWIND_BRANCHJ 2
1901 typedef struct { /* XX: makes sense to enlarge it... */
1905 } re_unwind_generic_t;
1918 } re_unwind_branch_t;
1920 typedef union re_unwind_t {
1922 re_unwind_generic_t generic;
1923 re_unwind_branch_t branch;
1927 - regmatch - main matching routine
1929 * Conceptually the strategy is simple: check to see whether the current
1930 * node matches, call self recursively to see whether the rest matches,
1931 * and then act accordingly. In practice we make some effort to avoid
1932 * recursion, in particular by going through "ordinary" nodes (that don't
1933 * need to know whether the rest of the match failed) by a loop instead of
1936 /* [lwall] I've hoisted the register declarations to the outer block in order to
1937 * maybe save a little bit of pushing and popping on the stack. It also takes
1938 * advantage of machines that use a register save mask on subroutine entry.
1940 STATIC I32 /* 0 failure, 1 success */
1941 S_regmatch(pTHX_ regnode *prog)
1943 register regnode *scan; /* Current node. */
1944 regnode *next; /* Next node. */
1945 regnode *inner; /* Next node in internal branch. */
1946 register I32 nextchr; /* renamed nextchr - nextchar colides with
1947 function of same name */
1948 register I32 n; /* no or next */
1949 register I32 ln; /* len or last */
1950 register char *s; /* operand or save */
1951 register char *locinput = PL_reginput;
1952 register I32 c1, c2, paren; /* case fold search, parenth */
1953 int minmod = 0, sw = 0, logical = 0;
1955 I32 firstcp = PL_savestack_ix;
1956 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1962 /* Note that nextchr is a byte even in UTF */
1963 nextchr = UCHARAT(locinput);
1965 while (scan != NULL) {
1966 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1968 # define sayYES goto yes
1969 # define sayNO goto no
1970 # define sayYES_FINAL goto yes_final
1971 # define sayYES_LOUD goto yes_loud
1972 # define sayNO_FINAL goto no_final
1973 # define sayNO_SILENT goto do_no
1974 # define saySAME(x) if (x) goto yes; else goto no
1975 # define REPORT_CODE_OFF 24
1977 # define sayYES return 1
1978 # define sayNO return 0
1979 # define sayYES_FINAL return 1
1980 # define sayYES_LOUD return 1
1981 # define sayNO_FINAL return 0
1982 # define sayNO_SILENT return 0
1983 # define saySAME(x) return x
1986 SV *prop = sv_newmortal();
1987 int docolor = *PL_colors[0];
1988 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1989 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
1990 /* The part of the string before starttry has one color
1991 (pref0_len chars), between starttry and current
1992 position another one (pref_len - pref0_len chars),
1993 after the current position the third one.
1994 We assume that pref0_len <= pref_len, otherwise we
1995 decrease pref0_len. */
1996 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1997 ? (5 + taill) - l : locinput - PL_bostr;
2000 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2002 pref0_len = pref_len - (locinput - PL_reg_starttry);
2003 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2004 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2005 ? (5 + taill) - pref_len : PL_regeol - locinput);
2006 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2010 if (pref0_len > pref_len)
2011 pref0_len = pref_len;
2012 regprop(prop, scan);
2013 PerlIO_printf(Perl_debug_log,
2014 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2015 (IV)(locinput - PL_bostr),
2016 PL_colors[4], pref0_len,
2017 locinput - pref_len, PL_colors[5],
2018 PL_colors[2], pref_len - pref0_len,
2019 locinput - pref_len + pref0_len, PL_colors[3],
2020 (docolor ? "" : "> <"),
2021 PL_colors[0], l, locinput, PL_colors[1],
2022 15 - l - pref_len + 1,
2024 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2028 next = scan + NEXT_OFF(scan);
2034 if (locinput == PL_bostr || (PL_multiline &&
2035 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2037 /* regtill = regbol; */
2042 if (locinput == PL_bostr ||
2043 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2049 if (locinput == PL_bostr)
2053 if (locinput == PL_reg_ganch)
2063 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2068 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2070 if (PL_regeol - locinput > 1)
2074 if (PL_regeol != locinput)
2078 if (!nextchr && locinput >= PL_regeol)
2080 nextchr = UCHARAT(++locinput);
2083 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2086 locinput += PL_utf8skip[nextchr];
2087 if (locinput > PL_regeol)
2089 nextchr = UCHARAT(locinput);
2092 nextchr = UCHARAT(++locinput);
2097 if (do_utf8 != (UTF!=0)) {
2105 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2114 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2120 nextchr = UCHARAT(locinput);
2123 /* Inline the first character, for speed. */
2124 if (UCHARAT(s) != nextchr)
2126 if (PL_regeol - locinput < ln)
2128 if (ln > 1 && memNE(s, locinput, ln))
2131 nextchr = UCHARAT(locinput);
2134 PL_reg_flags |= RF_tainted;
2144 c1 = OP(scan) == EXACTF;
2146 if (l >= PL_regeol) {
2149 if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2150 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2152 s += UTF ? UTF8SKIP(s) : 1;
2156 nextchr = UCHARAT(locinput);
2160 /* Inline the first character, for speed. */
2161 if (UCHARAT(s) != nextchr &&
2162 UCHARAT(s) != ((OP(scan) == EXACTF)
2163 ? PL_fold : PL_fold_locale)[nextchr])
2165 if (PL_regeol - locinput < ln)
2167 if (ln > 1 && (OP(scan) == EXACTF
2168 ? ibcmp(s, locinput, ln)
2169 : ibcmp_locale(s, locinput, ln)))
2172 nextchr = UCHARAT(locinput);
2176 if (!reginclass(scan, (U8*)locinput, do_utf8))
2178 if (locinput >= PL_regeol)
2180 locinput += PL_utf8skip[nextchr];
2181 nextchr = UCHARAT(locinput);
2185 nextchr = UCHARAT(locinput);
2186 if (!reginclass(scan, (U8*)locinput, do_utf8))
2188 if (!nextchr && locinput >= PL_regeol)
2190 nextchr = UCHARAT(++locinput);
2194 PL_reg_flags |= RF_tainted;
2200 if (!(OP(scan) == ALNUM
2201 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2202 : isALNUM_LC_utf8((U8*)locinput)))
2206 locinput += PL_utf8skip[nextchr];
2207 nextchr = UCHARAT(locinput);
2210 if (!(OP(scan) == ALNUM
2211 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2213 nextchr = UCHARAT(++locinput);
2216 PL_reg_flags |= RF_tainted;
2219 if (!nextchr && locinput >= PL_regeol)
2222 LOAD_UTF8_CHARCLASS(alnum,"a");
2223 if (OP(scan) == NALNUM
2224 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2225 : isALNUM_LC_utf8((U8*)locinput))
2229 locinput += PL_utf8skip[nextchr];
2230 nextchr = UCHARAT(locinput);
2233 if (OP(scan) == NALNUM
2234 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2236 nextchr = UCHARAT(++locinput);
2240 PL_reg_flags |= RF_tainted;
2244 /* was last char in word? */
2246 if (locinput == PL_bostr)
2249 U8 *r = reghop((U8*)locinput, -1);
2251 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2253 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2254 ln = isALNUM_uni(ln);
2255 LOAD_UTF8_CHARCLASS(alnum,"a");
2256 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2259 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2260 n = isALNUM_LC_utf8((U8*)locinput);
2264 ln = (locinput != PL_bostr) ?
2265 UCHARAT(locinput - 1) : '\n';
2266 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2268 n = isALNUM(nextchr);
2271 ln = isALNUM_LC(ln);
2272 n = isALNUM_LC(nextchr);
2275 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2276 OP(scan) == BOUNDL))
2280 PL_reg_flags |= RF_tainted;
2286 if (UTF8_IS_CONTINUED(nextchr)) {
2287 LOAD_UTF8_CHARCLASS(space," ");
2288 if (!(OP(scan) == SPACE
2289 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2290 : isSPACE_LC_utf8((U8*)locinput)))
2294 locinput += PL_utf8skip[nextchr];
2295 nextchr = UCHARAT(locinput);
2298 if (!(OP(scan) == SPACE
2299 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2301 nextchr = UCHARAT(++locinput);
2304 if (!(OP(scan) == SPACE
2305 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2307 nextchr = UCHARAT(++locinput);
2311 PL_reg_flags |= RF_tainted;
2314 if (!nextchr && locinput >= PL_regeol)
2317 LOAD_UTF8_CHARCLASS(space," ");
2318 if (OP(scan) == NSPACE
2319 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2320 : isSPACE_LC_utf8((U8*)locinput))
2324 locinput += PL_utf8skip[nextchr];
2325 nextchr = UCHARAT(locinput);
2328 if (OP(scan) == NSPACE
2329 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2331 nextchr = UCHARAT(++locinput);
2334 PL_reg_flags |= RF_tainted;
2340 LOAD_UTF8_CHARCLASS(digit,"0");
2341 if (!(OP(scan) == DIGIT
2342 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2343 : isDIGIT_LC_utf8((U8*)locinput)))
2347 locinput += PL_utf8skip[nextchr];
2348 nextchr = UCHARAT(locinput);
2351 if (!(OP(scan) == DIGIT
2352 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2354 nextchr = UCHARAT(++locinput);
2357 PL_reg_flags |= RF_tainted;
2360 if (!nextchr && locinput >= PL_regeol)
2363 LOAD_UTF8_CHARCLASS(digit,"0");
2364 if (OP(scan) == NDIGIT
2365 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2366 : isDIGIT_LC_utf8((U8*)locinput))
2370 locinput += PL_utf8skip[nextchr];
2371 nextchr = UCHARAT(locinput);
2374 if (OP(scan) == NDIGIT
2375 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2377 nextchr = UCHARAT(++locinput);
2380 LOAD_UTF8_CHARCLASS(mark,"~");
2381 if (locinput >= PL_regeol ||
2382 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2384 locinput += PL_utf8skip[nextchr];
2385 while (locinput < PL_regeol &&
2386 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2387 locinput += UTF8SKIP(locinput);
2388 if (locinput > PL_regeol)
2390 nextchr = UCHARAT(locinput);
2393 PL_reg_flags |= RF_tainted;
2397 n = ARG(scan); /* which paren pair */
2398 ln = PL_regstartp[n];
2399 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2400 if (*PL_reglastparen < n || ln == -1)
2401 sayNO; /* Do not match unless seen CLOSEn. */
2402 if (ln == PL_regendp[n])
2406 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2408 char *e = PL_bostr + PL_regendp[n];
2410 * Note that we can't do the "other character" lookup trick as
2411 * in the 8-bit case (no pun intended) because in Unicode we
2412 * have to map both upper and title case to lower case.
2414 if (OP(scan) == REFF) {
2418 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2428 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2435 nextchr = UCHARAT(locinput);
2439 /* Inline the first character, for speed. */
2440 if (UCHARAT(s) != nextchr &&
2442 (UCHARAT(s) != ((OP(scan) == REFF
2443 ? PL_fold : PL_fold_locale)[nextchr]))))
2445 ln = PL_regendp[n] - ln;
2446 if (locinput + ln > PL_regeol)
2448 if (ln > 1 && (OP(scan) == REF
2449 ? memNE(s, locinput, ln)
2451 ? ibcmp(s, locinput, ln)
2452 : ibcmp_locale(s, locinput, ln))))
2455 nextchr = UCHARAT(locinput);
2466 OP_4tree *oop = PL_op;
2467 COP *ocurcop = PL_curcop;
2468 SV **ocurpad = PL_curpad;
2472 PL_op = (OP_4tree*)PL_regdata->data[n];
2473 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2474 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2475 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2477 CALLRUNOPS(aTHX); /* Scalar context. */
2483 PL_curpad = ocurpad;
2484 PL_curcop = ocurcop;
2486 if (logical == 2) { /* Postponed subexpression. */
2488 MAGIC *mg = Null(MAGIC*);
2490 CHECKPOINT cp, lastcp;
2492 if(SvROK(ret) || SvRMAGICAL(ret)) {
2493 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2496 mg = mg_find(sv, 'r');
2499 re = (regexp *)mg->mg_obj;
2500 (void)ReREFCNT_inc(re);
2504 char *t = SvPV(ret, len);
2506 char *oprecomp = PL_regprecomp;
2507 I32 osize = PL_regsize;
2508 I32 onpar = PL_regnpar;
2511 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2513 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2514 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2515 PL_regprecomp = oprecomp;
2520 PerlIO_printf(Perl_debug_log,
2521 "Entering embedded `%s%.60s%s%s'\n",
2525 (strlen(re->precomp) > 60 ? "..." : ""))
2528 state.prev = PL_reg_call_cc;
2529 state.cc = PL_regcc;
2530 state.re = PL_reg_re;
2534 cp = regcppush(0); /* Save *all* the positions. */
2537 state.ss = PL_savestack_ix;
2538 *PL_reglastparen = 0;
2539 PL_reg_call_cc = &state;
2540 PL_reginput = locinput;
2542 /* XXXX This is too dramatic a measure... */
2545 if (regmatch(re->program + 1)) {
2546 /* Even though we succeeded, we need to restore
2547 global variables, since we may be wrapped inside
2548 SUSPEND, thus the match may be not finished yet. */
2550 /* XXXX Do this only if SUSPENDed? */
2551 PL_reg_call_cc = state.prev;
2552 PL_regcc = state.cc;
2553 PL_reg_re = state.re;
2554 cache_re(PL_reg_re);
2556 /* XXXX This is too dramatic a measure... */
2559 /* These are needed even if not SUSPEND. */
2565 REGCP_UNWIND(lastcp);
2567 PL_reg_call_cc = state.prev;
2568 PL_regcc = state.cc;
2569 PL_reg_re = state.re;
2570 cache_re(PL_reg_re);
2572 /* XXXX This is too dramatic a measure... */
2581 sv_setsv(save_scalar(PL_replgv), ret);
2585 n = ARG(scan); /* which paren pair */
2586 PL_reg_start_tmp[n] = locinput;
2591 n = ARG(scan); /* which paren pair */
2592 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2593 PL_regendp[n] = locinput - PL_bostr;
2594 if (n > *PL_reglastparen)
2595 *PL_reglastparen = n;
2598 n = ARG(scan); /* which paren pair */
2599 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2602 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2604 next = NEXTOPER(NEXTOPER(scan));
2606 next = scan + ARG(scan);
2607 if (OP(next) == IFTHEN) /* Fake one. */
2608 next = NEXTOPER(NEXTOPER(next));
2612 logical = scan->flags;
2614 /*******************************************************************
2615 PL_regcc contains infoblock about the innermost (...)* loop, and
2616 a pointer to the next outer infoblock.
2618 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2620 1) After matching X, regnode for CURLYX is processed;
2622 2) This regnode creates infoblock on the stack, and calls
2623 regmatch() recursively with the starting point at WHILEM node;
2625 3) Each hit of WHILEM node tries to match A and Z (in the order
2626 depending on the current iteration, min/max of {min,max} and
2627 greediness). The information about where are nodes for "A"
2628 and "Z" is read from the infoblock, as is info on how many times "A"
2629 was already matched, and greediness.
2631 4) After A matches, the same WHILEM node is hit again.
2633 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2634 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2635 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2636 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2637 of the external loop.
2639 Currently present infoblocks form a tree with a stem formed by PL_curcc
2640 and whatever it mentions via ->next, and additional attached trees
2641 corresponding to temporarily unset infoblocks as in "5" above.
2643 In the following picture infoblocks for outer loop of
2644 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2645 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2646 infoblocks are drawn below the "reset" infoblock.
2648 In fact in the picture below we do not show failed matches for Z and T
2649 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2650 more obvious *why* one needs to *temporary* unset infoblocks.]
2652 Matched REx position InfoBlocks Comment
2656 Y A)*?Z)*?T x <- O <- I
2657 YA )*?Z)*?T x <- O <- I
2658 YA A)*?Z)*?T x <- O <- I
2659 YAA )*?Z)*?T x <- O <- I
2660 YAA Z)*?T x <- O # Temporary unset I
2663 YAAZ Y(A)*?Z)*?T x <- O
2666 YAAZY (A)*?Z)*?T x <- O
2669 YAAZY A)*?Z)*?T x <- O <- I
2672 YAAZYA )*?Z)*?T x <- O <- I
2675 YAAZYA Z)*?T x <- O # Temporary unset I
2681 YAAZYAZ T x # Temporary unset O
2688 *******************************************************************/
2691 CHECKPOINT cp = PL_savestack_ix;
2692 /* No need to save/restore up to this paren */
2693 I32 parenfloor = scan->flags;
2695 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2697 cc.oldcc = PL_regcc;
2699 /* XXXX Probably it is better to teach regpush to support
2700 parenfloor > PL_regsize... */
2701 if (parenfloor > *PL_reglastparen)
2702 parenfloor = *PL_reglastparen; /* Pessimization... */
2703 cc.parenfloor = parenfloor;
2705 cc.min = ARG1(scan);
2706 cc.max = ARG2(scan);
2707 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2711 PL_reginput = locinput;
2712 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2714 PL_regcc = cc.oldcc;
2720 * This is really hard to understand, because after we match
2721 * what we're trying to match, we must make sure the rest of
2722 * the REx is going to match for sure, and to do that we have
2723 * to go back UP the parse tree by recursing ever deeper. And
2724 * if it fails, we have to reset our parent's current state
2725 * that we can try again after backing off.
2728 CHECKPOINT cp, lastcp;
2729 CURCUR* cc = PL_regcc;
2730 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2732 n = cc->cur + 1; /* how many we know we matched */
2733 PL_reginput = locinput;
2736 PerlIO_printf(Perl_debug_log,
2737 "%*s %ld out of %ld..%ld cc=%lx\n",
2738 REPORT_CODE_OFF+PL_regindent*2, "",
2739 (long)n, (long)cc->min,
2740 (long)cc->max, (long)cc)
2743 /* If degenerate scan matches "", assume scan done. */
2745 if (locinput == cc->lastloc && n >= cc->min) {
2746 PL_regcc = cc->oldcc;
2750 PerlIO_printf(Perl_debug_log,
2751 "%*s empty match detected, try continuation...\n",
2752 REPORT_CODE_OFF+PL_regindent*2, "")
2754 if (regmatch(cc->next))
2762 /* First just match a string of min scans. */
2766 cc->lastloc = locinput;
2767 if (regmatch(cc->scan))
2770 cc->lastloc = lastloc;
2775 /* Check whether we already were at this position.
2776 Postpone detection until we know the match is not
2777 *that* much linear. */
2778 if (!PL_reg_maxiter) {
2779 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2780 PL_reg_leftiter = PL_reg_maxiter;
2782 if (PL_reg_leftiter-- == 0) {
2783 I32 size = (PL_reg_maxiter + 7)/8;
2784 if (PL_reg_poscache) {
2785 if (PL_reg_poscache_size < size) {
2786 Renew(PL_reg_poscache, size, char);
2787 PL_reg_poscache_size = size;
2789 Zero(PL_reg_poscache, size, char);
2792 PL_reg_poscache_size = size;
2793 Newz(29, PL_reg_poscache, size, char);
2796 PerlIO_printf(Perl_debug_log,
2797 "%sDetected a super-linear match, switching on caching%s...\n",
2798 PL_colors[4], PL_colors[5])
2801 if (PL_reg_leftiter < 0) {
2802 I32 o = locinput - PL_bostr, b;
2804 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2807 if (PL_reg_poscache[o] & (1<<b)) {
2809 PerlIO_printf(Perl_debug_log,
2810 "%*s already tried at this position...\n",
2811 REPORT_CODE_OFF+PL_regindent*2, "")
2815 PL_reg_poscache[o] |= (1<<b);
2819 /* Prefer next over scan for minimal matching. */
2822 PL_regcc = cc->oldcc;
2825 cp = regcppush(cc->parenfloor);
2827 if (regmatch(cc->next)) {
2829 sayYES; /* All done. */
2831 REGCP_UNWIND(lastcp);
2837 if (n >= cc->max) { /* Maximum greed exceeded? */
2838 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2839 && !(PL_reg_flags & RF_warned)) {
2840 PL_reg_flags |= RF_warned;
2841 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2842 "Complex regular subexpression recursion",
2849 PerlIO_printf(Perl_debug_log,
2850 "%*s trying longer...\n",
2851 REPORT_CODE_OFF+PL_regindent*2, "")
2853 /* Try scanning more and see if it helps. */
2854 PL_reginput = locinput;
2856 cc->lastloc = locinput;
2857 cp = regcppush(cc->parenfloor);
2859 if (regmatch(cc->scan)) {
2863 REGCP_UNWIND(lastcp);
2866 cc->lastloc = lastloc;
2870 /* Prefer scan over next for maximal matching. */
2872 if (n < cc->max) { /* More greed allowed? */
2873 cp = regcppush(cc->parenfloor);
2875 cc->lastloc = locinput;
2877 if (regmatch(cc->scan)) {
2881 REGCP_UNWIND(lastcp);
2882 regcppop(); /* Restore some previous $<digit>s? */
2883 PL_reginput = locinput;
2885 PerlIO_printf(Perl_debug_log,
2886 "%*s failed, try continuation...\n",
2887 REPORT_CODE_OFF+PL_regindent*2, "")
2890 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2891 && !(PL_reg_flags & RF_warned)) {
2892 PL_reg_flags |= RF_warned;
2893 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2894 "Complex regular subexpression recursion",
2898 /* Failed deeper matches of scan, so see if this one works. */
2899 PL_regcc = cc->oldcc;
2902 if (regmatch(cc->next))
2908 cc->lastloc = lastloc;
2913 next = scan + ARG(scan);
2916 inner = NEXTOPER(NEXTOPER(scan));
2919 inner = NEXTOPER(scan);
2924 if (OP(next) != c1) /* No choice. */
2925 next = inner; /* Avoid recursion. */
2927 I32 lastparen = *PL_reglastparen;
2929 re_unwind_branch_t *uw;
2931 /* Put unwinding data on stack */
2932 unwind1 = SSNEWt(1,re_unwind_branch_t);
2933 uw = SSPTRt(unwind1,re_unwind_branch_t);
2936 uw->type = ((c1 == BRANCH)
2938 : RE_UNWIND_BRANCHJ);
2939 uw->lastparen = lastparen;
2941 uw->locinput = locinput;
2942 uw->nextchr = nextchr;
2944 uw->regindent = ++PL_regindent;
2947 REGCP_SET(uw->lastcp);
2949 /* Now go into the first branch */
2962 /* We suppose that the next guy does not need
2963 backtracking: in particular, it is of constant length,
2964 and has no parenths to influence future backrefs. */
2965 ln = ARG1(scan); /* min to match */
2966 n = ARG2(scan); /* max to match */
2967 paren = scan->flags;
2969 if (paren > PL_regsize)
2971 if (paren > *PL_reglastparen)
2972 *PL_reglastparen = paren;
2974 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2976 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2977 PL_reginput = locinput;
2980 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2982 if (ln && l == 0 && n >= ln
2983 /* In fact, this is tricky. If paren, then the
2984 fact that we did/didnot match may influence
2985 future execution. */
2986 && !(paren && ln == 0))
2988 locinput = PL_reginput;
2989 if (PL_regkind[(U8)OP(next)] == EXACT) {
2990 c1 = (U8)*STRING(next);
2991 if (OP(next) == EXACTF)
2993 else if (OP(next) == EXACTFL)
2994 c2 = PL_fold_locale[c1];
3001 /* This may be improved if l == 0. */
3002 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3003 /* If it could work, try it. */
3005 UCHARAT(PL_reginput) == c1 ||
3006 UCHARAT(PL_reginput) == c2)
3010 PL_regstartp[paren] =
3011 HOPc(PL_reginput, -l) - PL_bostr;
3012 PL_regendp[paren] = PL_reginput - PL_bostr;
3015 PL_regendp[paren] = -1;
3019 REGCP_UNWIND(lastcp);
3021 /* Couldn't or didn't -- move forward. */
3022 PL_reginput = locinput;
3023 if (regrepeat_hard(scan, 1, &l)) {
3025 locinput = PL_reginput;
3032 n = regrepeat_hard(scan, n, &l);
3033 if (n != 0 && l == 0
3034 /* In fact, this is tricky. If paren, then the
3035 fact that we did/didnot match may influence
3036 future execution. */
3037 && !(paren && ln == 0))
3039 locinput = PL_reginput;
3041 PerlIO_printf(Perl_debug_log,
3042 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3043 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3047 if (PL_regkind[(U8)OP(next)] == EXACT) {
3048 c1 = (U8)*STRING(next);
3049 if (OP(next) == EXACTF)
3051 else if (OP(next) == EXACTFL)
3052 c2 = PL_fold_locale[c1];
3061 /* If it could work, try it. */
3063 UCHARAT(PL_reginput) == c1 ||
3064 UCHARAT(PL_reginput) == c2)
3067 PerlIO_printf(Perl_debug_log,
3068 "%*s trying tail with n=%"IVdf"...\n",
3069 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3073 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3074 PL_regendp[paren] = PL_reginput - PL_bostr;
3077 PL_regendp[paren] = -1;
3081 REGCP_UNWIND(lastcp);
3083 /* Couldn't or didn't -- back up. */
3085 locinput = HOPc(locinput, -l);
3086 PL_reginput = locinput;
3093 paren = scan->flags; /* Which paren to set */
3094 if (paren > PL_regsize)
3096 if (paren > *PL_reglastparen)
3097 *PL_reglastparen = paren;
3098 ln = ARG1(scan); /* min to match */
3099 n = ARG2(scan); /* max to match */
3100 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3104 ln = ARG1(scan); /* min to match */
3105 n = ARG2(scan); /* max to match */
3106 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3111 scan = NEXTOPER(scan);
3117 scan = NEXTOPER(scan);
3121 * Lookahead to avoid useless match attempts
3122 * when we know what character comes next.
3124 if (PL_regkind[(U8)OP(next)] == EXACT) {
3125 U8 *s = (U8*)STRING(next);
3128 if (OP(next) == EXACTF)
3130 else if (OP(next) == EXACTFL)
3131 c2 = PL_fold_locale[c1];
3134 if (OP(next) == EXACTF) {
3135 c1 = to_utf8_lower(s);
3136 c2 = to_utf8_upper(s);
3139 c2 = c1 = utf8_to_uvchr(s, NULL);
3145 PL_reginput = locinput;
3149 if (ln && regrepeat(scan, ln) < ln)
3151 locinput = PL_reginput;
3154 char *e; /* Should not check after this */
3155 char *old = locinput;
3157 if (n == REG_INFTY) {
3160 while (UTF8_IS_CONTINUATION(*(U8*)e))
3166 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3170 e = locinput + n - ln;
3176 /* Find place 'next' could work */
3179 while (locinput <= e && *locinput != c1)
3182 while (locinput <= e
3187 count = locinput - old;
3194 utf8_to_uvchr((U8*)locinput, &len) != c1;
3199 for (count = 0; locinput <= e; count++) {
3200 UV c = utf8_to_uvchr((U8*)locinput, &len);
3201 if (c == c1 || c == c2)
3209 /* PL_reginput == old now */
3210 if (locinput != old) {
3211 ln = 1; /* Did some */
3212 if (regrepeat(scan, count) < count)
3215 /* PL_reginput == locinput now */
3216 TRYPAREN(paren, ln, locinput);
3217 PL_reginput = locinput; /* Could be reset... */
3218 REGCP_UNWIND(lastcp);
3219 /* Couldn't or didn't -- move forward. */
3222 locinput += UTF8SKIP(locinput);
3228 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3232 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3234 c = UCHARAT(PL_reginput);
3235 /* If it could work, try it. */
3236 if (c == c1 || c == c2)
3238 TRYPAREN(paren, n, PL_reginput);
3239 REGCP_UNWIND(lastcp);
3242 /* If it could work, try it. */
3243 else if (c1 == -1000)
3245 TRYPAREN(paren, n, PL_reginput);
3246 REGCP_UNWIND(lastcp);
3248 /* Couldn't or didn't -- move forward. */
3249 PL_reginput = locinput;
3250 if (regrepeat(scan, 1)) {
3252 locinput = PL_reginput;
3260 n = regrepeat(scan, n);
3261 locinput = PL_reginput;
3262 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3263 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3264 ln = n; /* why back off? */
3265 /* ...because $ and \Z can match before *and* after
3266 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3267 We should back off by one in this case. */
3268 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3277 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3279 c = UCHARAT(PL_reginput);
3281 /* If it could work, try it. */
3282 if (c1 == -1000 || c == c1 || c == c2)
3284 TRYPAREN(paren, n, PL_reginput);
3285 REGCP_UNWIND(lastcp);
3287 /* Couldn't or didn't -- back up. */
3289 PL_reginput = locinput = HOPc(locinput, -1);
3297 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3299 c = UCHARAT(PL_reginput);
3301 /* If it could work, try it. */
3302 if (c1 == -1000 || c == c1 || c == c2)
3304 TRYPAREN(paren, n, PL_reginput);
3305 REGCP_UNWIND(lastcp);
3307 /* Couldn't or didn't -- back up. */
3309 PL_reginput = locinput = HOPc(locinput, -1);
3316 if (PL_reg_call_cc) {
3317 re_cc_state *cur_call_cc = PL_reg_call_cc;
3318 CURCUR *cctmp = PL_regcc;
3319 regexp *re = PL_reg_re;
3320 CHECKPOINT cp, lastcp;
3322 cp = regcppush(0); /* Save *all* the positions. */
3324 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3326 PL_reginput = locinput; /* Make position available to
3328 cache_re(PL_reg_call_cc->re);
3329 PL_regcc = PL_reg_call_cc->cc;
3330 PL_reg_call_cc = PL_reg_call_cc->prev;
3331 if (regmatch(cur_call_cc->node)) {
3332 PL_reg_call_cc = cur_call_cc;
3336 REGCP_UNWIND(lastcp);
3338 PL_reg_call_cc = cur_call_cc;
3344 PerlIO_printf(Perl_debug_log,
3345 "%*s continuation failed...\n",
3346 REPORT_CODE_OFF+PL_regindent*2, "")
3350 if (locinput < PL_regtill) {
3351 DEBUG_r(PerlIO_printf(Perl_debug_log,
3352 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3354 (long)(locinput - PL_reg_starttry),
3355 (long)(PL_regtill - PL_reg_starttry),
3357 sayNO_FINAL; /* Cannot match: too short. */
3359 PL_reginput = locinput; /* put where regtry can find it */
3360 sayYES_FINAL; /* Success! */
3362 PL_reginput = locinput; /* put where regtry can find it */
3363 sayYES_LOUD; /* Success! */
3366 PL_reginput = locinput;
3371 if (UTF) { /* XXXX This is absolutely
3372 broken, we read before
3374 s = HOPMAYBEc(locinput, -scan->flags);
3380 if (locinput < PL_bostr + scan->flags)
3382 PL_reginput = locinput - scan->flags;
3387 PL_reginput = locinput;
3392 if (UTF) { /* XXXX This is absolutely
3393 broken, we read before
3395 s = HOPMAYBEc(locinput, -scan->flags);
3396 if (!s || s < PL_bostr)
3401 if (locinput < PL_bostr + scan->flags)
3403 PL_reginput = locinput - scan->flags;
3408 PL_reginput = locinput;
3411 inner = NEXTOPER(NEXTOPER(scan));
3412 if (regmatch(inner) != n) {
3427 if (OP(scan) == SUSPEND) {
3428 locinput = PL_reginput;
3429 nextchr = UCHARAT(locinput);
3434 next = scan + ARG(scan);
3439 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3440 PTR2UV(scan), OP(scan));
3441 Perl_croak(aTHX_ "regexp memory corruption");
3448 * We get here only if there's trouble -- normally "case END" is
3449 * the terminating point.
3451 Perl_croak(aTHX_ "corrupted regexp pointers");
3457 PerlIO_printf(Perl_debug_log,
3458 "%*s %scould match...%s\n",
3459 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3463 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3464 PL_colors[4],PL_colors[5]));
3470 #if 0 /* Breaks $^R */
3478 PerlIO_printf(Perl_debug_log,
3479 "%*s %sfailed...%s\n",
3480 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3486 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3489 case RE_UNWIND_BRANCH:
3490 case RE_UNWIND_BRANCHJ:
3492 re_unwind_branch_t *uwb = &(uw->branch);
3493 I32 lastparen = uwb->lastparen;
3495 REGCP_UNWIND(uwb->lastcp);
3496 for (n = *PL_reglastparen; n > lastparen; n--)
3498 *PL_reglastparen = n;
3499 scan = next = uwb->next;
3501 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3502 ? BRANCH : BRANCHJ) ) { /* Failure */
3509 /* Have more choice yet. Reuse the same uwb. */
3511 if ((n = (uwb->type == RE_UNWIND_BRANCH
3512 ? NEXT_OFF(next) : ARG(next))))
3515 next = NULL; /* XXXX Needn't unwinding in this case... */
3517 next = NEXTOPER(scan);
3518 if (uwb->type == RE_UNWIND_BRANCHJ)
3519 next = NEXTOPER(next);
3520 locinput = uwb->locinput;
3521 nextchr = uwb->nextchr;
3523 PL_regindent = uwb->regindent;
3530 Perl_croak(aTHX_ "regexp unwind memory corruption");
3541 - regrepeat - repeatedly match something simple, report how many
3544 * [This routine now assumes that it will only match on things of length 1.
3545 * That was true before, but now we assume scan - reginput is the count,
3546 * rather than incrementing count on every character. [Er, except utf8.]]
3549 S_regrepeat(pTHX_ regnode *p, I32 max)
3551 register char *scan;
3553 register char *loceol = PL_regeol;
3554 register I32 hardcount = 0;
3555 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3558 if (max != REG_INFTY && max < loceol - scan)
3559 loceol = scan + max;
3564 while (scan < loceol && hardcount < max && *scan != '\n') {
3565 scan += UTF8SKIP(scan);
3569 while (scan < loceol && *scan != '\n')
3576 case EXACT: /* length of string is 1 */
3578 while (scan < loceol && UCHARAT(scan) == c)
3581 case EXACTF: /* length of string is 1 */
3583 while (scan < loceol &&
3584 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3587 case EXACTFL: /* length of string is 1 */
3588 PL_reg_flags |= RF_tainted;
3590 while (scan < loceol &&
3591 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3597 while (hardcount < max && scan < loceol &&
3598 reginclass(p, (U8*)scan, do_utf8)) {
3599 scan += UTF8SKIP(scan);
3603 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3610 LOAD_UTF8_CHARCLASS(alnum,"a");
3611 while (hardcount < max && scan < loceol &&
3612 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3613 scan += UTF8SKIP(scan);
3617 while (scan < loceol && isALNUM(*scan))
3622 PL_reg_flags |= RF_tainted;
3625 while (hardcount < max && scan < loceol &&
3626 isALNUM_LC_utf8((U8*)scan)) {
3627 scan += UTF8SKIP(scan);
3631 while (scan < loceol && isALNUM_LC(*scan))
3638 LOAD_UTF8_CHARCLASS(alnum,"a");
3639 while (hardcount < max && scan < loceol &&
3640 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3641 scan += UTF8SKIP(scan);
3645 while (scan < loceol && !isALNUM(*scan))
3650 PL_reg_flags |= RF_tainted;
3653 while (hardcount < max && scan < loceol &&
3654 !isALNUM_LC_utf8((U8*)scan)) {
3655 scan += UTF8SKIP(scan);
3659 while (scan < loceol && !isALNUM_LC(*scan))
3666 LOAD_UTF8_CHARCLASS(space," ");
3667 while (hardcount < max && scan < loceol &&
3669 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3670 scan += UTF8SKIP(scan);
3674 while (scan < loceol && isSPACE(*scan))
3679 PL_reg_flags |= RF_tainted;
3682 while (hardcount < max && scan < loceol &&
3683 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3684 scan += UTF8SKIP(scan);
3688 while (scan < loceol && isSPACE_LC(*scan))
3695 LOAD_UTF8_CHARCLASS(space," ");
3696 while (hardcount < max && scan < loceol &&
3698 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3699 scan += UTF8SKIP(scan);
3703 while (scan < loceol && !isSPACE(*scan))
3708 PL_reg_flags |= RF_tainted;
3711 while (hardcount < max && scan < loceol &&
3712 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3713 scan += UTF8SKIP(scan);
3717 while (scan < loceol && !isSPACE_LC(*scan))
3724 LOAD_UTF8_CHARCLASS(digit,"0");
3725 while (hardcount < max && scan < loceol &&
3726 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3727 scan += UTF8SKIP(scan);
3731 while (scan < loceol && isDIGIT(*scan))
3738 LOAD_UTF8_CHARCLASS(digit,"0");
3739 while (hardcount < max && scan < loceol &&
3740 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3741 scan += UTF8SKIP(scan);
3745 while (scan < loceol && !isDIGIT(*scan))
3749 default: /* Called on something of 0 width. */
3750 break; /* So match right here or not at all. */
3756 c = scan - PL_reginput;
3761 SV *prop = sv_newmortal();
3764 PerlIO_printf(Perl_debug_log,
3765 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3766 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3773 - regrepeat_hard - repeatedly match something, report total lenth and length
3775 * The repeater is supposed to have constant length.
3779 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3781 register char *scan;
3782 register char *start;
3783 register char *loceol = PL_regeol;
3785 I32 count = 0, res = 1;
3790 start = PL_reginput;
3791 if (DO_UTF8(PL_reg_sv)) {
3792 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3795 while (start < PL_reginput) {
3797 start += UTF8SKIP(start);
3808 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3810 *lp = l = PL_reginput - start;
3811 if (max != REG_INFTY && l*max < loceol - scan)
3812 loceol = scan + l*max;
3825 - regclass_swash - prepare the utf8 swash
3829 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3834 if (PL_regdata && PL_regdata->count) {
3837 if (PL_regdata->what[n] == 's') {
3838 SV *rv = (SV*)PL_regdata->data[n];
3839 AV *av = (AV*)SvRV((SV*)rv);
3842 si = *av_fetch(av, 0, FALSE);
3843 a = av_fetch(av, 1, FALSE);
3847 else if (si && doinit) {
3848 sw = swash_init("utf8", "", si, 1, 0);
3849 (void)av_store(av, 1, sw);
3861 - reginclass - determine if a character falls into a character class
3865 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3867 char flags = ANYOF_FLAGS(n);
3872 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
3874 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3875 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3876 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3879 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
3882 SV *sw = regclass_swash(n, TRUE, 0);
3885 if (swash_fetch(sw, p, do_utf8))
3887 else if (flags & ANYOF_FOLD) {
3888 U8 tmpbuf[UTF8_MAXLEN+1];
3890 if (flags & ANYOF_LOCALE) {
3891 PL_reg_flags |= RF_tainted;
3892 uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3895 uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
3896 if (swash_fetch(sw, tmpbuf, do_utf8))
3902 if (!match && c < 256) {
3903 if (ANYOF_BITMAP_TEST(n, c))
3905 else if (flags & ANYOF_FOLD) {
3908 if (flags & ANYOF_LOCALE) {
3909 PL_reg_flags |= RF_tainted;
3910 f = PL_fold_locale[c];
3914 if (f != c && ANYOF_BITMAP_TEST(n, f))
3918 if (!match && (flags & ANYOF_CLASS)) {
3919 PL_reg_flags |= RF_tainted;
3921 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3922 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3923 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3924 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3925 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3926 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3931 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3932 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3933 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3937 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3938 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3939 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3940 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3941 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3942 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3943 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3944 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3945 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3946 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3947 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3948 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3949 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3950 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3951 ) /* How's that for a conditional? */
3958 return (flags & ANYOF_INVERT) ? !match : match;
3962 S_reghop(pTHX_ U8 *s, I32 off)
3964 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3968 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3971 while (off-- && s < lim) {
3972 /* XXX could check well-formedness here */
3980 if (UTF8_IS_CONTINUED(*s)) {
3981 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3984 /* XXX could check well-formedness here */
3992 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3994 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3998 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4001 while (off-- && s < lim) {
4002 /* XXX could check well-formedness here */
4012 if (UTF8_IS_CONTINUED(*s)) {
4013 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4016 /* XXX could check well-formedness here */
4032 restore_pos(pTHXo_ void *arg)
4034 if (PL_reg_eval_set) {
4035 if (PL_reg_oldsaved) {
4036 PL_reg_re->subbeg = PL_reg_oldsaved;
4037 PL_reg_re->sublen = PL_reg_oldsavedlen;
4038 RX_MATCH_COPIED_on(PL_reg_re);
4040 PL_reg_magic->mg_len = PL_reg_oldpos;
4041 PL_reg_eval_set = 0;
4042 PL_curpm = PL_reg_oldcurpm;