5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
120 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
121 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
122 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
126 static void restore_pos(pTHXo_ void *arg);
130 S_regcppush(pTHX_ I32 parenfloor)
132 int retval = PL_savestack_ix;
133 #define REGCP_PAREN_ELEMS 4
134 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
137 #define REGCP_OTHER_ELEMS 5
138 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
139 for (p = PL_regsize; p > parenfloor; p--) {
140 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
141 SSPUSHINT(PL_regendp[p]);
142 SSPUSHINT(PL_regstartp[p]);
143 SSPUSHPTR(PL_reg_start_tmp[p]);
146 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
147 SSPUSHINT(PL_regsize);
148 SSPUSHINT(*PL_reglastparen);
149 SSPUSHPTR(PL_reginput);
150 #define REGCP_FRAME_ELEMS 2
151 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
152 * are needed for the regexp context stack bookkeeping. */
153 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
154 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
159 /* These are needed since we do not localize EVAL nodes: */
160 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
161 " Setting an EVAL scope, savestack=%"IVdf"\n", \
162 (IV)PL_savestack_ix)); cp = PL_savestack_ix
164 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
165 PerlIO_printf(Perl_debug_log, \
166 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
167 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
177 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
179 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
180 i = SSPOPINT; /* Parentheses elements to pop. */
181 input = (char *) SSPOPPTR;
182 *PL_reglastparen = SSPOPINT;
183 PL_regsize = SSPOPINT;
185 /* Now restore the parentheses context. */
186 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
187 i > 0; i -= REGCP_PAREN_ELEMS) {
188 paren = (U32)SSPOPINT;
189 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
190 PL_regstartp[paren] = SSPOPINT;
192 if (paren <= *PL_reglastparen)
193 PL_regendp[paren] = tmps;
195 PerlIO_printf(Perl_debug_log,
196 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
197 (UV)paren, (IV)PL_regstartp[paren],
198 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
199 (IV)PL_regendp[paren],
200 (paren > *PL_reglastparen ? "(no)" : ""));
204 if (*PL_reglastparen + 1 <= PL_regnpar) {
205 PerlIO_printf(Perl_debug_log,
206 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
207 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
211 /* It would seem that the similar code in regtry()
212 * already takes care of this, and in fact it is in
213 * a better location to since this code can #if 0-ed out
214 * but the code in regtry() is needed or otherwise tests
215 * requiring null fields (pat.t#187 and split.t#{13,14}
216 * (as of patchlevel 7877) will fail. Then again,
217 * this code seems to be necessary or otherwise
218 * building DynaLoader will fail:
219 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
221 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
222 if (paren > PL_regsize)
223 PL_regstartp[paren] = -1;
224 PL_regendp[paren] = -1;
231 S_regcp_set_to(pTHX_ I32 ss)
233 I32 tmp = PL_savestack_ix;
235 PL_savestack_ix = ss;
237 PL_savestack_ix = tmp;
241 typedef struct re_cc_state
245 struct re_cc_state *prev;
250 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
252 #define TRYPAREN(paren, n, input) { \
255 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
256 PL_regendp[paren] = input - PL_bostr; \
259 PL_regendp[paren] = -1; \
261 if (regmatch(next)) \
264 PL_regendp[paren] = -1; \
269 * pregexec and friends
273 - pregexec - match a regexp against a string
276 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
277 char *strbeg, I32 minend, SV *screamer, U32 nosave)
278 /* strend: pointer to null at end of string */
279 /* strbeg: real beginning of string */
280 /* minend: end of match must be >=minend after stringarg. */
281 /* nosave: For optimizations. */
284 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
285 nosave ? 0 : REXEC_COPY_STR);
289 S_cache_re(pTHX_ regexp *prog)
291 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
293 PL_regprogram = prog->program;
295 PL_regnpar = prog->nparens;
296 PL_regdata = prog->data;
301 * Need to implement the following flags for reg_anch:
303 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
306 * INTUIT_AUTORITATIVE_ML
307 * INTUIT_ONCE_NOML - Intuit can match in one location only.
310 * Another flag for this function: SECOND_TIME (so that float substrs
311 * with giant delta may be not rechecked).
314 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
317 Otherwise, only SvCUR(sv) is used to get strbeg. */
319 /* XXXX We assume that strpos is strbeg unless sv. */
321 /* XXXX Some places assume that there is a fixed substring.
322 An update may be needed if optimizer marks as "INTUITable"
323 RExen without fixed substrings. Similarly, it is assumed that
324 lengths of all the strings are no more than minlen, thus they
325 cannot come from lookahead.
326 (Or minlen should take into account lookahead.) */
328 /* A failure to find a constant substring means that there is no need to make
329 an expensive call to REx engine, thus we celebrate a failure. Similarly,
330 finding a substring too deep into the string means that less calls to
331 regtry() should be needed.
333 REx compiler's optimizer found 4 possible hints:
334 a) Anchored substring;
336 c) Whether we are anchored (beginning-of-line or \G);
337 d) First node (of those at offset 0) which may distingush positions;
338 We use a)b)d) and multiline-part of c), and try to find a position in the
339 string which does not contradict any of them.
342 /* Most of decisions we do here should have been done at compile time.
343 The nodes of the REx which we used for the search should have been
344 deleted from the finite automaton. */
347 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
348 char *strend, U32 flags, re_scream_pos_data *data)
350 register I32 start_shift;
351 /* Should be nonnegative! */
352 register I32 end_shift;
359 register char *other_last = Nullch; /* other substr checked before this */
360 char *check_at; /* check substr found at this pos */
362 char *i_strpos = strpos;
365 DEBUG_r( if (!PL_colorset) reginitcolors() );
366 DEBUG_r(PerlIO_printf(Perl_debug_log,
367 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
368 PL_colors[4],PL_colors[5],PL_colors[0],
371 (strlen(prog->precomp) > 60 ? "..." : ""),
373 (int)(strend - strpos > 60 ? 60 : strend - strpos),
374 strpos, PL_colors[1],
375 (strend - strpos > 60 ? "..." : ""))
378 if (prog->reganch & ROPT_UTF8)
379 PL_reg_flags |= RF_utf8;
381 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
382 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
385 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
387 check = prog->check_substr;
388 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
389 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
390 || ( (prog->reganch & ROPT_ANCH_BOL)
391 && !PL_multiline ) ); /* Check after \n? */
394 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
395 /* SvCUR is not set on references: SvRV and SvPVX overlap */
397 && (strpos != strbeg)) {
398 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
401 if (prog->check_offset_min == prog->check_offset_max) {
402 /* Substring at constant offset from beg-of-str... */
405 s = HOP3c(strpos, prog->check_offset_min, strend);
407 slen = SvCUR(check); /* >= 1 */
409 if ( strend - s > slen || strend - s < slen - 1
410 || (strend - s == slen && strend[-1] != '\n')) {
411 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
414 /* Now should match s[0..slen-2] */
416 if (slen && (*SvPVX(check) != *s
418 && memNE(SvPVX(check), s, slen)))) {
420 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
424 else if (*SvPVX(check) != *s
425 || ((slen = SvCUR(check)) > 1
426 && memNE(SvPVX(check), s, slen)))
428 goto success_at_start;
431 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
433 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
434 end_shift = prog->minlen - start_shift -
435 CHR_SVLEN(check) + (SvTAIL(check) != 0);
437 I32 end = prog->check_offset_max + CHR_SVLEN(check)
438 - (SvTAIL(check) != 0);
439 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
441 if (end_shift < eshift)
445 else { /* Can match at random position */
448 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
449 /* Should be nonnegative! */
450 end_shift = prog->minlen - start_shift -
451 CHR_SVLEN(check) + (SvTAIL(check) != 0);
454 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
456 Perl_croak(aTHX_ "panic: end_shift");
460 /* Find a possible match in the region s..strend by looking for
461 the "check" substring in the region corrected by start/end_shift. */
462 if (flags & REXEC_SCREAM) {
463 I32 p = -1; /* Internal iterator of scream. */
464 I32 *pp = data ? data->scream_pos : &p;
466 if (PL_screamfirst[BmRARE(check)] >= 0
467 || ( BmRARE(check) == '\n'
468 && (BmPREVIOUS(check) == SvCUR(check) - 1)
470 s = screaminstr(sv, check,
471 start_shift + (s - strbeg), end_shift, pp, 0);
475 *data->scream_olds = s;
478 s = fbm_instr(HOP3(s, start_shift, strend),
479 HOP3(strend, -end_shift, strbeg),
480 check, PL_multiline ? FBMrf_MULTILINE : 0);
482 /* Update the count-of-usability, remove useless subpatterns,
485 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
486 (s ? "Found" : "Did not find"),
487 ((check == prog->anchored_substr) ? "anchored" : "floating"),
489 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
491 PL_colors[1], (SvTAIL(check) ? "$" : ""),
492 (s ? " at offset " : "...\n") ) );
499 /* Finish the diagnostic message */
500 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
502 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
503 Start with the other substr.
504 XXXX no SCREAM optimization yet - and a very coarse implementation
505 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
506 *always* match. Probably should be marked during compile...
507 Probably it is right to do no SCREAM here...
510 if (prog->float_substr && prog->anchored_substr) {
511 /* Take into account the "other" substring. */
512 /* XXXX May be hopelessly wrong for UTF... */
515 if (check == prog->float_substr) {
518 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
521 t = s - prog->check_offset_max;
522 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
523 && (!(prog->reganch & ROPT_UTF8)
524 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
529 t = HOP3c(t, prog->anchored_offset, strend);
530 if (t < other_last) /* These positions already checked */
532 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
535 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
536 /* On end-of-str: see comment below. */
537 s = fbm_instr((unsigned char*)t,
538 HOP3(HOP3(last1, prog->anchored_offset, strend)
539 + SvCUR(prog->anchored_substr),
540 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
541 prog->anchored_substr,
542 PL_multiline ? FBMrf_MULTILINE : 0);
543 DEBUG_r(PerlIO_printf(Perl_debug_log,
544 "%s anchored substr `%s%.*s%s'%s",
545 (s ? "Found" : "Contradicts"),
547 (int)(SvCUR(prog->anchored_substr)
548 - (SvTAIL(prog->anchored_substr)!=0)),
549 SvPVX(prog->anchored_substr),
550 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
552 if (last1 >= last2) {
553 DEBUG_r(PerlIO_printf(Perl_debug_log,
554 ", giving up...\n"));
557 DEBUG_r(PerlIO_printf(Perl_debug_log,
558 ", trying floating at offset %ld...\n",
559 (long)(HOP3c(s1, 1, strend) - i_strpos)));
560 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
561 s = HOP3c(last, 1, strend);
565 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
566 (long)(s - i_strpos)));
567 t = HOP3c(s, -prog->anchored_offset, strbeg);
568 other_last = HOP3c(s, 1, strend);
576 else { /* Take into account the floating substring. */
580 t = HOP3c(s, -start_shift, strbeg);
582 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
583 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
584 last = HOP3c(t, prog->float_max_offset, strend);
585 s = HOP3c(t, prog->float_min_offset, strend);
588 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
589 /* fbm_instr() takes into account exact value of end-of-str
590 if the check is SvTAIL(ed). Since false positives are OK,
591 and end-of-str is not later than strend we are OK. */
592 s = fbm_instr((unsigned char*)s,
593 (unsigned char*)last + SvCUR(prog->float_substr)
594 - (SvTAIL(prog->float_substr)!=0),
595 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
596 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
597 (s ? "Found" : "Contradicts"),
599 (int)(SvCUR(prog->float_substr)
600 - (SvTAIL(prog->float_substr)!=0)),
601 SvPVX(prog->float_substr),
602 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
605 DEBUG_r(PerlIO_printf(Perl_debug_log,
606 ", giving up...\n"));
609 DEBUG_r(PerlIO_printf(Perl_debug_log,
610 ", trying anchored starting at offset %ld...\n",
611 (long)(s1 + 1 - i_strpos)));
613 s = HOP3c(t, 1, strend);
617 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
618 (long)(s - i_strpos)));
619 other_last = s; /* Fix this later. --Hugo */
628 t = s - prog->check_offset_max;
629 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
630 && (!(prog->reganch & ROPT_UTF8)
631 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
633 /* Fixed substring is found far enough so that the match
634 cannot start at strpos. */
636 if (ml_anch && t[-1] != '\n') {
637 /* Eventually fbm_*() should handle this, but often
638 anchored_offset is not 0, so this check will not be wasted. */
639 /* XXXX In the code below we prefer to look for "^" even in
640 presence of anchored substrings. And we search even
641 beyond the found float position. These pessimizations
642 are historical artefacts only. */
644 while (t < strend - prog->minlen) {
646 if (t < check_at - prog->check_offset_min) {
647 if (prog->anchored_substr) {
648 /* Since we moved from the found position,
649 we definitely contradict the found anchored
650 substr. Due to the above check we do not
651 contradict "check" substr.
652 Thus we can arrive here only if check substr
653 is float. Redo checking for "other"=="fixed".
656 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
657 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
658 goto do_other_anchored;
660 /* We don't contradict the found floating substring. */
661 /* XXXX Why not check for STCLASS? */
663 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
664 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
667 /* Position contradicts check-string */
668 /* XXXX probably better to look for check-string
669 than for "\n", so one should lower the limit for t? */
670 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
671 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
672 other_last = strpos = s = t + 1;
677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
678 PL_colors[0],PL_colors[1]));
682 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
683 PL_colors[0],PL_colors[1]));
687 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
690 /* The found string does not prohibit matching at strpos,
691 - no optimization of calling REx engine can be performed,
692 unless it was an MBOL and we are not after MBOL,
693 or a future STCLASS check will fail this. */
695 /* Even in this situation we may use MBOL flag if strpos is offset
696 wrt the start of the string. */
697 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
698 && (strpos != strbeg) && strpos[-1] != '\n'
699 /* May be due to an implicit anchor of m{.*foo} */
700 && !(prog->reganch & ROPT_IMPLICIT))
705 DEBUG_r( if (ml_anch)
706 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
707 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
710 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
711 && prog->check_substr /* Could be deleted already */
712 && --BmUSEFUL(prog->check_substr) < 0
713 && prog->check_substr == prog->float_substr)
715 /* If flags & SOMETHING - do not do it many times on the same match */
716 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
717 SvREFCNT_dec(prog->check_substr);
718 prog->check_substr = Nullsv; /* disable */
719 prog->float_substr = Nullsv; /* clear */
720 check = Nullsv; /* abort */
722 /* XXXX This is a remnant of the old implementation. It
723 looks wasteful, since now INTUIT can use many
725 prog->reganch &= ~RE_USE_INTUIT;
732 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
733 if (prog->regstclass) {
734 /* minlen == 0 is possible if regstclass is \b or \B,
735 and the fixed substr is ''$.
736 Since minlen is already taken into account, s+1 is before strend;
737 accidentally, minlen >= 1 guaranties no false positives at s + 1
738 even for \b or \B. But (minlen? 1 : 0) below assumes that
739 regstclass does not come from lookahead... */
740 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
741 This leaves EXACTF only, which is dealt with in find_byclass(). */
742 U8* str = (U8*)STRING(prog->regstclass);
743 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
744 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
746 char *endpos = (prog->anchored_substr || ml_anch)
747 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
748 : (prog->float_substr
749 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
752 char *startpos = strbeg;
755 if (prog->reganch & ROPT_UTF8) {
756 PL_regdata = prog->data;
759 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
764 if (endpos == strend) {
765 DEBUG_r( PerlIO_printf(Perl_debug_log,
766 "Could not match STCLASS...\n") );
769 DEBUG_r( PerlIO_printf(Perl_debug_log,
770 "This position contradicts STCLASS...\n") );
771 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
773 /* Contradict one of substrings */
774 if (prog->anchored_substr) {
775 if (prog->anchored_substr == check) {
776 DEBUG_r( what = "anchored" );
778 s = HOP3c(t, 1, strend);
779 if (s + start_shift + end_shift > strend) {
780 /* XXXX Should be taken into account earlier? */
781 DEBUG_r( PerlIO_printf(Perl_debug_log,
782 "Could not match STCLASS...\n") );
787 DEBUG_r( PerlIO_printf(Perl_debug_log,
788 "Looking for %s substr starting at offset %ld...\n",
789 what, (long)(s + start_shift - i_strpos)) );
792 /* Have both, check_string is floating */
793 if (t + start_shift >= check_at) /* Contradicts floating=check */
794 goto retry_floating_check;
795 /* Recheck anchored substring, but not floating... */
799 DEBUG_r( PerlIO_printf(Perl_debug_log,
800 "Looking for anchored substr starting at offset %ld...\n",
801 (long)(other_last - i_strpos)) );
802 goto do_other_anchored;
804 /* Another way we could have checked stclass at the
805 current position only: */
810 DEBUG_r( PerlIO_printf(Perl_debug_log,
811 "Looking for /%s^%s/m starting at offset %ld...\n",
812 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
815 if (!prog->float_substr) /* Could have been deleted */
817 /* Check is floating subtring. */
818 retry_floating_check:
819 t = check_at - start_shift;
820 DEBUG_r( what = "floating" );
821 goto hop_and_restart;
824 PerlIO_printf(Perl_debug_log,
825 "By STCLASS: moving %ld --> %ld\n",
826 (long)(t - i_strpos), (long)(s - i_strpos));
828 PerlIO_printf(Perl_debug_log,
829 "Does not contradict STCLASS...\n") );
832 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
833 PL_colors[4], (check ? "Guessed" : "Giving up"),
834 PL_colors[5], (long)(s - i_strpos)) );
837 fail_finish: /* Substring not found */
838 if (prog->check_substr) /* could be removed already */
839 BmUSEFUL(prog->check_substr) += 5; /* hooray */
841 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
842 PL_colors[4],PL_colors[5]));
846 /* We know what class REx starts with. Try to find this position... */
848 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
850 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
856 register I32 tmp = 1; /* Scratch variable? */
857 register bool do_utf8 = DO_UTF8(PL_reg_sv);
859 /* We know what class it must start with. */
863 if (reginclass(c, (U8*)s, do_utf8)) {
864 if (tmp && (norun || regtry(prog, s)))
871 s += do_utf8 ? UTF8SKIP(s) : 1;
878 c1 = to_utf8_lower((U8*)m);
879 c2 = to_utf8_upper((U8*)m);
890 c2 = PL_fold_locale[c1];
895 e = s; /* Due to minlen logic of intuit() */
901 if ( utf8_to_uv_simple((U8*)s, &len) == c1
908 UV c = utf8_to_uv_simple((U8*)s, &len);
909 if ( (c == c1 || c == c2) && regtry(prog, s) )
918 && (ln == 1 || !(OP(c) == EXACTF
920 : ibcmp_locale(s, m, ln)))
921 && (norun || regtry(prog, s)) )
927 if ( (*(U8*)s == c1 || *(U8*)s == c2)
928 && (ln == 1 || !(OP(c) == EXACTF
930 : ibcmp_locale(s, m, ln)))
931 && (norun || regtry(prog, s)) )
938 PL_reg_flags |= RF_tainted;
945 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
947 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
949 tmp = ((OP(c) == BOUND ?
950 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
952 if (tmp == !(OP(c) == BOUND ?
953 swash_fetch(PL_utf8_alnum, (U8*)s) :
954 isALNUM_LC_utf8((U8*)s)))
957 if ((norun || regtry(prog, s)))
964 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
965 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
968 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
970 if ((norun || regtry(prog, s)))
976 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
980 PL_reg_flags |= RF_tainted;
987 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
989 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
991 tmp = ((OP(c) == NBOUND ?
992 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
994 if (tmp == !(OP(c) == NBOUND ?
995 swash_fetch(PL_utf8_alnum, (U8*)s) :
996 isALNUM_LC_utf8((U8*)s)))
998 else if ((norun || regtry(prog, s)))
1004 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
1005 tmp = ((OP(c) == NBOUND ?
1006 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1007 while (s < strend) {
1009 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1011 else if ((norun || regtry(prog, s)))
1016 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1021 while (s < strend) {
1022 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1023 if (tmp && (norun || regtry(prog, s)))
1034 while (s < strend) {
1036 if (tmp && (norun || regtry(prog, s)))
1048 PL_reg_flags |= RF_tainted;
1050 while (s < strend) {
1051 if (isALNUM_LC_utf8((U8*)s)) {
1052 if (tmp && (norun || regtry(prog, s)))
1063 while (s < strend) {
1064 if (isALNUM_LC(*s)) {
1065 if (tmp && (norun || regtry(prog, s)))
1078 while (s < strend) {
1079 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1080 if (tmp && (norun || regtry(prog, s)))
1091 while (s < strend) {
1093 if (tmp && (norun || regtry(prog, s)))
1105 PL_reg_flags |= RF_tainted;
1107 while (s < strend) {
1108 if (!isALNUM_LC_utf8((U8*)s)) {
1109 if (tmp && (norun || regtry(prog, s)))
1120 while (s < strend) {
1121 if (!isALNUM_LC(*s)) {
1122 if (tmp && (norun || regtry(prog, s)))
1135 while (s < strend) {
1136 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1137 if (tmp && (norun || regtry(prog, s)))
1148 while (s < strend) {
1150 if (tmp && (norun || regtry(prog, s)))
1162 PL_reg_flags |= RF_tainted;
1164 while (s < strend) {
1165 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1166 if (tmp && (norun || regtry(prog, s)))
1177 while (s < strend) {
1178 if (isSPACE_LC(*s)) {
1179 if (tmp && (norun || regtry(prog, s)))
1192 while (s < strend) {
1193 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1194 if (tmp && (norun || regtry(prog, s)))
1205 while (s < strend) {
1207 if (tmp && (norun || regtry(prog, s)))
1219 PL_reg_flags |= RF_tainted;
1221 while (s < strend) {
1222 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1223 if (tmp && (norun || regtry(prog, s)))
1234 while (s < strend) {
1235 if (!isSPACE_LC(*s)) {
1236 if (tmp && (norun || regtry(prog, s)))
1249 while (s < strend) {
1250 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1251 if (tmp && (norun || regtry(prog, s)))
1262 while (s < strend) {
1264 if (tmp && (norun || regtry(prog, s)))
1276 PL_reg_flags |= RF_tainted;
1278 while (s < strend) {
1279 if (isDIGIT_LC_utf8((U8*)s)) {
1280 if (tmp && (norun || regtry(prog, s)))
1291 while (s < strend) {
1292 if (isDIGIT_LC(*s)) {
1293 if (tmp && (norun || regtry(prog, s)))
1306 while (s < strend) {
1307 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1308 if (tmp && (norun || regtry(prog, s)))
1319 while (s < strend) {
1321 if (tmp && (norun || regtry(prog, s)))
1333 PL_reg_flags |= RF_tainted;
1335 while (s < strend) {
1336 if (!isDIGIT_LC_utf8((U8*)s)) {
1337 if (tmp && (norun || regtry(prog, s)))
1348 while (s < strend) {
1349 if (!isDIGIT_LC(*s)) {
1350 if (tmp && (norun || regtry(prog, s)))
1362 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1371 - regexec_flags - match a regexp against a string
1374 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1375 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1376 /* strend: pointer to null at end of string */
1377 /* strbeg: real beginning of string */
1378 /* minend: end of match must be >=minend after stringarg. */
1379 /* data: May be used for some additional optimizations. */
1380 /* nosave: For optimizations. */
1383 register regnode *c;
1384 register char *startpos = stringarg;
1385 I32 minlen; /* must match at least this many chars */
1386 I32 dontbother = 0; /* how many characters not to try at end */
1387 /* I32 start_shift = 0; */ /* Offset of the start to find
1388 constant substr. */ /* CC */
1389 I32 end_shift = 0; /* Same for the end. */ /* CC */
1390 I32 scream_pos = -1; /* Internal iterator of scream. */
1392 SV* oreplsv = GvSV(PL_replgv);
1393 bool do_utf8 = DO_UTF8(sv);
1399 PL_regnarrate = PL_debug & 512;
1402 /* Be paranoid... */
1403 if (prog == NULL || startpos == NULL) {
1404 Perl_croak(aTHX_ "NULL regexp parameter");
1408 minlen = prog->minlen;
1410 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1413 if (strend - startpos < minlen) goto phooey;
1416 if (startpos == strbeg) /* is ^ valid at stringarg? */
1419 if (prog->reganch & ROPT_UTF8 && do_utf8) {
1420 U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
1421 PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0);
1424 PL_regprev = (U32)stringarg[-1];
1425 if (!PL_multiline && PL_regprev == '\n')
1426 PL_regprev = '\0'; /* force ^ to NOT match */
1429 /* Check validity of program. */
1430 if (UCHARAT(prog->program) != REG_MAGIC) {
1431 Perl_croak(aTHX_ "corrupted regexp program");
1435 PL_reg_eval_set = 0;
1438 if (prog->reganch & ROPT_UTF8)
1439 PL_reg_flags |= RF_utf8;
1441 /* Mark beginning of line for ^ and lookbehind. */
1442 PL_regbol = startpos;
1446 /* Mark end of line for $ (and such) */
1449 /* see how far we have to get to not match where we matched before */
1450 PL_regtill = startpos+minend;
1452 /* We start without call_cc context. */
1455 /* If there is a "must appear" string, look for it. */
1458 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1461 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1462 PL_reg_ganch = startpos;
1463 else if (sv && SvTYPE(sv) >= SVt_PVMG
1465 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1466 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1467 if (prog->reganch & ROPT_ANCH_GPOS) {
1468 if (s > PL_reg_ganch)
1473 else /* pos() not defined */
1474 PL_reg_ganch = strbeg;
1477 if (do_utf8 == (UTF!=0) &&
1478 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1479 re_scream_pos_data d;
1481 d.scream_olds = &scream_olds;
1482 d.scream_pos = &scream_pos;
1483 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1485 goto phooey; /* not present */
1488 DEBUG_r( if (!PL_colorset) reginitcolors() );
1489 DEBUG_r(PerlIO_printf(Perl_debug_log,
1490 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1491 PL_colors[4],PL_colors[5],PL_colors[0],
1494 (strlen(prog->precomp) > 60 ? "..." : ""),
1496 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1497 startpos, PL_colors[1],
1498 (strend - startpos > 60 ? "..." : ""))
1501 /* Simplest case: anchored match need be tried only once. */
1502 /* [unless only anchor is BOL and multiline is set] */
1503 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1504 if (s == startpos && regtry(prog, startpos))
1506 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1507 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1512 dontbother = minlen - 1;
1513 end = HOP3c(strend, -dontbother, strbeg) - 1;
1514 /* for multiline we only have to try after newlines */
1515 if (prog->check_substr) {
1519 if (regtry(prog, s))
1524 if (prog->reganch & RE_USE_INTUIT) {
1525 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1536 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1537 if (regtry(prog, s))
1544 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1545 if (regtry(prog, PL_reg_ganch))
1550 /* Messy cases: unanchored match. */
1551 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1552 /* we have /x+whatever/ */
1553 /* it must be a one character string (XXXX Except UTF?) */
1554 char ch = SvPVX(prog->anchored_substr)[0];
1560 while (s < strend) {
1562 DEBUG_r( did_match = 1 );
1563 if (regtry(prog, s)) goto got_it;
1565 while (s < strend && *s == ch)
1572 while (s < strend) {
1574 DEBUG_r( did_match = 1 );
1575 if (regtry(prog, s)) goto got_it;
1577 while (s < strend && *s == ch)
1583 DEBUG_r(did_match ||
1584 PerlIO_printf(Perl_debug_log,
1585 "Did not find anchored character...\n"));
1588 else if (do_utf8 == (UTF!=0) &&
1589 (prog->anchored_substr != Nullsv
1590 || (prog->float_substr != Nullsv
1591 && prog->float_max_offset < strend - s))) {
1592 SV *must = prog->anchored_substr
1593 ? prog->anchored_substr : prog->float_substr;
1595 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1597 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1598 char *last = HOP3c(strend, /* Cannot start after this */
1599 -(I32)(CHR_SVLEN(must)
1600 - (SvTAIL(must) != 0) + back_min), strbeg);
1601 char *last1; /* Last position checked before */
1607 last1 = HOPc(s, -1);
1609 last1 = s - 1; /* bogus */
1611 /* XXXX check_substr already used to find `s', can optimize if
1612 check_substr==must. */
1614 dontbother = end_shift;
1615 strend = HOPc(strend, -dontbother);
1616 while ( (s <= last) &&
1617 ((flags & REXEC_SCREAM)
1618 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1619 end_shift, &scream_pos, 0))
1620 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1621 (unsigned char*)strend, must,
1622 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1623 DEBUG_r( did_match = 1 );
1624 if (HOPc(s, -back_max) > last1) {
1625 last1 = HOPc(s, -back_min);
1626 s = HOPc(s, -back_max);
1629 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1631 last1 = HOPc(s, -back_min);
1635 while (s <= last1) {
1636 if (regtry(prog, s))
1642 while (s <= last1) {
1643 if (regtry(prog, s))
1649 DEBUG_r(did_match ||
1650 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1651 ((must == prog->anchored_substr)
1652 ? "anchored" : "floating"),
1654 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1656 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1659 else if ((c = prog->regstclass)) {
1660 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1661 /* don't bother with what can't match */
1662 strend = HOPc(strend, -(minlen - 1));
1664 SV *prop = sv_newmortal();
1666 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1668 if (find_byclass(prog, c, s, strend, startpos, 0))
1670 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1674 if (prog->float_substr != Nullsv) { /* Trim the end. */
1677 if (flags & REXEC_SCREAM) {
1678 last = screaminstr(sv, prog->float_substr, s - strbeg,
1679 end_shift, &scream_pos, 1); /* last one */
1681 last = scream_olds; /* Only one occurrence. */
1685 char *little = SvPV(prog->float_substr, len);
1687 if (SvTAIL(prog->float_substr)) {
1688 if (memEQ(strend - len + 1, little, len - 1))
1689 last = strend - len + 1;
1690 else if (!PL_multiline)
1691 last = memEQ(strend - len, little, len)
1692 ? strend - len : Nullch;
1698 last = rninstr(s, strend, little, little + len);
1700 last = strend; /* matching `$' */
1704 DEBUG_r(PerlIO_printf(Perl_debug_log,
1705 "%sCan't trim the tail, match fails (should not happen)%s\n",
1706 PL_colors[4],PL_colors[5]));
1707 goto phooey; /* Should not happen! */
1709 dontbother = strend - last + prog->float_min_offset;
1711 if (minlen && (dontbother < minlen))
1712 dontbother = minlen - 1;
1713 strend -= dontbother; /* this one's always in bytes! */
1714 /* We don't know much -- general case. */
1717 if (regtry(prog, s))
1726 if (regtry(prog, s))
1728 } while (s++ < strend);
1736 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1738 if (PL_reg_eval_set) {
1739 /* Preserve the current value of $^R */
1740 if (oreplsv != GvSV(PL_replgv))
1741 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1742 restored, the value remains
1744 restore_pos(aTHXo_ 0);
1747 /* make sure $`, $&, $', and $digit will work later */
1748 if ( !(flags & REXEC_NOT_FIRST) ) {
1749 if (RX_MATCH_COPIED(prog)) {
1750 Safefree(prog->subbeg);
1751 RX_MATCH_COPIED_off(prog);
1753 if (flags & REXEC_COPY_STR) {
1754 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1756 s = savepvn(strbeg, i);
1759 RX_MATCH_COPIED_on(prog);
1762 prog->subbeg = strbeg;
1763 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1770 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1771 PL_colors[4],PL_colors[5]));
1772 if (PL_reg_eval_set)
1773 restore_pos(aTHXo_ 0);
1778 - regtry - try match at specific point
1780 STATIC I32 /* 0 failure, 1 success */
1781 S_regtry(pTHX_ regexp *prog, char *startpos)
1789 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1791 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1794 PL_reg_eval_set = RS_init;
1796 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1797 (IV)(PL_stack_sp - PL_stack_base));
1799 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1800 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1801 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1803 /* Apparently this is not needed, judging by wantarray. */
1804 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1805 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1808 /* Make $_ available to executed code. */
1809 if (PL_reg_sv != DEFSV) {
1810 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1815 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1816 && (mg = mg_find(PL_reg_sv, 'g')))) {
1817 /* prepare for quick setting of pos */
1818 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1819 mg = mg_find(PL_reg_sv, 'g');
1823 PL_reg_oldpos = mg->mg_len;
1824 SAVEDESTRUCTOR_X(restore_pos, 0);
1827 Newz(22,PL_reg_curpm, 1, PMOP);
1828 PL_reg_curpm->op_pmregexp = prog;
1829 PL_reg_oldcurpm = PL_curpm;
1830 PL_curpm = PL_reg_curpm;
1831 if (RX_MATCH_COPIED(prog)) {
1832 /* Here is a serious problem: we cannot rewrite subbeg,
1833 since it may be needed if this match fails. Thus
1834 $` inside (?{}) could fail... */
1835 PL_reg_oldsaved = prog->subbeg;
1836 PL_reg_oldsavedlen = prog->sublen;
1837 RX_MATCH_COPIED_off(prog);
1840 PL_reg_oldsaved = Nullch;
1841 prog->subbeg = PL_bostr;
1842 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1844 prog->startp[0] = startpos - PL_bostr;
1845 PL_reginput = startpos;
1846 PL_regstartp = prog->startp;
1847 PL_regendp = prog->endp;
1848 PL_reglastparen = &prog->lastparen;
1849 prog->lastparen = 0;
1851 DEBUG_r(PL_reg_starttry = startpos);
1852 if (PL_reg_start_tmpl <= prog->nparens) {
1853 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1854 if(PL_reg_start_tmp)
1855 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1857 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1860 /* XXXX What this code is doing here?!!! There should be no need
1861 to do this again and again, PL_reglastparen should take care of
1864 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1865 * Actually, the code in regcppop() (which Ilya may be meaning by
1866 * PL_reglastparen), is not needed at all by the test suite
1867 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1868 * enough, for building DynaLoader, or otherwise this
1869 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1870 * will happen. Meanwhile, this code *is* needed for the
1871 * above-mentioned test suite tests to succeed. The common theme
1872 * on those tests seems to be returning null fields from matches.
1877 if (prog->nparens) {
1878 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1885 if (regmatch(prog->program + 1)) {
1886 prog->endp[0] = PL_reginput - PL_bostr;
1889 REGCP_UNWIND(lastcp);
1893 #define RE_UNWIND_BRANCH 1
1894 #define RE_UNWIND_BRANCHJ 2
1898 typedef struct { /* XX: makes sense to enlarge it... */
1902 } re_unwind_generic_t;
1915 } re_unwind_branch_t;
1917 typedef union re_unwind_t {
1919 re_unwind_generic_t generic;
1920 re_unwind_branch_t branch;
1924 - regmatch - main matching routine
1926 * Conceptually the strategy is simple: check to see whether the current
1927 * node matches, call self recursively to see whether the rest matches,
1928 * and then act accordingly. In practice we make some effort to avoid
1929 * recursion, in particular by going through "ordinary" nodes (that don't
1930 * need to know whether the rest of the match failed) by a loop instead of
1933 /* [lwall] I've hoisted the register declarations to the outer block in order to
1934 * maybe save a little bit of pushing and popping on the stack. It also takes
1935 * advantage of machines that use a register save mask on subroutine entry.
1937 STATIC I32 /* 0 failure, 1 success */
1938 S_regmatch(pTHX_ regnode *prog)
1940 register regnode *scan; /* Current node. */
1941 regnode *next; /* Next node. */
1942 regnode *inner; /* Next node in internal branch. */
1943 register I32 nextchr; /* renamed nextchr - nextchar colides with
1944 function of same name */
1945 register I32 n; /* no or next */
1946 register I32 ln; /* len or last */
1947 register char *s; /* operand or save */
1948 register char *locinput = PL_reginput;
1949 register I32 c1, c2, paren; /* case fold search, parenth */
1950 int minmod = 0, sw = 0, logical = 0;
1952 I32 firstcp = PL_savestack_ix;
1953 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1959 /* Note that nextchr is a byte even in UTF */
1960 nextchr = UCHARAT(locinput);
1962 while (scan != NULL) {
1963 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1965 # define sayYES goto yes
1966 # define sayNO goto no
1967 # define sayYES_FINAL goto yes_final
1968 # define sayYES_LOUD goto yes_loud
1969 # define sayNO_FINAL goto no_final
1970 # define sayNO_SILENT goto do_no
1971 # define saySAME(x) if (x) goto yes; else goto no
1972 # define REPORT_CODE_OFF 24
1974 # define sayYES return 1
1975 # define sayNO return 0
1976 # define sayYES_FINAL return 1
1977 # define sayYES_LOUD return 1
1978 # define sayNO_FINAL return 0
1979 # define sayNO_SILENT return 0
1980 # define saySAME(x) return x
1983 SV *prop = sv_newmortal();
1984 int docolor = *PL_colors[0];
1985 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1986 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
1987 /* The part of the string before starttry has one color
1988 (pref0_len chars), between starttry and current
1989 position another one (pref_len - pref0_len chars),
1990 after the current position the third one.
1991 We assume that pref0_len <= pref_len, otherwise we
1992 decrease pref0_len. */
1993 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1994 ? (5 + taill) - l : locinput - PL_bostr;
1997 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1999 pref0_len = pref_len - (locinput - PL_reg_starttry);
2000 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2001 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2002 ? (5 + taill) - pref_len : PL_regeol - locinput);
2003 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2007 if (pref0_len > pref_len)
2008 pref0_len = pref_len;
2009 regprop(prop, scan);
2010 PerlIO_printf(Perl_debug_log,
2011 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2012 (IV)(locinput - PL_bostr),
2013 PL_colors[4], pref0_len,
2014 locinput - pref_len, PL_colors[5],
2015 PL_colors[2], pref_len - pref0_len,
2016 locinput - pref_len + pref0_len, PL_colors[3],
2017 (docolor ? "" : "> <"),
2018 PL_colors[0], l, locinput, PL_colors[1],
2019 15 - l - pref_len + 1,
2021 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2025 next = scan + NEXT_OFF(scan);
2031 if (locinput == PL_bostr
2032 ? PL_regprev == '\n'
2034 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2036 /* regtill = regbol; */
2041 if (locinput == PL_bostr
2042 ? PL_regprev == '\n'
2043 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2049 if (locinput == PL_bostr)
2053 if (locinput == PL_reg_ganch)
2063 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2068 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2070 if (PL_regeol - locinput > 1)
2074 if (PL_regeol != locinput)
2079 locinput += PL_utf8skip[nextchr];
2080 if (locinput > PL_regeol)
2082 nextchr = UCHARAT(locinput);
2085 if (!nextchr && locinput >= PL_regeol)
2087 nextchr = UCHARAT(++locinput);
2090 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2093 locinput += PL_utf8skip[nextchr];
2094 if (locinput > PL_regeol)
2096 nextchr = UCHARAT(locinput);
2099 nextchr = UCHARAT(++locinput);
2104 if (do_utf8 != (UTF!=0)) {
2112 if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len))
2121 if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len))
2127 nextchr = UCHARAT(locinput);
2130 /* Inline the first character, for speed. */
2131 if (UCHARAT(s) != nextchr)
2133 if (PL_regeol - locinput < ln)
2135 if (ln > 1 && memNE(s, locinput, ln))
2138 nextchr = UCHARAT(locinput);
2141 PL_reg_flags |= RF_tainted;
2151 c1 = OP(scan) == EXACTF;
2153 if (l >= PL_regeol) {
2156 if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2157 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2159 s += UTF ? UTF8SKIP(s) : 1;
2163 nextchr = UCHARAT(locinput);
2167 /* Inline the first character, for speed. */
2168 if (UCHARAT(s) != nextchr &&
2169 UCHARAT(s) != ((OP(scan) == EXACTF)
2170 ? PL_fold : PL_fold_locale)[nextchr])
2172 if (PL_regeol - locinput < ln)
2174 if (ln > 1 && (OP(scan) == EXACTF
2175 ? ibcmp(s, locinput, ln)
2176 : ibcmp_locale(s, locinput, ln)))
2179 nextchr = UCHARAT(locinput);
2183 if (!reginclass(scan, (U8*)locinput, do_utf8))
2185 if (locinput >= PL_regeol)
2187 locinput += PL_utf8skip[nextchr];
2188 nextchr = UCHARAT(locinput);
2192 nextchr = UCHARAT(locinput);
2193 if (!reginclass(scan, (U8*)locinput, do_utf8))
2195 if (!nextchr && locinput >= PL_regeol)
2197 nextchr = UCHARAT(++locinput);
2201 PL_reg_flags |= RF_tainted;
2207 if (!(OP(scan) == ALNUM
2208 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2209 : isALNUM_LC_utf8((U8*)locinput)))
2213 locinput += PL_utf8skip[nextchr];
2214 nextchr = UCHARAT(locinput);
2217 if (!(OP(scan) == ALNUM
2218 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2220 nextchr = UCHARAT(++locinput);
2223 PL_reg_flags |= RF_tainted;
2226 if (!nextchr && locinput >= PL_regeol)
2229 if (OP(scan) == NALNUM
2230 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2231 : isALNUM_LC_utf8((U8*)locinput))
2235 locinput += PL_utf8skip[nextchr];
2236 nextchr = UCHARAT(locinput);
2239 if (OP(scan) == NALNUM
2240 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2242 nextchr = UCHARAT(++locinput);
2246 PL_reg_flags |= RF_tainted;
2250 /* was last char in word? */
2252 if (locinput == PL_regbol)
2255 U8 *r = reghop((U8*)locinput, -1);
2257 ln = utf8_to_uv(r, s - (char*)r, 0, 0);
2259 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2260 ln = isALNUM_uni(ln);
2261 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2264 ln = isALNUM_LC_uni(ln);
2265 n = isALNUM_LC_utf8((U8*)locinput);
2269 ln = (locinput != PL_regbol) ?
2270 UCHARAT(locinput - 1) : PL_regprev;
2271 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2273 n = isALNUM(nextchr);
2276 ln = isALNUM_LC(ln);
2277 n = isALNUM_LC(nextchr);
2280 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2281 OP(scan) == BOUNDL))
2285 PL_reg_flags |= RF_tainted;
2291 if (UTF8_IS_CONTINUED(nextchr)) {
2292 if (!(OP(scan) == SPACE
2293 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2294 : isSPACE_LC_utf8((U8*)locinput)))
2298 locinput += PL_utf8skip[nextchr];
2299 nextchr = UCHARAT(locinput);
2302 if (!(OP(scan) == SPACE
2303 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2305 nextchr = UCHARAT(++locinput);
2308 if (!(OP(scan) == SPACE
2309 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2311 nextchr = UCHARAT(++locinput);
2315 PL_reg_flags |= RF_tainted;
2318 if (!nextchr && locinput >= PL_regeol)
2321 if (OP(scan) == NSPACE
2322 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2323 : isSPACE_LC_utf8((U8*)locinput))
2327 locinput += PL_utf8skip[nextchr];
2328 nextchr = UCHARAT(locinput);
2331 if (OP(scan) == NSPACE
2332 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2334 nextchr = UCHARAT(++locinput);
2337 PL_reg_flags |= RF_tainted;
2343 if (!(OP(scan) == DIGIT
2344 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2345 : isDIGIT_LC_utf8((U8*)locinput)))
2349 locinput += PL_utf8skip[nextchr];
2350 nextchr = UCHARAT(locinput);
2353 if (!(OP(scan) == DIGIT
2354 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2356 nextchr = UCHARAT(++locinput);
2359 PL_reg_flags |= RF_tainted;
2362 if (!nextchr && locinput >= PL_regeol)
2365 if (OP(scan) == NDIGIT
2366 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2367 : isDIGIT_LC_utf8((U8*)locinput))
2371 locinput += PL_utf8skip[nextchr];
2372 nextchr = UCHARAT(locinput);
2375 if (OP(scan) == NDIGIT
2376 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2378 nextchr = UCHARAT(++locinput);
2381 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2383 locinput += PL_utf8skip[nextchr];
2384 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2385 locinput += UTF8SKIP(locinput);
2386 if (locinput > PL_regeol)
2388 nextchr = UCHARAT(locinput);
2391 PL_reg_flags |= RF_tainted;
2395 n = ARG(scan); /* which paren pair */
2396 ln = PL_regstartp[n];
2397 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2398 if (*PL_reglastparen < n || ln == -1)
2399 sayNO; /* Do not match unless seen CLOSEn. */
2400 if (ln == PL_regendp[n])
2404 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2406 char *e = PL_bostr + PL_regendp[n];
2408 * Note that we can't do the "other character" lookup trick as
2409 * in the 8-bit case (no pun intended) because in Unicode we
2410 * have to map both upper and title case to lower case.
2412 if (OP(scan) == REFF) {
2416 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2426 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2433 nextchr = UCHARAT(locinput);
2437 /* Inline the first character, for speed. */
2438 if (UCHARAT(s) != nextchr &&
2440 (UCHARAT(s) != ((OP(scan) == REFF
2441 ? PL_fold : PL_fold_locale)[nextchr]))))
2443 ln = PL_regendp[n] - ln;
2444 if (locinput + ln > PL_regeol)
2446 if (ln > 1 && (OP(scan) == REF
2447 ? memNE(s, locinput, ln)
2449 ? ibcmp(s, locinput, ln)
2450 : ibcmp_locale(s, locinput, ln))))
2453 nextchr = UCHARAT(locinput);
2464 OP_4tree *oop = PL_op;
2465 COP *ocurcop = PL_curcop;
2466 SV **ocurpad = PL_curpad;
2470 PL_op = (OP_4tree*)PL_regdata->data[n];
2471 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2472 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2473 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2475 CALLRUNOPS(aTHX); /* Scalar context. */
2481 PL_curpad = ocurpad;
2482 PL_curcop = ocurcop;
2484 if (logical == 2) { /* Postponed subexpression. */
2486 MAGIC *mg = Null(MAGIC*);
2488 CHECKPOINT cp, lastcp;
2490 if(SvROK(ret) || SvRMAGICAL(ret)) {
2491 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2494 mg = mg_find(sv, 'r');
2497 re = (regexp *)mg->mg_obj;
2498 (void)ReREFCNT_inc(re);
2502 char *t = SvPV(ret, len);
2504 char *oprecomp = PL_regprecomp;
2505 I32 osize = PL_regsize;
2506 I32 onpar = PL_regnpar;
2509 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2511 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2512 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2513 PL_regprecomp = oprecomp;
2518 PerlIO_printf(Perl_debug_log,
2519 "Entering embedded `%s%.60s%s%s'\n",
2523 (strlen(re->precomp) > 60 ? "..." : ""))
2526 state.prev = PL_reg_call_cc;
2527 state.cc = PL_regcc;
2528 state.re = PL_reg_re;
2532 cp = regcppush(0); /* Save *all* the positions. */
2535 state.ss = PL_savestack_ix;
2536 *PL_reglastparen = 0;
2537 PL_reg_call_cc = &state;
2538 PL_reginput = locinput;
2540 /* XXXX This is too dramatic a measure... */
2543 if (regmatch(re->program + 1)) {
2544 /* Even though we succeeded, we need to restore
2545 global variables, since we may be wrapped inside
2546 SUSPEND, thus the match may be not finished yet. */
2548 /* XXXX Do this only if SUSPENDed? */
2549 PL_reg_call_cc = state.prev;
2550 PL_regcc = state.cc;
2551 PL_reg_re = state.re;
2552 cache_re(PL_reg_re);
2554 /* XXXX This is too dramatic a measure... */
2557 /* These are needed even if not SUSPEND. */
2563 REGCP_UNWIND(lastcp);
2565 PL_reg_call_cc = state.prev;
2566 PL_regcc = state.cc;
2567 PL_reg_re = state.re;
2568 cache_re(PL_reg_re);
2570 /* XXXX This is too dramatic a measure... */
2579 sv_setsv(save_scalar(PL_replgv), ret);
2583 n = ARG(scan); /* which paren pair */
2584 PL_reg_start_tmp[n] = locinput;
2589 n = ARG(scan); /* which paren pair */
2590 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2591 PL_regendp[n] = locinput - PL_bostr;
2592 if (n > *PL_reglastparen)
2593 *PL_reglastparen = n;
2596 n = ARG(scan); /* which paren pair */
2597 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2600 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2602 next = NEXTOPER(NEXTOPER(scan));
2604 next = scan + ARG(scan);
2605 if (OP(next) == IFTHEN) /* Fake one. */
2606 next = NEXTOPER(NEXTOPER(next));
2610 logical = scan->flags;
2612 /*******************************************************************
2613 PL_regcc contains infoblock about the innermost (...)* loop, and
2614 a pointer to the next outer infoblock.
2616 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2618 1) After matching X, regnode for CURLYX is processed;
2620 2) This regnode creates infoblock on the stack, and calls
2621 regmatch() recursively with the starting point at WHILEM node;
2623 3) Each hit of WHILEM node tries to match A and Z (in the order
2624 depending on the current iteration, min/max of {min,max} and
2625 greediness). The information about where are nodes for "A"
2626 and "Z" is read from the infoblock, as is info on how many times "A"
2627 was already matched, and greediness.
2629 4) After A matches, the same WHILEM node is hit again.
2631 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2632 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2633 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2634 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2635 of the external loop.
2637 Currently present infoblocks form a tree with a stem formed by PL_curcc
2638 and whatever it mentions via ->next, and additional attached trees
2639 corresponding to temporarily unset infoblocks as in "5" above.
2641 In the following picture infoblocks for outer loop of
2642 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2643 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2644 infoblocks are drawn below the "reset" infoblock.
2646 In fact in the picture below we do not show failed matches for Z and T
2647 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2648 more obvious *why* one needs to *temporary* unset infoblocks.]
2650 Matched REx position InfoBlocks Comment
2654 Y A)*?Z)*?T x <- O <- I
2655 YA )*?Z)*?T x <- O <- I
2656 YA A)*?Z)*?T x <- O <- I
2657 YAA )*?Z)*?T x <- O <- I
2658 YAA Z)*?T x <- O # Temporary unset I
2661 YAAZ Y(A)*?Z)*?T x <- O
2664 YAAZY (A)*?Z)*?T x <- O
2667 YAAZY A)*?Z)*?T x <- O <- I
2670 YAAZYA )*?Z)*?T x <- O <- I
2673 YAAZYA Z)*?T x <- O # Temporary unset I
2679 YAAZYAZ T x # Temporary unset O
2686 *******************************************************************/
2689 CHECKPOINT cp = PL_savestack_ix;
2690 /* No need to save/restore up to this paren */
2691 I32 parenfloor = scan->flags;
2693 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2695 cc.oldcc = PL_regcc;
2697 /* XXXX Probably it is better to teach regpush to support
2698 parenfloor > PL_regsize... */
2699 if (parenfloor > *PL_reglastparen)
2700 parenfloor = *PL_reglastparen; /* Pessimization... */
2701 cc.parenfloor = parenfloor;
2703 cc.min = ARG1(scan);
2704 cc.max = ARG2(scan);
2705 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2709 PL_reginput = locinput;
2710 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2712 PL_regcc = cc.oldcc;
2718 * This is really hard to understand, because after we match
2719 * what we're trying to match, we must make sure the rest of
2720 * the REx is going to match for sure, and to do that we have
2721 * to go back UP the parse tree by recursing ever deeper. And
2722 * if it fails, we have to reset our parent's current state
2723 * that we can try again after backing off.
2726 CHECKPOINT cp, lastcp;
2727 CURCUR* cc = PL_regcc;
2728 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2730 n = cc->cur + 1; /* how many we know we matched */
2731 PL_reginput = locinput;
2734 PerlIO_printf(Perl_debug_log,
2735 "%*s %ld out of %ld..%ld cc=%lx\n",
2736 REPORT_CODE_OFF+PL_regindent*2, "",
2737 (long)n, (long)cc->min,
2738 (long)cc->max, (long)cc)
2741 /* If degenerate scan matches "", assume scan done. */
2743 if (locinput == cc->lastloc && n >= cc->min) {
2744 PL_regcc = cc->oldcc;
2748 PerlIO_printf(Perl_debug_log,
2749 "%*s empty match detected, try continuation...\n",
2750 REPORT_CODE_OFF+PL_regindent*2, "")
2752 if (regmatch(cc->next))
2760 /* First just match a string of min scans. */
2764 cc->lastloc = locinput;
2765 if (regmatch(cc->scan))
2768 cc->lastloc = lastloc;
2773 /* Check whether we already were at this position.
2774 Postpone detection until we know the match is not
2775 *that* much linear. */
2776 if (!PL_reg_maxiter) {
2777 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2778 PL_reg_leftiter = PL_reg_maxiter;
2780 if (PL_reg_leftiter-- == 0) {
2781 I32 size = (PL_reg_maxiter + 7)/8;
2782 if (PL_reg_poscache) {
2783 if (PL_reg_poscache_size < size) {
2784 Renew(PL_reg_poscache, size, char);
2785 PL_reg_poscache_size = size;
2787 Zero(PL_reg_poscache, size, char);
2790 PL_reg_poscache_size = size;
2791 Newz(29, PL_reg_poscache, size, char);
2794 PerlIO_printf(Perl_debug_log,
2795 "%sDetected a super-linear match, switching on caching%s...\n",
2796 PL_colors[4], PL_colors[5])
2799 if (PL_reg_leftiter < 0) {
2800 I32 o = locinput - PL_bostr, b;
2802 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2805 if (PL_reg_poscache[o] & (1<<b)) {
2807 PerlIO_printf(Perl_debug_log,
2808 "%*s already tried at this position...\n",
2809 REPORT_CODE_OFF+PL_regindent*2, "")
2813 PL_reg_poscache[o] |= (1<<b);
2817 /* Prefer next over scan for minimal matching. */
2820 PL_regcc = cc->oldcc;
2823 cp = regcppush(cc->parenfloor);
2825 if (regmatch(cc->next)) {
2827 sayYES; /* All done. */
2829 REGCP_UNWIND(lastcp);
2835 if (n >= cc->max) { /* Maximum greed exceeded? */
2836 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2837 && !(PL_reg_flags & RF_warned)) {
2838 PL_reg_flags |= RF_warned;
2839 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2840 "Complex regular subexpression recursion",
2847 PerlIO_printf(Perl_debug_log,
2848 "%*s trying longer...\n",
2849 REPORT_CODE_OFF+PL_regindent*2, "")
2851 /* Try scanning more and see if it helps. */
2852 PL_reginput = locinput;
2854 cc->lastloc = locinput;
2855 cp = regcppush(cc->parenfloor);
2857 if (regmatch(cc->scan)) {
2861 REGCP_UNWIND(lastcp);
2864 cc->lastloc = lastloc;
2868 /* Prefer scan over next for maximal matching. */
2870 if (n < cc->max) { /* More greed allowed? */
2871 cp = regcppush(cc->parenfloor);
2873 cc->lastloc = locinput;
2875 if (regmatch(cc->scan)) {
2879 REGCP_UNWIND(lastcp);
2880 regcppop(); /* Restore some previous $<digit>s? */
2881 PL_reginput = locinput;
2883 PerlIO_printf(Perl_debug_log,
2884 "%*s failed, try continuation...\n",
2885 REPORT_CODE_OFF+PL_regindent*2, "")
2888 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2889 && !(PL_reg_flags & RF_warned)) {
2890 PL_reg_flags |= RF_warned;
2891 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2892 "Complex regular subexpression recursion",
2896 /* Failed deeper matches of scan, so see if this one works. */
2897 PL_regcc = cc->oldcc;
2900 if (regmatch(cc->next))
2906 cc->lastloc = lastloc;
2911 next = scan + ARG(scan);
2914 inner = NEXTOPER(NEXTOPER(scan));
2917 inner = NEXTOPER(scan);
2922 if (OP(next) != c1) /* No choice. */
2923 next = inner; /* Avoid recursion. */
2925 I32 lastparen = *PL_reglastparen;
2927 re_unwind_branch_t *uw;
2929 /* Put unwinding data on stack */
2930 unwind1 = SSNEWt(1,re_unwind_branch_t);
2931 uw = SSPTRt(unwind1,re_unwind_branch_t);
2934 uw->type = ((c1 == BRANCH)
2936 : RE_UNWIND_BRANCHJ);
2937 uw->lastparen = lastparen;
2939 uw->locinput = locinput;
2940 uw->nextchr = nextchr;
2942 uw->regindent = ++PL_regindent;
2945 REGCP_SET(uw->lastcp);
2947 /* Now go into the first branch */
2960 /* We suppose that the next guy does not need
2961 backtracking: in particular, it is of constant length,
2962 and has no parenths to influence future backrefs. */
2963 ln = ARG1(scan); /* min to match */
2964 n = ARG2(scan); /* max to match */
2965 paren = scan->flags;
2967 if (paren > PL_regsize)
2969 if (paren > *PL_reglastparen)
2970 *PL_reglastparen = paren;
2972 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2974 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2975 PL_reginput = locinput;
2978 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2980 if (ln && l == 0 && n >= ln
2981 /* In fact, this is tricky. If paren, then the
2982 fact that we did/didnot match may influence
2983 future execution. */
2984 && !(paren && ln == 0))
2986 locinput = PL_reginput;
2987 if (PL_regkind[(U8)OP(next)] == EXACT) {
2988 c1 = (U8)*STRING(next);
2989 if (OP(next) == EXACTF)
2991 else if (OP(next) == EXACTFL)
2992 c2 = PL_fold_locale[c1];
2999 /* This may be improved if l == 0. */
3000 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3001 /* If it could work, try it. */
3003 UCHARAT(PL_reginput) == c1 ||
3004 UCHARAT(PL_reginput) == c2)
3008 PL_regstartp[paren] =
3009 HOPc(PL_reginput, -l) - PL_bostr;
3010 PL_regendp[paren] = PL_reginput - PL_bostr;
3013 PL_regendp[paren] = -1;
3017 REGCP_UNWIND(lastcp);
3019 /* Couldn't or didn't -- move forward. */
3020 PL_reginput = locinput;
3021 if (regrepeat_hard(scan, 1, &l)) {
3023 locinput = PL_reginput;
3030 n = regrepeat_hard(scan, n, &l);
3031 if (n != 0 && l == 0
3032 /* In fact, this is tricky. If paren, then the
3033 fact that we did/didnot match may influence
3034 future execution. */
3035 && !(paren && ln == 0))
3037 locinput = PL_reginput;
3039 PerlIO_printf(Perl_debug_log,
3040 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3041 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3045 if (PL_regkind[(U8)OP(next)] == EXACT) {
3046 c1 = (U8)*STRING(next);
3047 if (OP(next) == EXACTF)
3049 else if (OP(next) == EXACTFL)
3050 c2 = PL_fold_locale[c1];
3059 /* If it could work, try it. */
3061 UCHARAT(PL_reginput) == c1 ||
3062 UCHARAT(PL_reginput) == c2)
3065 PerlIO_printf(Perl_debug_log,
3066 "%*s trying tail with n=%"IVdf"...\n",
3067 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3071 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3072 PL_regendp[paren] = PL_reginput - PL_bostr;
3075 PL_regendp[paren] = -1;
3079 REGCP_UNWIND(lastcp);
3081 /* Couldn't or didn't -- back up. */
3083 locinput = HOPc(locinput, -l);
3084 PL_reginput = locinput;
3091 paren = scan->flags; /* Which paren to set */
3092 if (paren > PL_regsize)
3094 if (paren > *PL_reglastparen)
3095 *PL_reglastparen = paren;
3096 ln = ARG1(scan); /* min to match */
3097 n = ARG2(scan); /* max to match */
3098 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3102 ln = ARG1(scan); /* min to match */
3103 n = ARG2(scan); /* max to match */
3104 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3109 scan = NEXTOPER(scan);
3115 scan = NEXTOPER(scan);
3119 * Lookahead to avoid useless match attempts
3120 * when we know what character comes next.
3122 if (PL_regkind[(U8)OP(next)] == EXACT) {
3123 U8 *s = (U8*)STRING(next);
3126 if (OP(next) == EXACTF)
3128 else if (OP(next) == EXACTFL)
3129 c2 = PL_fold_locale[c1];
3132 if (OP(next) == EXACTF) {
3133 c1 = to_utf8_lower(s);
3134 c2 = to_utf8_upper(s);
3137 c2 = c1 = utf8_to_uv_simple(s, NULL);
3143 PL_reginput = locinput;
3147 if (ln && regrepeat(scan, ln) < ln)
3149 locinput = PL_reginput;
3152 char *e; /* Should not check after this */
3153 char *old = locinput;
3155 if (n == REG_INFTY) {
3158 while (UTF8_IS_CONTINUATION(*(U8*)e))
3164 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3168 e = locinput + n - ln;
3174 /* Find place 'next' could work */
3177 while (locinput <= e && *locinput != c1)
3180 while (locinput <= e
3185 count = locinput - old;
3192 utf8_to_uv_simple((U8*)locinput, &len) != c1;
3197 for (count = 0; locinput <= e; count++) {
3198 UV c = utf8_to_uv_simple((U8*)locinput, &len);
3199 if (c == c1 || c == c2)
3207 /* PL_reginput == old now */
3208 if (locinput != old) {
3209 ln = 1; /* Did some */
3210 if (regrepeat(scan, count) < count)
3213 /* PL_reginput == locinput now */
3214 TRYPAREN(paren, ln, locinput);
3215 PL_reginput = locinput; /* Could be reset... */
3216 REGCP_UNWIND(lastcp);
3217 /* Couldn't or didn't -- move forward. */
3220 locinput += UTF8SKIP(locinput);
3226 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3230 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3232 c = UCHARAT(PL_reginput);
3234 /* If it could work, try it. */
3235 if (c1 == -1000 || c == c1 || c == c2)
3237 TRYPAREN(paren, n, PL_reginput);
3238 REGCP_UNWIND(lastcp);
3240 /* Couldn't or didn't -- move forward. */
3241 PL_reginput = locinput;
3242 if (regrepeat(scan, 1)) {
3244 locinput = PL_reginput;
3252 n = regrepeat(scan, n);
3253 locinput = PL_reginput;
3254 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3255 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3256 ln = n; /* why back off? */
3257 /* ...because $ and \Z can match before *and* after
3258 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3259 We should back off by one in this case. */
3260 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3269 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3271 c = UCHARAT(PL_reginput);
3273 /* If it could work, try it. */
3274 if (c1 == -1000 || c == c1 || c == c2)
3276 TRYPAREN(paren, n, PL_reginput);
3277 REGCP_UNWIND(lastcp);
3279 /* Couldn't or didn't -- back up. */
3281 PL_reginput = locinput = HOPc(locinput, -1);
3289 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3291 c = UCHARAT(PL_reginput);
3293 /* If it could work, try it. */
3294 if (c1 == -1000 || c == c1 || c == c2)
3296 TRYPAREN(paren, n, PL_reginput);
3297 REGCP_UNWIND(lastcp);
3299 /* Couldn't or didn't -- back up. */
3301 PL_reginput = locinput = HOPc(locinput, -1);
3308 if (PL_reg_call_cc) {
3309 re_cc_state *cur_call_cc = PL_reg_call_cc;
3310 CURCUR *cctmp = PL_regcc;
3311 regexp *re = PL_reg_re;
3312 CHECKPOINT cp, lastcp;
3314 cp = regcppush(0); /* Save *all* the positions. */
3316 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3318 PL_reginput = locinput; /* Make position available to
3320 cache_re(PL_reg_call_cc->re);
3321 PL_regcc = PL_reg_call_cc->cc;
3322 PL_reg_call_cc = PL_reg_call_cc->prev;
3323 if (regmatch(cur_call_cc->node)) {
3324 PL_reg_call_cc = cur_call_cc;
3328 REGCP_UNWIND(lastcp);
3330 PL_reg_call_cc = cur_call_cc;
3336 PerlIO_printf(Perl_debug_log,
3337 "%*s continuation failed...\n",
3338 REPORT_CODE_OFF+PL_regindent*2, "")
3342 if (locinput < PL_regtill) {
3343 DEBUG_r(PerlIO_printf(Perl_debug_log,
3344 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3346 (long)(locinput - PL_reg_starttry),
3347 (long)(PL_regtill - PL_reg_starttry),
3349 sayNO_FINAL; /* Cannot match: too short. */
3351 PL_reginput = locinput; /* put where regtry can find it */
3352 sayYES_FINAL; /* Success! */
3354 PL_reginput = locinput; /* put where regtry can find it */
3355 sayYES_LOUD; /* Success! */
3358 PL_reginput = locinput;
3363 if (UTF) { /* XXXX This is absolutely
3364 broken, we read before
3366 s = HOPMAYBEc(locinput, -scan->flags);
3372 if (locinput < PL_bostr + scan->flags)
3374 PL_reginput = locinput - scan->flags;
3379 PL_reginput = locinput;
3384 if (UTF) { /* XXXX This is absolutely
3385 broken, we read before
3387 s = HOPMAYBEc(locinput, -scan->flags);
3388 if (!s || s < PL_bostr)
3393 if (locinput < PL_bostr + scan->flags)
3395 PL_reginput = locinput - scan->flags;
3400 PL_reginput = locinput;
3403 inner = NEXTOPER(NEXTOPER(scan));
3404 if (regmatch(inner) != n) {
3419 if (OP(scan) == SUSPEND) {
3420 locinput = PL_reginput;
3421 nextchr = UCHARAT(locinput);
3426 next = scan + ARG(scan);
3431 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3432 PTR2UV(scan), OP(scan));
3433 Perl_croak(aTHX_ "regexp memory corruption");
3440 * We get here only if there's trouble -- normally "case END" is
3441 * the terminating point.
3443 Perl_croak(aTHX_ "corrupted regexp pointers");
3449 PerlIO_printf(Perl_debug_log,
3450 "%*s %scould match...%s\n",
3451 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3455 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3456 PL_colors[4],PL_colors[5]));
3462 #if 0 /* Breaks $^R */
3470 PerlIO_printf(Perl_debug_log,
3471 "%*s %sfailed...%s\n",
3472 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3478 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3481 case RE_UNWIND_BRANCH:
3482 case RE_UNWIND_BRANCHJ:
3484 re_unwind_branch_t *uwb = &(uw->branch);
3485 I32 lastparen = uwb->lastparen;
3487 REGCP_UNWIND(uwb->lastcp);
3488 for (n = *PL_reglastparen; n > lastparen; n--)
3490 *PL_reglastparen = n;
3491 scan = next = uwb->next;
3493 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3494 ? BRANCH : BRANCHJ) ) { /* Failure */
3501 /* Have more choice yet. Reuse the same uwb. */
3503 if ((n = (uwb->type == RE_UNWIND_BRANCH
3504 ? NEXT_OFF(next) : ARG(next))))
3507 next = NULL; /* XXXX Needn't unwinding in this case... */
3509 next = NEXTOPER(scan);
3510 if (uwb->type == RE_UNWIND_BRANCHJ)
3511 next = NEXTOPER(next);
3512 locinput = uwb->locinput;
3513 nextchr = uwb->nextchr;
3515 PL_regindent = uwb->regindent;
3522 Perl_croak(aTHX_ "regexp unwind memory corruption");
3533 - regrepeat - repeatedly match something simple, report how many
3536 * [This routine now assumes that it will only match on things of length 1.
3537 * That was true before, but now we assume scan - reginput is the count,
3538 * rather than incrementing count on every character. [Er, except utf8.]]
3541 S_regrepeat(pTHX_ regnode *p, I32 max)
3543 register char *scan;
3545 register char *loceol = PL_regeol;
3546 register I32 hardcount = 0;
3547 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3550 if (max != REG_INFTY && max < loceol - scan)
3551 loceol = scan + max;
3556 while (scan < loceol && hardcount < max && *scan != '\n') {
3557 scan += UTF8SKIP(scan);
3561 while (scan < loceol && *scan != '\n')
3568 while (hardcount < max && scan < loceol) {
3569 scan += UTF8SKIP(scan);
3576 case EXACT: /* length of string is 1 */
3578 while (scan < loceol && UCHARAT(scan) == c)
3581 case EXACTF: /* length of string is 1 */
3583 while (scan < loceol &&
3584 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3587 case EXACTFL: /* length of string is 1 */
3588 PL_reg_flags |= RF_tainted;
3590 while (scan < loceol &&
3591 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3597 while (hardcount < max && scan < loceol &&
3598 reginclass(p, (U8*)scan, do_utf8)) {
3599 scan += UTF8SKIP(scan);
3603 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3610 while (hardcount < max && scan < loceol &&
3611 swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3612 scan += UTF8SKIP(scan);
3616 while (scan < loceol && isALNUM(*scan))
3621 PL_reg_flags |= RF_tainted;
3624 while (hardcount < max && scan < loceol &&
3625 isALNUM_LC_utf8((U8*)scan)) {
3626 scan += UTF8SKIP(scan);
3630 while (scan < loceol && isALNUM_LC(*scan))
3637 while (hardcount < max && scan < loceol &&
3638 !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3639 scan += UTF8SKIP(scan);
3643 while (scan < loceol && !isALNUM(*scan))
3648 PL_reg_flags |= RF_tainted;
3651 while (hardcount < max && scan < loceol &&
3652 !isALNUM_LC_utf8((U8*)scan)) {
3653 scan += UTF8SKIP(scan);
3657 while (scan < loceol && !isALNUM_LC(*scan))
3664 while (hardcount < max && scan < loceol &&
3665 (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3666 scan += UTF8SKIP(scan);
3670 while (scan < loceol && isSPACE(*scan))
3675 PL_reg_flags |= RF_tainted;
3678 while (hardcount < max && scan < loceol &&
3679 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3680 scan += UTF8SKIP(scan);
3684 while (scan < loceol && isSPACE_LC(*scan))
3691 while (hardcount < max && scan < loceol &&
3692 !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3693 scan += UTF8SKIP(scan);
3697 while (scan < loceol && !isSPACE(*scan))
3702 PL_reg_flags |= RF_tainted;
3705 while (hardcount < max && scan < loceol &&
3706 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3707 scan += UTF8SKIP(scan);
3711 while (scan < loceol && !isSPACE_LC(*scan))
3718 while (hardcount < max && scan < loceol &&
3719 swash_fetch(PL_utf8_digit,(U8*)scan)) {
3720 scan += UTF8SKIP(scan);
3724 while (scan < loceol && isDIGIT(*scan))
3731 while (hardcount < max && scan < loceol &&
3732 !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3733 scan += UTF8SKIP(scan);
3737 while (scan < loceol && !isDIGIT(*scan))
3741 default: /* Called on something of 0 width. */
3742 break; /* So match right here or not at all. */
3748 c = scan - PL_reginput;
3753 SV *prop = sv_newmortal();
3756 PerlIO_printf(Perl_debug_log,
3757 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3758 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3765 - regrepeat_hard - repeatedly match something, report total lenth and length
3767 * The repeater is supposed to have constant length.
3771 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3773 register char *scan;
3774 register char *start;
3775 register char *loceol = PL_regeol;
3777 I32 count = 0, res = 1;
3782 start = PL_reginput;
3783 if (DO_UTF8(PL_reg_sv)) {
3784 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3787 while (start < PL_reginput) {
3789 start += UTF8SKIP(start);
3800 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3802 *lp = l = PL_reginput - start;
3803 if (max != REG_INFTY && l*max < loceol - scan)
3804 loceol = scan + l*max;
3817 - regclass_swash - prepare the utf8 swash
3821 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3826 if (PL_regdata && PL_regdata->count) {
3829 if (PL_regdata->what[n] == 's') {
3830 SV *rv = (SV*)PL_regdata->data[n];
3831 AV *av = (AV*)SvRV((SV*)rv);
3834 si = *av_fetch(av, 0, FALSE);
3835 a = av_fetch(av, 1, FALSE);
3839 else if (si && doinit) {
3840 sw = swash_init("utf8", "", si, 1, 0);
3841 (void)av_store(av, 1, sw);
3853 - reginclass - determine if a character falls into a character class
3857 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3859 char flags = ANYOF_FLAGS(n);
3865 c = utf8_to_uv_simple(p, &len);
3869 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3870 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3871 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3874 if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256)
3877 SV *sw = regclass_swash(n, TRUE, 0);
3880 if (swash_fetch(sw, p))
3882 else if (flags & ANYOF_FOLD) {
3883 U8 tmpbuf[UTF8_MAXLEN+1];
3885 if (flags & ANYOF_LOCALE) {
3886 PL_reg_flags |= RF_tainted;
3887 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3890 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3891 if (swash_fetch(sw, tmpbuf))
3897 if (!match && c < 256) {
3898 if (ANYOF_BITMAP_TEST(n, c))
3900 else if (flags & ANYOF_FOLD) {
3903 if (flags & ANYOF_LOCALE) {
3904 PL_reg_flags |= RF_tainted;
3905 f = PL_fold_locale[c];
3909 if (f != c && ANYOF_BITMAP_TEST(n, f))
3913 if (!match && (flags & ANYOF_CLASS)) {
3914 PL_reg_flags |= RF_tainted;
3916 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3917 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3918 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3919 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3920 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3921 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3922 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3923 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3924 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3925 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3926 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3931 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3932 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3933 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3937 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3938 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3939 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3940 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3941 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3942 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3943 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3944 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3945 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3946 ) /* How's that for a conditional? */
3953 return (flags & ANYOF_INVERT) ? !match : match;
3957 S_reghop(pTHX_ U8 *s, I32 off)
3959 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3963 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3966 while (off-- && s < lim) {
3967 /* XXX could check well-formedness here */
3975 if (UTF8_IS_CONTINUED(*s)) {
3976 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3979 /* XXX could check well-formedness here */
3987 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3989 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3993 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
3996 while (off-- && s < lim) {
3997 /* XXX could check well-formedness here */
4007 if (UTF8_IS_CONTINUED(*s)) {
4008 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4011 /* XXX could check well-formedness here */
4027 restore_pos(pTHXo_ void *arg)
4029 if (PL_reg_eval_set) {
4030 if (PL_reg_oldsaved) {
4031 PL_reg_re->subbeg = PL_reg_oldsaved;
4032 PL_reg_re->sublen = PL_reg_oldsavedlen;
4033 RX_MATCH_COPIED_on(PL_reg_re);
4035 PL_reg_magic->mg_len = PL_reg_oldpos;
4036 PL_reg_eval_set = 0;
4037 PL_curpm = PL_reg_oldcurpm;