5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
120 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
121 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
122 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
126 static void restore_pos(pTHXo_ void *arg);
130 S_regcppush(pTHX_ I32 parenfloor)
132 int retval = PL_savestack_ix;
133 #define REGCP_PAREN_ELEMS 4
134 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
137 #define REGCP_OTHER_ELEMS 5
138 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
139 for (p = PL_regsize; p > parenfloor; p--) {
140 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
141 SSPUSHINT(PL_regendp[p]);
142 SSPUSHINT(PL_regstartp[p]);
143 SSPUSHPTR(PL_reg_start_tmp[p]);
146 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
147 SSPUSHINT(PL_regsize);
148 SSPUSHINT(*PL_reglastparen);
149 SSPUSHPTR(PL_reginput);
150 #define REGCP_FRAME_ELEMS 2
151 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
152 * are needed for the regexp context stack bookkeeping. */
153 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
154 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
159 /* These are needed since we do not localize EVAL nodes: */
160 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
161 " Setting an EVAL scope, savestack=%"IVdf"\n", \
162 (IV)PL_savestack_ix)); cp = PL_savestack_ix
164 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
165 PerlIO_printf(Perl_debug_log, \
166 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
167 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
177 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
179 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
180 i = SSPOPINT; /* Parentheses elements to pop. */
181 input = (char *) SSPOPPTR;
182 *PL_reglastparen = SSPOPINT;
183 PL_regsize = SSPOPINT;
185 /* Now restore the parentheses context. */
186 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
187 i > 0; i -= REGCP_PAREN_ELEMS) {
188 paren = (U32)SSPOPINT;
189 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
190 PL_regstartp[paren] = SSPOPINT;
192 if (paren <= *PL_reglastparen)
193 PL_regendp[paren] = tmps;
195 PerlIO_printf(Perl_debug_log,
196 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
197 (UV)paren, (IV)PL_regstartp[paren],
198 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
199 (IV)PL_regendp[paren],
200 (paren > *PL_reglastparen ? "(no)" : ""));
204 if (*PL_reglastparen + 1 <= PL_regnpar) {
205 PerlIO_printf(Perl_debug_log,
206 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
207 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
211 /* It would seem that the similar code in regtry()
212 * already takes care of this, and in fact it is in
213 * a better location to since this code can #if 0-ed out
214 * but the code in regtry() is needed or otherwise tests
215 * requiring null fields (pat.t#187 and split.t#{13,14}
216 * (as of patchlevel 7877) will fail. Then again,
217 * this code seems to be necessary or otherwise
218 * building DynaLoader will fail:
219 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
221 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
222 if (paren > PL_regsize)
223 PL_regstartp[paren] = -1;
224 PL_regendp[paren] = -1;
231 S_regcp_set_to(pTHX_ I32 ss)
233 I32 tmp = PL_savestack_ix;
235 PL_savestack_ix = ss;
237 PL_savestack_ix = tmp;
241 typedef struct re_cc_state
245 struct re_cc_state *prev;
250 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
252 #define TRYPAREN(paren, n, input) { \
255 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
256 PL_regendp[paren] = input - PL_bostr; \
259 PL_regendp[paren] = -1; \
261 if (regmatch(next)) \
264 PL_regendp[paren] = -1; \
269 * pregexec and friends
273 - pregexec - match a regexp against a string
276 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
277 char *strbeg, I32 minend, SV *screamer, U32 nosave)
278 /* strend: pointer to null at end of string */
279 /* strbeg: real beginning of string */
280 /* minend: end of match must be >=minend after stringarg. */
281 /* nosave: For optimizations. */
284 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
285 nosave ? 0 : REXEC_COPY_STR);
289 S_cache_re(pTHX_ regexp *prog)
291 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
293 PL_regprogram = prog->program;
295 PL_regnpar = prog->nparens;
296 PL_regdata = prog->data;
301 * Need to implement the following flags for reg_anch:
303 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
306 * INTUIT_AUTORITATIVE_ML
307 * INTUIT_ONCE_NOML - Intuit can match in one location only.
310 * Another flag for this function: SECOND_TIME (so that float substrs
311 * with giant delta may be not rechecked).
314 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
317 Otherwise, only SvCUR(sv) is used to get strbeg. */
319 /* XXXX We assume that strpos is strbeg unless sv. */
321 /* XXXX Some places assume that there is a fixed substring.
322 An update may be needed if optimizer marks as "INTUITable"
323 RExen without fixed substrings. Similarly, it is assumed that
324 lengths of all the strings are no more than minlen, thus they
325 cannot come from lookahead.
326 (Or minlen should take into account lookahead.) */
328 /* A failure to find a constant substring means that there is no need to make
329 an expensive call to REx engine, thus we celebrate a failure. Similarly,
330 finding a substring too deep into the string means that less calls to
331 regtry() should be needed.
333 REx compiler's optimizer found 4 possible hints:
334 a) Anchored substring;
336 c) Whether we are anchored (beginning-of-line or \G);
337 d) First node (of those at offset 0) which may distingush positions;
338 We use a)b)d) and multiline-part of c), and try to find a position in the
339 string which does not contradict any of them.
342 /* Most of decisions we do here should have been done at compile time.
343 The nodes of the REx which we used for the search should have been
344 deleted from the finite automaton. */
347 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
348 char *strend, U32 flags, re_scream_pos_data *data)
350 register I32 start_shift;
351 /* Should be nonnegative! */
352 register I32 end_shift;
359 register char *other_last = Nullch; /* other substr checked before this */
360 char *check_at; /* check substr found at this pos */
362 char *i_strpos = strpos;
365 DEBUG_r( if (!PL_colorset) reginitcolors() );
366 DEBUG_r(PerlIO_printf(Perl_debug_log,
367 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
368 PL_colors[4],PL_colors[5],PL_colors[0],
371 (strlen(prog->precomp) > 60 ? "..." : ""),
373 (int)(strend - strpos > 60 ? 60 : strend - strpos),
374 strpos, PL_colors[1],
375 (strend - strpos > 60 ? "..." : ""))
378 if (prog->reganch & ROPT_UTF8)
379 PL_reg_flags |= RF_utf8;
381 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
382 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
385 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
387 check = prog->check_substr;
388 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
389 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
390 || ( (prog->reganch & ROPT_ANCH_BOL)
391 && !PL_multiline ) ); /* Check after \n? */
394 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
395 /* SvCUR is not set on references: SvRV and SvPVX overlap */
397 && (strpos != strbeg)) {
398 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
401 if (prog->check_offset_min == prog->check_offset_max &&
402 !(prog->reganch & ROPT_SANY_SEEN)) {
403 /* Substring at constant offset from beg-of-str... */
406 s = HOP3c(strpos, prog->check_offset_min, strend);
408 slen = SvCUR(check); /* >= 1 */
410 if ( strend - s > slen || strend - s < slen - 1
411 || (strend - s == slen && strend[-1] != '\n')) {
412 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
415 /* Now should match s[0..slen-2] */
417 if (slen && (*SvPVX(check) != *s
419 && memNE(SvPVX(check), s, slen)))) {
421 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
425 else if (*SvPVX(check) != *s
426 || ((slen = SvCUR(check)) > 1
427 && memNE(SvPVX(check), s, slen)))
429 goto success_at_start;
432 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
434 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
435 end_shift = prog->minlen - start_shift -
436 CHR_SVLEN(check) + (SvTAIL(check) != 0);
438 I32 end = prog->check_offset_max + CHR_SVLEN(check)
439 - (SvTAIL(check) != 0);
440 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
442 if (end_shift < eshift)
446 else { /* Can match at random position */
449 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
450 /* Should be nonnegative! */
451 end_shift = prog->minlen - start_shift -
452 CHR_SVLEN(check) + (SvTAIL(check) != 0);
455 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
457 Perl_croak(aTHX_ "panic: end_shift");
461 /* Find a possible match in the region s..strend by looking for
462 the "check" substring in the region corrected by start/end_shift. */
463 if (flags & REXEC_SCREAM) {
464 I32 p = -1; /* Internal iterator of scream. */
465 I32 *pp = data ? data->scream_pos : &p;
467 if (PL_screamfirst[BmRARE(check)] >= 0
468 || ( BmRARE(check) == '\n'
469 && (BmPREVIOUS(check) == SvCUR(check) - 1)
471 s = screaminstr(sv, check,
472 start_shift + (s - strbeg), end_shift, pp, 0);
476 *data->scream_olds = s;
478 else if (prog->reganch & ROPT_SANY_SEEN)
479 s = fbm_instr((U8*)(s + start_shift),
480 (U8*)(strend - end_shift),
481 check, PL_multiline ? FBMrf_MULTILINE : 0);
483 s = fbm_instr(HOP3(s, start_shift, strend),
484 HOP3(strend, -end_shift, strbeg),
485 check, PL_multiline ? FBMrf_MULTILINE : 0);
487 /* Update the count-of-usability, remove useless subpatterns,
490 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
491 (s ? "Found" : "Did not find"),
492 ((check == prog->anchored_substr) ? "anchored" : "floating"),
494 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
496 PL_colors[1], (SvTAIL(check) ? "$" : ""),
497 (s ? " at offset " : "...\n") ) );
504 /* Finish the diagnostic message */
505 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
507 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
508 Start with the other substr.
509 XXXX no SCREAM optimization yet - and a very coarse implementation
510 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
511 *always* match. Probably should be marked during compile...
512 Probably it is right to do no SCREAM here...
515 if (prog->float_substr && prog->anchored_substr) {
516 /* Take into account the "other" substring. */
517 /* XXXX May be hopelessly wrong for UTF... */
520 if (check == prog->float_substr) {
523 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
526 t = s - prog->check_offset_max;
527 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
528 && (!(prog->reganch & ROPT_UTF8)
529 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
534 t = HOP3c(t, prog->anchored_offset, strend);
535 if (t < other_last) /* These positions already checked */
537 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
540 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
541 /* On end-of-str: see comment below. */
542 s = fbm_instr((unsigned char*)t,
543 HOP3(HOP3(last1, prog->anchored_offset, strend)
544 + SvCUR(prog->anchored_substr),
545 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
546 prog->anchored_substr,
547 PL_multiline ? FBMrf_MULTILINE : 0);
548 DEBUG_r(PerlIO_printf(Perl_debug_log,
549 "%s anchored substr `%s%.*s%s'%s",
550 (s ? "Found" : "Contradicts"),
552 (int)(SvCUR(prog->anchored_substr)
553 - (SvTAIL(prog->anchored_substr)!=0)),
554 SvPVX(prog->anchored_substr),
555 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
557 if (last1 >= last2) {
558 DEBUG_r(PerlIO_printf(Perl_debug_log,
559 ", giving up...\n"));
562 DEBUG_r(PerlIO_printf(Perl_debug_log,
563 ", trying floating at offset %ld...\n",
564 (long)(HOP3c(s1, 1, strend) - i_strpos)));
565 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
566 s = HOP3c(last, 1, strend);
570 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
571 (long)(s - i_strpos)));
572 t = HOP3c(s, -prog->anchored_offset, strbeg);
573 other_last = HOP3c(s, 1, strend);
581 else { /* Take into account the floating substring. */
585 t = HOP3c(s, -start_shift, strbeg);
587 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
588 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
589 last = HOP3c(t, prog->float_max_offset, strend);
590 s = HOP3c(t, prog->float_min_offset, strend);
593 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
594 /* fbm_instr() takes into account exact value of end-of-str
595 if the check is SvTAIL(ed). Since false positives are OK,
596 and end-of-str is not later than strend we are OK. */
597 s = fbm_instr((unsigned char*)s,
598 (unsigned char*)last + SvCUR(prog->float_substr)
599 - (SvTAIL(prog->float_substr)!=0),
600 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
601 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
602 (s ? "Found" : "Contradicts"),
604 (int)(SvCUR(prog->float_substr)
605 - (SvTAIL(prog->float_substr)!=0)),
606 SvPVX(prog->float_substr),
607 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
610 DEBUG_r(PerlIO_printf(Perl_debug_log,
611 ", giving up...\n"));
614 DEBUG_r(PerlIO_printf(Perl_debug_log,
615 ", trying anchored starting at offset %ld...\n",
616 (long)(s1 + 1 - i_strpos)));
618 s = HOP3c(t, 1, strend);
622 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
623 (long)(s - i_strpos)));
624 other_last = s; /* Fix this later. --Hugo */
633 t = s - prog->check_offset_max;
634 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
635 && (!(prog->reganch & ROPT_UTF8)
636 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
638 /* Fixed substring is found far enough so that the match
639 cannot start at strpos. */
641 if (ml_anch && t[-1] != '\n') {
642 /* Eventually fbm_*() should handle this, but often
643 anchored_offset is not 0, so this check will not be wasted. */
644 /* XXXX In the code below we prefer to look for "^" even in
645 presence of anchored substrings. And we search even
646 beyond the found float position. These pessimizations
647 are historical artefacts only. */
649 while (t < strend - prog->minlen) {
651 if (t < check_at - prog->check_offset_min) {
652 if (prog->anchored_substr) {
653 /* Since we moved from the found position,
654 we definitely contradict the found anchored
655 substr. Due to the above check we do not
656 contradict "check" substr.
657 Thus we can arrive here only if check substr
658 is float. Redo checking for "other"=="fixed".
661 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
662 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
663 goto do_other_anchored;
665 /* We don't contradict the found floating substring. */
666 /* XXXX Why not check for STCLASS? */
668 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
669 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
672 /* Position contradicts check-string */
673 /* XXXX probably better to look for check-string
674 than for "\n", so one should lower the limit for t? */
675 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
676 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
677 other_last = strpos = s = t + 1;
682 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
683 PL_colors[0],PL_colors[1]));
687 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
688 PL_colors[0],PL_colors[1]));
692 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
695 /* The found string does not prohibit matching at strpos,
696 - no optimization of calling REx engine can be performed,
697 unless it was an MBOL and we are not after MBOL,
698 or a future STCLASS check will fail this. */
700 /* Even in this situation we may use MBOL flag if strpos is offset
701 wrt the start of the string. */
702 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
703 && (strpos != strbeg) && strpos[-1] != '\n'
704 /* May be due to an implicit anchor of m{.*foo} */
705 && !(prog->reganch & ROPT_IMPLICIT))
710 DEBUG_r( if (ml_anch)
711 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
712 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
715 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
716 && prog->check_substr /* Could be deleted already */
717 && --BmUSEFUL(prog->check_substr) < 0
718 && prog->check_substr == prog->float_substr)
720 /* If flags & SOMETHING - do not do it many times on the same match */
721 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
722 SvREFCNT_dec(prog->check_substr);
723 prog->check_substr = Nullsv; /* disable */
724 prog->float_substr = Nullsv; /* clear */
725 check = Nullsv; /* abort */
727 /* XXXX This is a remnant of the old implementation. It
728 looks wasteful, since now INTUIT can use many
730 prog->reganch &= ~RE_USE_INTUIT;
737 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
738 if (prog->regstclass) {
739 /* minlen == 0 is possible if regstclass is \b or \B,
740 and the fixed substr is ''$.
741 Since minlen is already taken into account, s+1 is before strend;
742 accidentally, minlen >= 1 guaranties no false positives at s + 1
743 even for \b or \B. But (minlen? 1 : 0) below assumes that
744 regstclass does not come from lookahead... */
745 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
746 This leaves EXACTF only, which is dealt with in find_byclass(). */
747 U8* str = (U8*)STRING(prog->regstclass);
748 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
749 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
751 char *endpos = (prog->anchored_substr || ml_anch)
752 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
753 : (prog->float_substr
754 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
757 char *startpos = strbeg;
760 if (prog->reganch & ROPT_UTF8) {
761 PL_regdata = prog->data;
764 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
769 if (endpos == strend) {
770 DEBUG_r( PerlIO_printf(Perl_debug_log,
771 "Could not match STCLASS...\n") );
774 DEBUG_r( PerlIO_printf(Perl_debug_log,
775 "This position contradicts STCLASS...\n") );
776 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
778 /* Contradict one of substrings */
779 if (prog->anchored_substr) {
780 if (prog->anchored_substr == check) {
781 DEBUG_r( what = "anchored" );
783 s = HOP3c(t, 1, strend);
784 if (s + start_shift + end_shift > strend) {
785 /* XXXX Should be taken into account earlier? */
786 DEBUG_r( PerlIO_printf(Perl_debug_log,
787 "Could not match STCLASS...\n") );
792 DEBUG_r( PerlIO_printf(Perl_debug_log,
793 "Looking for %s substr starting at offset %ld...\n",
794 what, (long)(s + start_shift - i_strpos)) );
797 /* Have both, check_string is floating */
798 if (t + start_shift >= check_at) /* Contradicts floating=check */
799 goto retry_floating_check;
800 /* Recheck anchored substring, but not floating... */
804 DEBUG_r( PerlIO_printf(Perl_debug_log,
805 "Looking for anchored substr starting at offset %ld...\n",
806 (long)(other_last - i_strpos)) );
807 goto do_other_anchored;
809 /* Another way we could have checked stclass at the
810 current position only: */
815 DEBUG_r( PerlIO_printf(Perl_debug_log,
816 "Looking for /%s^%s/m starting at offset %ld...\n",
817 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
820 if (!prog->float_substr) /* Could have been deleted */
822 /* Check is floating subtring. */
823 retry_floating_check:
824 t = check_at - start_shift;
825 DEBUG_r( what = "floating" );
826 goto hop_and_restart;
829 PerlIO_printf(Perl_debug_log,
830 "By STCLASS: moving %ld --> %ld\n",
831 (long)(t - i_strpos), (long)(s - i_strpos));
833 PerlIO_printf(Perl_debug_log,
834 "Does not contradict STCLASS...\n") );
837 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
838 PL_colors[4], (check ? "Guessed" : "Giving up"),
839 PL_colors[5], (long)(s - i_strpos)) );
842 fail_finish: /* Substring not found */
843 if (prog->check_substr) /* could be removed already */
844 BmUSEFUL(prog->check_substr) += 5; /* hooray */
846 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
847 PL_colors[4],PL_colors[5]));
851 /* We know what class REx starts with. Try to find this position... */
853 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
855 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
861 register I32 tmp = 1; /* Scratch variable? */
862 register bool do_utf8 = DO_UTF8(PL_reg_sv);
864 /* We know what class it must start with. */
868 if (reginclass(c, (U8*)s, do_utf8)) {
869 if (tmp && (norun || regtry(prog, s)))
876 s += do_utf8 ? UTF8SKIP(s) : 1;
883 c1 = to_utf8_lower((U8*)m);
884 c2 = to_utf8_upper((U8*)m);
895 c2 = PL_fold_locale[c1];
900 e = s; /* Due to minlen logic of intuit() */
906 if ( utf8_to_uv_simple((U8*)s, &len) == c1
913 UV c = utf8_to_uv_simple((U8*)s, &len);
914 if ( (c == c1 || c == c2) && regtry(prog, s) )
923 && (ln == 1 || !(OP(c) == EXACTF
925 : ibcmp_locale(s, m, ln)))
926 && (norun || regtry(prog, s)) )
932 if ( (*(U8*)s == c1 || *(U8*)s == c2)
933 && (ln == 1 || !(OP(c) == EXACTF
935 : ibcmp_locale(s, m, ln)))
936 && (norun || regtry(prog, s)) )
943 PL_reg_flags |= RF_tainted;
950 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
952 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
954 tmp = ((OP(c) == BOUND ?
955 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
957 if (tmp == !(OP(c) == BOUND ?
958 swash_fetch(PL_utf8_alnum, (U8*)s) :
959 isALNUM_LC_utf8((U8*)s)))
962 if ((norun || regtry(prog, s)))
969 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
970 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
973 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
975 if ((norun || regtry(prog, s)))
981 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
985 PL_reg_flags |= RF_tainted;
992 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
994 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
996 tmp = ((OP(c) == NBOUND ?
997 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
999 if (tmp == !(OP(c) == NBOUND ?
1000 swash_fetch(PL_utf8_alnum, (U8*)s) :
1001 isALNUM_LC_utf8((U8*)s)))
1003 else if ((norun || regtry(prog, s)))
1009 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
1010 tmp = ((OP(c) == NBOUND ?
1011 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1012 while (s < strend) {
1014 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1016 else if ((norun || regtry(prog, s)))
1021 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1026 while (s < strend) {
1027 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1028 if (tmp && (norun || regtry(prog, s)))
1039 while (s < strend) {
1041 if (tmp && (norun || regtry(prog, s)))
1053 PL_reg_flags |= RF_tainted;
1055 while (s < strend) {
1056 if (isALNUM_LC_utf8((U8*)s)) {
1057 if (tmp && (norun || regtry(prog, s)))
1068 while (s < strend) {
1069 if (isALNUM_LC(*s)) {
1070 if (tmp && (norun || regtry(prog, s)))
1083 while (s < strend) {
1084 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1085 if (tmp && (norun || regtry(prog, s)))
1096 while (s < strend) {
1098 if (tmp && (norun || regtry(prog, s)))
1110 PL_reg_flags |= RF_tainted;
1112 while (s < strend) {
1113 if (!isALNUM_LC_utf8((U8*)s)) {
1114 if (tmp && (norun || regtry(prog, s)))
1125 while (s < strend) {
1126 if (!isALNUM_LC(*s)) {
1127 if (tmp && (norun || regtry(prog, s)))
1140 while (s < strend) {
1141 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1142 if (tmp && (norun || regtry(prog, s)))
1153 while (s < strend) {
1155 if (tmp && (norun || regtry(prog, s)))
1167 PL_reg_flags |= RF_tainted;
1169 while (s < strend) {
1170 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1171 if (tmp && (norun || regtry(prog, s)))
1182 while (s < strend) {
1183 if (isSPACE_LC(*s)) {
1184 if (tmp && (norun || regtry(prog, s)))
1197 while (s < strend) {
1198 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1199 if (tmp && (norun || regtry(prog, s)))
1210 while (s < strend) {
1212 if (tmp && (norun || regtry(prog, s)))
1224 PL_reg_flags |= RF_tainted;
1226 while (s < strend) {
1227 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1228 if (tmp && (norun || regtry(prog, s)))
1239 while (s < strend) {
1240 if (!isSPACE_LC(*s)) {
1241 if (tmp && (norun || regtry(prog, s)))
1254 while (s < strend) {
1255 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1256 if (tmp && (norun || regtry(prog, s)))
1267 while (s < strend) {
1269 if (tmp && (norun || regtry(prog, s)))
1281 PL_reg_flags |= RF_tainted;
1283 while (s < strend) {
1284 if (isDIGIT_LC_utf8((U8*)s)) {
1285 if (tmp && (norun || regtry(prog, s)))
1296 while (s < strend) {
1297 if (isDIGIT_LC(*s)) {
1298 if (tmp && (norun || regtry(prog, s)))
1311 while (s < strend) {
1312 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1313 if (tmp && (norun || regtry(prog, s)))
1324 while (s < strend) {
1326 if (tmp && (norun || regtry(prog, s)))
1338 PL_reg_flags |= RF_tainted;
1340 while (s < strend) {
1341 if (!isDIGIT_LC_utf8((U8*)s)) {
1342 if (tmp && (norun || regtry(prog, s)))
1353 while (s < strend) {
1354 if (!isDIGIT_LC(*s)) {
1355 if (tmp && (norun || regtry(prog, s)))
1367 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1376 - regexec_flags - match a regexp against a string
1379 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1380 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1381 /* strend: pointer to null at end of string */
1382 /* strbeg: real beginning of string */
1383 /* minend: end of match must be >=minend after stringarg. */
1384 /* data: May be used for some additional optimizations. */
1385 /* nosave: For optimizations. */
1388 register regnode *c;
1389 register char *startpos = stringarg;
1390 I32 minlen; /* must match at least this many chars */
1391 I32 dontbother = 0; /* how many characters not to try at end */
1392 /* I32 start_shift = 0; */ /* Offset of the start to find
1393 constant substr. */ /* CC */
1394 I32 end_shift = 0; /* Same for the end. */ /* CC */
1395 I32 scream_pos = -1; /* Internal iterator of scream. */
1397 SV* oreplsv = GvSV(PL_replgv);
1398 bool do_utf8 = DO_UTF8(sv);
1404 PL_regnarrate = PL_debug & 512;
1407 /* Be paranoid... */
1408 if (prog == NULL || startpos == NULL) {
1409 Perl_croak(aTHX_ "NULL regexp parameter");
1413 minlen = prog->minlen;
1415 if (!(prog->reganch & ROPT_SANY_SEEN))
1416 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1419 if (strend - startpos < minlen) goto phooey;
1422 if (startpos == strbeg) /* is ^ valid at stringarg? */
1425 if (prog->reganch & ROPT_UTF8 && do_utf8) {
1426 U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
1427 PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0);
1430 PL_regprev = (U32)stringarg[-1];
1431 if (!PL_multiline && PL_regprev == '\n')
1432 PL_regprev = '\0'; /* force ^ to NOT match */
1435 /* Check validity of program. */
1436 if (UCHARAT(prog->program) != REG_MAGIC) {
1437 Perl_croak(aTHX_ "corrupted regexp program");
1441 PL_reg_eval_set = 0;
1444 if (prog->reganch & ROPT_UTF8)
1445 PL_reg_flags |= RF_utf8;
1447 /* Mark beginning of line for ^ and lookbehind. */
1448 PL_regbol = startpos;
1452 /* Mark end of line for $ (and such) */
1455 /* see how far we have to get to not match where we matched before */
1456 PL_regtill = startpos+minend;
1458 /* We start without call_cc context. */
1461 /* If there is a "must appear" string, look for it. */
1464 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1467 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1468 PL_reg_ganch = startpos;
1469 else if (sv && SvTYPE(sv) >= SVt_PVMG
1471 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1472 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1473 if (prog->reganch & ROPT_ANCH_GPOS) {
1474 if (s > PL_reg_ganch)
1479 else /* pos() not defined */
1480 PL_reg_ganch = strbeg;
1483 if (do_utf8 == (UTF!=0) &&
1484 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1485 re_scream_pos_data d;
1487 d.scream_olds = &scream_olds;
1488 d.scream_pos = &scream_pos;
1489 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1491 goto phooey; /* not present */
1494 DEBUG_r( if (!PL_colorset) reginitcolors() );
1495 DEBUG_r(PerlIO_printf(Perl_debug_log,
1496 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1497 PL_colors[4],PL_colors[5],PL_colors[0],
1500 (strlen(prog->precomp) > 60 ? "..." : ""),
1502 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1503 startpos, PL_colors[1],
1504 (strend - startpos > 60 ? "..." : ""))
1507 /* Simplest case: anchored match need be tried only once. */
1508 /* [unless only anchor is BOL and multiline is set] */
1509 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1510 if (s == startpos && regtry(prog, startpos))
1512 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1513 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1518 dontbother = minlen - 1;
1519 end = HOP3c(strend, -dontbother, strbeg) - 1;
1520 /* for multiline we only have to try after newlines */
1521 if (prog->check_substr) {
1525 if (regtry(prog, s))
1530 if (prog->reganch & RE_USE_INTUIT) {
1531 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1542 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1543 if (regtry(prog, s))
1550 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1551 if (regtry(prog, PL_reg_ganch))
1556 /* Messy cases: unanchored match. */
1557 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1558 /* we have /x+whatever/ */
1559 /* it must be a one character string (XXXX Except UTF?) */
1560 char ch = SvPVX(prog->anchored_substr)[0];
1566 while (s < strend) {
1568 DEBUG_r( did_match = 1 );
1569 if (regtry(prog, s)) goto got_it;
1571 while (s < strend && *s == ch)
1578 while (s < strend) {
1580 DEBUG_r( did_match = 1 );
1581 if (regtry(prog, s)) goto got_it;
1583 while (s < strend && *s == ch)
1589 DEBUG_r(did_match ||
1590 PerlIO_printf(Perl_debug_log,
1591 "Did not find anchored character...\n"));
1594 else if (do_utf8 == (UTF!=0) &&
1595 (prog->anchored_substr != Nullsv
1596 || (prog->float_substr != Nullsv
1597 && prog->float_max_offset < strend - s))) {
1598 SV *must = prog->anchored_substr
1599 ? prog->anchored_substr : prog->float_substr;
1601 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1603 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1604 char *last = HOP3c(strend, /* Cannot start after this */
1605 -(I32)(CHR_SVLEN(must)
1606 - (SvTAIL(must) != 0) + back_min), strbeg);
1607 char *last1; /* Last position checked before */
1613 last1 = HOPc(s, -1);
1615 last1 = s - 1; /* bogus */
1617 /* XXXX check_substr already used to find `s', can optimize if
1618 check_substr==must. */
1620 dontbother = end_shift;
1621 strend = HOPc(strend, -dontbother);
1622 while ( (s <= last) &&
1623 ((flags & REXEC_SCREAM)
1624 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1625 end_shift, &scream_pos, 0))
1626 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1627 (unsigned char*)strend, must,
1628 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1629 DEBUG_r( did_match = 1 );
1630 if (HOPc(s, -back_max) > last1) {
1631 last1 = HOPc(s, -back_min);
1632 s = HOPc(s, -back_max);
1635 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1637 last1 = HOPc(s, -back_min);
1641 while (s <= last1) {
1642 if (regtry(prog, s))
1648 while (s <= last1) {
1649 if (regtry(prog, s))
1655 DEBUG_r(did_match ||
1656 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1657 ((must == prog->anchored_substr)
1658 ? "anchored" : "floating"),
1660 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1662 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1665 else if ((c = prog->regstclass)) {
1666 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1667 /* don't bother with what can't match */
1668 strend = HOPc(strend, -(minlen - 1));
1670 SV *prop = sv_newmortal();
1672 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1674 if (find_byclass(prog, c, s, strend, startpos, 0))
1676 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1680 if (prog->float_substr != Nullsv) { /* Trim the end. */
1683 if (flags & REXEC_SCREAM) {
1684 last = screaminstr(sv, prog->float_substr, s - strbeg,
1685 end_shift, &scream_pos, 1); /* last one */
1687 last = scream_olds; /* Only one occurrence. */
1691 char *little = SvPV(prog->float_substr, len);
1693 if (SvTAIL(prog->float_substr)) {
1694 if (memEQ(strend - len + 1, little, len - 1))
1695 last = strend - len + 1;
1696 else if (!PL_multiline)
1697 last = memEQ(strend - len, little, len)
1698 ? strend - len : Nullch;
1704 last = rninstr(s, strend, little, little + len);
1706 last = strend; /* matching `$' */
1710 DEBUG_r(PerlIO_printf(Perl_debug_log,
1711 "%sCan't trim the tail, match fails (should not happen)%s\n",
1712 PL_colors[4],PL_colors[5]));
1713 goto phooey; /* Should not happen! */
1715 dontbother = strend - last + prog->float_min_offset;
1717 if (minlen && (dontbother < minlen))
1718 dontbother = minlen - 1;
1719 strend -= dontbother; /* this one's always in bytes! */
1720 /* We don't know much -- general case. */
1723 if (regtry(prog, s))
1732 if (regtry(prog, s))
1734 } while (s++ < strend);
1742 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1744 if (PL_reg_eval_set) {
1745 /* Preserve the current value of $^R */
1746 if (oreplsv != GvSV(PL_replgv))
1747 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1748 restored, the value remains
1750 restore_pos(aTHXo_ 0);
1753 /* make sure $`, $&, $', and $digit will work later */
1754 if ( !(flags & REXEC_NOT_FIRST) ) {
1755 if (RX_MATCH_COPIED(prog)) {
1756 Safefree(prog->subbeg);
1757 RX_MATCH_COPIED_off(prog);
1759 if (flags & REXEC_COPY_STR) {
1760 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1762 s = savepvn(strbeg, i);
1765 RX_MATCH_COPIED_on(prog);
1768 prog->subbeg = strbeg;
1769 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1776 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1777 PL_colors[4],PL_colors[5]));
1778 if (PL_reg_eval_set)
1779 restore_pos(aTHXo_ 0);
1784 - regtry - try match at specific point
1786 STATIC I32 /* 0 failure, 1 success */
1787 S_regtry(pTHX_ regexp *prog, char *startpos)
1795 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1797 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1800 PL_reg_eval_set = RS_init;
1802 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1803 (IV)(PL_stack_sp - PL_stack_base));
1805 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1806 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1807 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1809 /* Apparently this is not needed, judging by wantarray. */
1810 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1811 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1814 /* Make $_ available to executed code. */
1815 if (PL_reg_sv != DEFSV) {
1816 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1821 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1822 && (mg = mg_find(PL_reg_sv, 'g')))) {
1823 /* prepare for quick setting of pos */
1824 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1825 mg = mg_find(PL_reg_sv, 'g');
1829 PL_reg_oldpos = mg->mg_len;
1830 SAVEDESTRUCTOR_X(restore_pos, 0);
1833 Newz(22,PL_reg_curpm, 1, PMOP);
1834 PL_reg_curpm->op_pmregexp = prog;
1835 PL_reg_oldcurpm = PL_curpm;
1836 PL_curpm = PL_reg_curpm;
1837 if (RX_MATCH_COPIED(prog)) {
1838 /* Here is a serious problem: we cannot rewrite subbeg,
1839 since it may be needed if this match fails. Thus
1840 $` inside (?{}) could fail... */
1841 PL_reg_oldsaved = prog->subbeg;
1842 PL_reg_oldsavedlen = prog->sublen;
1843 RX_MATCH_COPIED_off(prog);
1846 PL_reg_oldsaved = Nullch;
1847 prog->subbeg = PL_bostr;
1848 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1850 prog->startp[0] = startpos - PL_bostr;
1851 PL_reginput = startpos;
1852 PL_regstartp = prog->startp;
1853 PL_regendp = prog->endp;
1854 PL_reglastparen = &prog->lastparen;
1855 prog->lastparen = 0;
1857 DEBUG_r(PL_reg_starttry = startpos);
1858 if (PL_reg_start_tmpl <= prog->nparens) {
1859 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1860 if(PL_reg_start_tmp)
1861 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1863 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1866 /* XXXX What this code is doing here?!!! There should be no need
1867 to do this again and again, PL_reglastparen should take care of
1870 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1871 * Actually, the code in regcppop() (which Ilya may be meaning by
1872 * PL_reglastparen), is not needed at all by the test suite
1873 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1874 * enough, for building DynaLoader, or otherwise this
1875 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1876 * will happen. Meanwhile, this code *is* needed for the
1877 * above-mentioned test suite tests to succeed. The common theme
1878 * on those tests seems to be returning null fields from matches.
1883 if (prog->nparens) {
1884 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1891 if (regmatch(prog->program + 1)) {
1892 prog->endp[0] = PL_reginput - PL_bostr;
1895 REGCP_UNWIND(lastcp);
1899 #define RE_UNWIND_BRANCH 1
1900 #define RE_UNWIND_BRANCHJ 2
1904 typedef struct { /* XX: makes sense to enlarge it... */
1908 } re_unwind_generic_t;
1921 } re_unwind_branch_t;
1923 typedef union re_unwind_t {
1925 re_unwind_generic_t generic;
1926 re_unwind_branch_t branch;
1930 - regmatch - main matching routine
1932 * Conceptually the strategy is simple: check to see whether the current
1933 * node matches, call self recursively to see whether the rest matches,
1934 * and then act accordingly. In practice we make some effort to avoid
1935 * recursion, in particular by going through "ordinary" nodes (that don't
1936 * need to know whether the rest of the match failed) by a loop instead of
1939 /* [lwall] I've hoisted the register declarations to the outer block in order to
1940 * maybe save a little bit of pushing and popping on the stack. It also takes
1941 * advantage of machines that use a register save mask on subroutine entry.
1943 STATIC I32 /* 0 failure, 1 success */
1944 S_regmatch(pTHX_ regnode *prog)
1946 register regnode *scan; /* Current node. */
1947 regnode *next; /* Next node. */
1948 regnode *inner; /* Next node in internal branch. */
1949 register I32 nextchr; /* renamed nextchr - nextchar colides with
1950 function of same name */
1951 register I32 n; /* no or next */
1952 register I32 ln; /* len or last */
1953 register char *s; /* operand or save */
1954 register char *locinput = PL_reginput;
1955 register I32 c1, c2, paren; /* case fold search, parenth */
1956 int minmod = 0, sw = 0, logical = 0;
1958 I32 firstcp = PL_savestack_ix;
1959 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1965 /* Note that nextchr is a byte even in UTF */
1966 nextchr = UCHARAT(locinput);
1968 while (scan != NULL) {
1969 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1971 # define sayYES goto yes
1972 # define sayNO goto no
1973 # define sayYES_FINAL goto yes_final
1974 # define sayYES_LOUD goto yes_loud
1975 # define sayNO_FINAL goto no_final
1976 # define sayNO_SILENT goto do_no
1977 # define saySAME(x) if (x) goto yes; else goto no
1978 # define REPORT_CODE_OFF 24
1980 # define sayYES return 1
1981 # define sayNO return 0
1982 # define sayYES_FINAL return 1
1983 # define sayYES_LOUD return 1
1984 # define sayNO_FINAL return 0
1985 # define sayNO_SILENT return 0
1986 # define saySAME(x) return x
1989 SV *prop = sv_newmortal();
1990 int docolor = *PL_colors[0];
1991 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1992 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
1993 /* The part of the string before starttry has one color
1994 (pref0_len chars), between starttry and current
1995 position another one (pref_len - pref0_len chars),
1996 after the current position the third one.
1997 We assume that pref0_len <= pref_len, otherwise we
1998 decrease pref0_len. */
1999 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2000 ? (5 + taill) - l : locinput - PL_bostr;
2003 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2005 pref0_len = pref_len - (locinput - PL_reg_starttry);
2006 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2007 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2008 ? (5 + taill) - pref_len : PL_regeol - locinput);
2009 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2013 if (pref0_len > pref_len)
2014 pref0_len = pref_len;
2015 regprop(prop, scan);
2016 PerlIO_printf(Perl_debug_log,
2017 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2018 (IV)(locinput - PL_bostr),
2019 PL_colors[4], pref0_len,
2020 locinput - pref_len, PL_colors[5],
2021 PL_colors[2], pref_len - pref0_len,
2022 locinput - pref_len + pref0_len, PL_colors[3],
2023 (docolor ? "" : "> <"),
2024 PL_colors[0], l, locinput, PL_colors[1],
2025 15 - l - pref_len + 1,
2027 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2031 next = scan + NEXT_OFF(scan);
2037 if (locinput == PL_bostr
2038 ? PL_regprev == '\n'
2040 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2042 /* regtill = regbol; */
2047 if (locinput == PL_bostr
2048 ? PL_regprev == '\n'
2049 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2055 if (locinput == PL_bostr)
2059 if (locinput == PL_reg_ganch)
2069 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2074 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2076 if (PL_regeol - locinput > 1)
2080 if (PL_regeol != locinput)
2084 if (!nextchr && locinput >= PL_regeol)
2086 nextchr = UCHARAT(++locinput);
2089 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2092 locinput += PL_utf8skip[nextchr];
2093 if (locinput > PL_regeol)
2095 nextchr = UCHARAT(locinput);
2098 nextchr = UCHARAT(++locinput);
2103 if (do_utf8 != (UTF!=0)) {
2111 if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len))
2120 if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len))
2126 nextchr = UCHARAT(locinput);
2129 /* Inline the first character, for speed. */
2130 if (UCHARAT(s) != nextchr)
2132 if (PL_regeol - locinput < ln)
2134 if (ln > 1 && memNE(s, locinput, ln))
2137 nextchr = UCHARAT(locinput);
2140 PL_reg_flags |= RF_tainted;
2150 c1 = OP(scan) == EXACTF;
2152 if (l >= PL_regeol) {
2155 if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2156 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2158 s += UTF ? UTF8SKIP(s) : 1;
2162 nextchr = UCHARAT(locinput);
2166 /* Inline the first character, for speed. */
2167 if (UCHARAT(s) != nextchr &&
2168 UCHARAT(s) != ((OP(scan) == EXACTF)
2169 ? PL_fold : PL_fold_locale)[nextchr])
2171 if (PL_regeol - locinput < ln)
2173 if (ln > 1 && (OP(scan) == EXACTF
2174 ? ibcmp(s, locinput, ln)
2175 : ibcmp_locale(s, locinput, ln)))
2178 nextchr = UCHARAT(locinput);
2182 if (!reginclass(scan, (U8*)locinput, do_utf8))
2184 if (locinput >= PL_regeol)
2186 locinput += PL_utf8skip[nextchr];
2187 nextchr = UCHARAT(locinput);
2191 nextchr = UCHARAT(locinput);
2192 if (!reginclass(scan, (U8*)locinput, do_utf8))
2194 if (!nextchr && locinput >= PL_regeol)
2196 nextchr = UCHARAT(++locinput);
2200 PL_reg_flags |= RF_tainted;
2206 if (!(OP(scan) == ALNUM
2207 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2208 : isALNUM_LC_utf8((U8*)locinput)))
2212 locinput += PL_utf8skip[nextchr];
2213 nextchr = UCHARAT(locinput);
2216 if (!(OP(scan) == ALNUM
2217 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2219 nextchr = UCHARAT(++locinput);
2222 PL_reg_flags |= RF_tainted;
2225 if (!nextchr && locinput >= PL_regeol)
2228 if (OP(scan) == NALNUM
2229 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2230 : isALNUM_LC_utf8((U8*)locinput))
2234 locinput += PL_utf8skip[nextchr];
2235 nextchr = UCHARAT(locinput);
2238 if (OP(scan) == NALNUM
2239 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2241 nextchr = UCHARAT(++locinput);
2245 PL_reg_flags |= RF_tainted;
2249 /* was last char in word? */
2251 if (locinput == PL_regbol)
2254 U8 *r = reghop((U8*)locinput, -1);
2256 ln = utf8_to_uv(r, s - (char*)r, 0, 0);
2258 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2259 ln = isALNUM_uni(ln);
2260 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2263 ln = isALNUM_LC_uni(ln);
2264 n = isALNUM_LC_utf8((U8*)locinput);
2268 ln = (locinput != PL_regbol) ?
2269 UCHARAT(locinput - 1) : PL_regprev;
2270 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2272 n = isALNUM(nextchr);
2275 ln = isALNUM_LC(ln);
2276 n = isALNUM_LC(nextchr);
2279 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2280 OP(scan) == BOUNDL))
2284 PL_reg_flags |= RF_tainted;
2290 if (UTF8_IS_CONTINUED(nextchr)) {
2291 if (!(OP(scan) == SPACE
2292 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2293 : isSPACE_LC_utf8((U8*)locinput)))
2297 locinput += PL_utf8skip[nextchr];
2298 nextchr = UCHARAT(locinput);
2301 if (!(OP(scan) == SPACE
2302 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2304 nextchr = UCHARAT(++locinput);
2307 if (!(OP(scan) == SPACE
2308 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2310 nextchr = UCHARAT(++locinput);
2314 PL_reg_flags |= RF_tainted;
2317 if (!nextchr && locinput >= PL_regeol)
2320 if (OP(scan) == NSPACE
2321 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2322 : isSPACE_LC_utf8((U8*)locinput))
2326 locinput += PL_utf8skip[nextchr];
2327 nextchr = UCHARAT(locinput);
2330 if (OP(scan) == NSPACE
2331 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2333 nextchr = UCHARAT(++locinput);
2336 PL_reg_flags |= RF_tainted;
2342 if (!(OP(scan) == DIGIT
2343 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2344 : isDIGIT_LC_utf8((U8*)locinput)))
2348 locinput += PL_utf8skip[nextchr];
2349 nextchr = UCHARAT(locinput);
2352 if (!(OP(scan) == DIGIT
2353 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2355 nextchr = UCHARAT(++locinput);
2358 PL_reg_flags |= RF_tainted;
2361 if (!nextchr && locinput >= PL_regeol)
2364 if (OP(scan) == NDIGIT
2365 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2366 : isDIGIT_LC_utf8((U8*)locinput))
2370 locinput += PL_utf8skip[nextchr];
2371 nextchr = UCHARAT(locinput);
2374 if (OP(scan) == NDIGIT
2375 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2377 nextchr = UCHARAT(++locinput);
2380 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2382 locinput += PL_utf8skip[nextchr];
2383 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2384 locinput += UTF8SKIP(locinput);
2385 if (locinput > PL_regeol)
2387 nextchr = UCHARAT(locinput);
2390 PL_reg_flags |= RF_tainted;
2394 n = ARG(scan); /* which paren pair */
2395 ln = PL_regstartp[n];
2396 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2397 if (*PL_reglastparen < n || ln == -1)
2398 sayNO; /* Do not match unless seen CLOSEn. */
2399 if (ln == PL_regendp[n])
2403 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2405 char *e = PL_bostr + PL_regendp[n];
2407 * Note that we can't do the "other character" lookup trick as
2408 * in the 8-bit case (no pun intended) because in Unicode we
2409 * have to map both upper and title case to lower case.
2411 if (OP(scan) == REFF) {
2415 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2425 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2432 nextchr = UCHARAT(locinput);
2436 /* Inline the first character, for speed. */
2437 if (UCHARAT(s) != nextchr &&
2439 (UCHARAT(s) != ((OP(scan) == REFF
2440 ? PL_fold : PL_fold_locale)[nextchr]))))
2442 ln = PL_regendp[n] - ln;
2443 if (locinput + ln > PL_regeol)
2445 if (ln > 1 && (OP(scan) == REF
2446 ? memNE(s, locinput, ln)
2448 ? ibcmp(s, locinput, ln)
2449 : ibcmp_locale(s, locinput, ln))))
2452 nextchr = UCHARAT(locinput);
2463 OP_4tree *oop = PL_op;
2464 COP *ocurcop = PL_curcop;
2465 SV **ocurpad = PL_curpad;
2469 PL_op = (OP_4tree*)PL_regdata->data[n];
2470 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2471 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2472 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2474 CALLRUNOPS(aTHX); /* Scalar context. */
2480 PL_curpad = ocurpad;
2481 PL_curcop = ocurcop;
2483 if (logical == 2) { /* Postponed subexpression. */
2485 MAGIC *mg = Null(MAGIC*);
2487 CHECKPOINT cp, lastcp;
2489 if(SvROK(ret) || SvRMAGICAL(ret)) {
2490 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2493 mg = mg_find(sv, 'r');
2496 re = (regexp *)mg->mg_obj;
2497 (void)ReREFCNT_inc(re);
2501 char *t = SvPV(ret, len);
2503 char *oprecomp = PL_regprecomp;
2504 I32 osize = PL_regsize;
2505 I32 onpar = PL_regnpar;
2508 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2510 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2511 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2512 PL_regprecomp = oprecomp;
2517 PerlIO_printf(Perl_debug_log,
2518 "Entering embedded `%s%.60s%s%s'\n",
2522 (strlen(re->precomp) > 60 ? "..." : ""))
2525 state.prev = PL_reg_call_cc;
2526 state.cc = PL_regcc;
2527 state.re = PL_reg_re;
2531 cp = regcppush(0); /* Save *all* the positions. */
2534 state.ss = PL_savestack_ix;
2535 *PL_reglastparen = 0;
2536 PL_reg_call_cc = &state;
2537 PL_reginput = locinput;
2539 /* XXXX This is too dramatic a measure... */
2542 if (regmatch(re->program + 1)) {
2543 /* Even though we succeeded, we need to restore
2544 global variables, since we may be wrapped inside
2545 SUSPEND, thus the match may be not finished yet. */
2547 /* XXXX Do this only if SUSPENDed? */
2548 PL_reg_call_cc = state.prev;
2549 PL_regcc = state.cc;
2550 PL_reg_re = state.re;
2551 cache_re(PL_reg_re);
2553 /* XXXX This is too dramatic a measure... */
2556 /* These are needed even if not SUSPEND. */
2562 REGCP_UNWIND(lastcp);
2564 PL_reg_call_cc = state.prev;
2565 PL_regcc = state.cc;
2566 PL_reg_re = state.re;
2567 cache_re(PL_reg_re);
2569 /* XXXX This is too dramatic a measure... */
2578 sv_setsv(save_scalar(PL_replgv), ret);
2582 n = ARG(scan); /* which paren pair */
2583 PL_reg_start_tmp[n] = locinput;
2588 n = ARG(scan); /* which paren pair */
2589 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2590 PL_regendp[n] = locinput - PL_bostr;
2591 if (n > *PL_reglastparen)
2592 *PL_reglastparen = n;
2595 n = ARG(scan); /* which paren pair */
2596 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2599 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2601 next = NEXTOPER(NEXTOPER(scan));
2603 next = scan + ARG(scan);
2604 if (OP(next) == IFTHEN) /* Fake one. */
2605 next = NEXTOPER(NEXTOPER(next));
2609 logical = scan->flags;
2611 /*******************************************************************
2612 PL_regcc contains infoblock about the innermost (...)* loop, and
2613 a pointer to the next outer infoblock.
2615 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2617 1) After matching X, regnode for CURLYX is processed;
2619 2) This regnode creates infoblock on the stack, and calls
2620 regmatch() recursively with the starting point at WHILEM node;
2622 3) Each hit of WHILEM node tries to match A and Z (in the order
2623 depending on the current iteration, min/max of {min,max} and
2624 greediness). The information about where are nodes for "A"
2625 and "Z" is read from the infoblock, as is info on how many times "A"
2626 was already matched, and greediness.
2628 4) After A matches, the same WHILEM node is hit again.
2630 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2631 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2632 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2633 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2634 of the external loop.
2636 Currently present infoblocks form a tree with a stem formed by PL_curcc
2637 and whatever it mentions via ->next, and additional attached trees
2638 corresponding to temporarily unset infoblocks as in "5" above.
2640 In the following picture infoblocks for outer loop of
2641 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2642 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2643 infoblocks are drawn below the "reset" infoblock.
2645 In fact in the picture below we do not show failed matches for Z and T
2646 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2647 more obvious *why* one needs to *temporary* unset infoblocks.]
2649 Matched REx position InfoBlocks Comment
2653 Y A)*?Z)*?T x <- O <- I
2654 YA )*?Z)*?T x <- O <- I
2655 YA A)*?Z)*?T x <- O <- I
2656 YAA )*?Z)*?T x <- O <- I
2657 YAA Z)*?T x <- O # Temporary unset I
2660 YAAZ Y(A)*?Z)*?T x <- O
2663 YAAZY (A)*?Z)*?T x <- O
2666 YAAZY A)*?Z)*?T x <- O <- I
2669 YAAZYA )*?Z)*?T x <- O <- I
2672 YAAZYA Z)*?T x <- O # Temporary unset I
2678 YAAZYAZ T x # Temporary unset O
2685 *******************************************************************/
2688 CHECKPOINT cp = PL_savestack_ix;
2689 /* No need to save/restore up to this paren */
2690 I32 parenfloor = scan->flags;
2692 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2694 cc.oldcc = PL_regcc;
2696 /* XXXX Probably it is better to teach regpush to support
2697 parenfloor > PL_regsize... */
2698 if (parenfloor > *PL_reglastparen)
2699 parenfloor = *PL_reglastparen; /* Pessimization... */
2700 cc.parenfloor = parenfloor;
2702 cc.min = ARG1(scan);
2703 cc.max = ARG2(scan);
2704 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2708 PL_reginput = locinput;
2709 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2711 PL_regcc = cc.oldcc;
2717 * This is really hard to understand, because after we match
2718 * what we're trying to match, we must make sure the rest of
2719 * the REx is going to match for sure, and to do that we have
2720 * to go back UP the parse tree by recursing ever deeper. And
2721 * if it fails, we have to reset our parent's current state
2722 * that we can try again after backing off.
2725 CHECKPOINT cp, lastcp;
2726 CURCUR* cc = PL_regcc;
2727 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2729 n = cc->cur + 1; /* how many we know we matched */
2730 PL_reginput = locinput;
2733 PerlIO_printf(Perl_debug_log,
2734 "%*s %ld out of %ld..%ld cc=%lx\n",
2735 REPORT_CODE_OFF+PL_regindent*2, "",
2736 (long)n, (long)cc->min,
2737 (long)cc->max, (long)cc)
2740 /* If degenerate scan matches "", assume scan done. */
2742 if (locinput == cc->lastloc && n >= cc->min) {
2743 PL_regcc = cc->oldcc;
2747 PerlIO_printf(Perl_debug_log,
2748 "%*s empty match detected, try continuation...\n",
2749 REPORT_CODE_OFF+PL_regindent*2, "")
2751 if (regmatch(cc->next))
2759 /* First just match a string of min scans. */
2763 cc->lastloc = locinput;
2764 if (regmatch(cc->scan))
2767 cc->lastloc = lastloc;
2772 /* Check whether we already were at this position.
2773 Postpone detection until we know the match is not
2774 *that* much linear. */
2775 if (!PL_reg_maxiter) {
2776 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2777 PL_reg_leftiter = PL_reg_maxiter;
2779 if (PL_reg_leftiter-- == 0) {
2780 I32 size = (PL_reg_maxiter + 7)/8;
2781 if (PL_reg_poscache) {
2782 if (PL_reg_poscache_size < size) {
2783 Renew(PL_reg_poscache, size, char);
2784 PL_reg_poscache_size = size;
2786 Zero(PL_reg_poscache, size, char);
2789 PL_reg_poscache_size = size;
2790 Newz(29, PL_reg_poscache, size, char);
2793 PerlIO_printf(Perl_debug_log,
2794 "%sDetected a super-linear match, switching on caching%s...\n",
2795 PL_colors[4], PL_colors[5])
2798 if (PL_reg_leftiter < 0) {
2799 I32 o = locinput - PL_bostr, b;
2801 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2804 if (PL_reg_poscache[o] & (1<<b)) {
2806 PerlIO_printf(Perl_debug_log,
2807 "%*s already tried at this position...\n",
2808 REPORT_CODE_OFF+PL_regindent*2, "")
2812 PL_reg_poscache[o] |= (1<<b);
2816 /* Prefer next over scan for minimal matching. */
2819 PL_regcc = cc->oldcc;
2822 cp = regcppush(cc->parenfloor);
2824 if (regmatch(cc->next)) {
2826 sayYES; /* All done. */
2828 REGCP_UNWIND(lastcp);
2834 if (n >= cc->max) { /* Maximum greed exceeded? */
2835 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2836 && !(PL_reg_flags & RF_warned)) {
2837 PL_reg_flags |= RF_warned;
2838 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2839 "Complex regular subexpression recursion",
2846 PerlIO_printf(Perl_debug_log,
2847 "%*s trying longer...\n",
2848 REPORT_CODE_OFF+PL_regindent*2, "")
2850 /* Try scanning more and see if it helps. */
2851 PL_reginput = locinput;
2853 cc->lastloc = locinput;
2854 cp = regcppush(cc->parenfloor);
2856 if (regmatch(cc->scan)) {
2860 REGCP_UNWIND(lastcp);
2863 cc->lastloc = lastloc;
2867 /* Prefer scan over next for maximal matching. */
2869 if (n < cc->max) { /* More greed allowed? */
2870 cp = regcppush(cc->parenfloor);
2872 cc->lastloc = locinput;
2874 if (regmatch(cc->scan)) {
2878 REGCP_UNWIND(lastcp);
2879 regcppop(); /* Restore some previous $<digit>s? */
2880 PL_reginput = locinput;
2882 PerlIO_printf(Perl_debug_log,
2883 "%*s failed, try continuation...\n",
2884 REPORT_CODE_OFF+PL_regindent*2, "")
2887 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2888 && !(PL_reg_flags & RF_warned)) {
2889 PL_reg_flags |= RF_warned;
2890 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2891 "Complex regular subexpression recursion",
2895 /* Failed deeper matches of scan, so see if this one works. */
2896 PL_regcc = cc->oldcc;
2899 if (regmatch(cc->next))
2905 cc->lastloc = lastloc;
2910 next = scan + ARG(scan);
2913 inner = NEXTOPER(NEXTOPER(scan));
2916 inner = NEXTOPER(scan);
2921 if (OP(next) != c1) /* No choice. */
2922 next = inner; /* Avoid recursion. */
2924 I32 lastparen = *PL_reglastparen;
2926 re_unwind_branch_t *uw;
2928 /* Put unwinding data on stack */
2929 unwind1 = SSNEWt(1,re_unwind_branch_t);
2930 uw = SSPTRt(unwind1,re_unwind_branch_t);
2933 uw->type = ((c1 == BRANCH)
2935 : RE_UNWIND_BRANCHJ);
2936 uw->lastparen = lastparen;
2938 uw->locinput = locinput;
2939 uw->nextchr = nextchr;
2941 uw->regindent = ++PL_regindent;
2944 REGCP_SET(uw->lastcp);
2946 /* Now go into the first branch */
2959 /* We suppose that the next guy does not need
2960 backtracking: in particular, it is of constant length,
2961 and has no parenths to influence future backrefs. */
2962 ln = ARG1(scan); /* min to match */
2963 n = ARG2(scan); /* max to match */
2964 paren = scan->flags;
2966 if (paren > PL_regsize)
2968 if (paren > *PL_reglastparen)
2969 *PL_reglastparen = paren;
2971 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2973 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2974 PL_reginput = locinput;
2977 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2979 if (ln && l == 0 && n >= ln
2980 /* In fact, this is tricky. If paren, then the
2981 fact that we did/didnot match may influence
2982 future execution. */
2983 && !(paren && ln == 0))
2985 locinput = PL_reginput;
2986 if (PL_regkind[(U8)OP(next)] == EXACT) {
2987 c1 = (U8)*STRING(next);
2988 if (OP(next) == EXACTF)
2990 else if (OP(next) == EXACTFL)
2991 c2 = PL_fold_locale[c1];
2998 /* This may be improved if l == 0. */
2999 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3000 /* If it could work, try it. */
3002 UCHARAT(PL_reginput) == c1 ||
3003 UCHARAT(PL_reginput) == c2)
3007 PL_regstartp[paren] =
3008 HOPc(PL_reginput, -l) - PL_bostr;
3009 PL_regendp[paren] = PL_reginput - PL_bostr;
3012 PL_regendp[paren] = -1;
3016 REGCP_UNWIND(lastcp);
3018 /* Couldn't or didn't -- move forward. */
3019 PL_reginput = locinput;
3020 if (regrepeat_hard(scan, 1, &l)) {
3022 locinput = PL_reginput;
3029 n = regrepeat_hard(scan, n, &l);
3030 if (n != 0 && l == 0
3031 /* In fact, this is tricky. If paren, then the
3032 fact that we did/didnot match may influence
3033 future execution. */
3034 && !(paren && ln == 0))
3036 locinput = PL_reginput;
3038 PerlIO_printf(Perl_debug_log,
3039 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3040 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3044 if (PL_regkind[(U8)OP(next)] == EXACT) {
3045 c1 = (U8)*STRING(next);
3046 if (OP(next) == EXACTF)
3048 else if (OP(next) == EXACTFL)
3049 c2 = PL_fold_locale[c1];
3058 /* If it could work, try it. */
3060 UCHARAT(PL_reginput) == c1 ||
3061 UCHARAT(PL_reginput) == c2)
3064 PerlIO_printf(Perl_debug_log,
3065 "%*s trying tail with n=%"IVdf"...\n",
3066 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3070 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3071 PL_regendp[paren] = PL_reginput - PL_bostr;
3074 PL_regendp[paren] = -1;
3078 REGCP_UNWIND(lastcp);
3080 /* Couldn't or didn't -- back up. */
3082 locinput = HOPc(locinput, -l);
3083 PL_reginput = locinput;
3090 paren = scan->flags; /* Which paren to set */
3091 if (paren > PL_regsize)
3093 if (paren > *PL_reglastparen)
3094 *PL_reglastparen = paren;
3095 ln = ARG1(scan); /* min to match */
3096 n = ARG2(scan); /* max to match */
3097 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3101 ln = ARG1(scan); /* min to match */
3102 n = ARG2(scan); /* max to match */
3103 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3108 scan = NEXTOPER(scan);
3114 scan = NEXTOPER(scan);
3118 * Lookahead to avoid useless match attempts
3119 * when we know what character comes next.
3121 if (PL_regkind[(U8)OP(next)] == EXACT) {
3122 U8 *s = (U8*)STRING(next);
3125 if (OP(next) == EXACTF)
3127 else if (OP(next) == EXACTFL)
3128 c2 = PL_fold_locale[c1];
3131 if (OP(next) == EXACTF) {
3132 c1 = to_utf8_lower(s);
3133 c2 = to_utf8_upper(s);
3136 c2 = c1 = utf8_to_uv_simple(s, NULL);
3142 PL_reginput = locinput;
3146 if (ln && regrepeat(scan, ln) < ln)
3148 locinput = PL_reginput;
3151 char *e; /* Should not check after this */
3152 char *old = locinput;
3154 if (n == REG_INFTY) {
3157 while (UTF8_IS_CONTINUATION(*(U8*)e))
3163 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3167 e = locinput + n - ln;
3173 /* Find place 'next' could work */
3176 while (locinput <= e && *locinput != c1)
3179 while (locinput <= e
3184 count = locinput - old;
3191 utf8_to_uv_simple((U8*)locinput, &len) != c1;
3196 for (count = 0; locinput <= e; count++) {
3197 UV c = utf8_to_uv_simple((U8*)locinput, &len);
3198 if (c == c1 || c == c2)
3206 /* PL_reginput == old now */
3207 if (locinput != old) {
3208 ln = 1; /* Did some */
3209 if (regrepeat(scan, count) < count)
3212 /* PL_reginput == locinput now */
3213 TRYPAREN(paren, ln, locinput);
3214 PL_reginput = locinput; /* Could be reset... */
3215 REGCP_UNWIND(lastcp);
3216 /* Couldn't or didn't -- move forward. */
3219 locinput += UTF8SKIP(locinput);
3225 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3229 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3231 c = UCHARAT(PL_reginput);
3233 /* If it could work, try it. */
3234 if (c1 == -1000 || c == c1 || c == c2)
3236 TRYPAREN(paren, n, PL_reginput);
3237 REGCP_UNWIND(lastcp);
3239 /* Couldn't or didn't -- move forward. */
3240 PL_reginput = locinput;
3241 if (regrepeat(scan, 1)) {
3243 locinput = PL_reginput;
3251 n = regrepeat(scan, n);
3252 locinput = PL_reginput;
3253 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3254 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3255 ln = n; /* why back off? */
3256 /* ...because $ and \Z can match before *and* after
3257 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3258 We should back off by one in this case. */
3259 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3268 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3270 c = UCHARAT(PL_reginput);
3272 /* If it could work, try it. */
3273 if (c1 == -1000 || c == c1 || c == c2)
3275 TRYPAREN(paren, n, PL_reginput);
3276 REGCP_UNWIND(lastcp);
3278 /* Couldn't or didn't -- back up. */
3280 PL_reginput = locinput = HOPc(locinput, -1);
3288 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3290 c = UCHARAT(PL_reginput);
3292 /* If it could work, try it. */
3293 if (c1 == -1000 || c == c1 || c == c2)
3295 TRYPAREN(paren, n, PL_reginput);
3296 REGCP_UNWIND(lastcp);
3298 /* Couldn't or didn't -- back up. */
3300 PL_reginput = locinput = HOPc(locinput, -1);
3307 if (PL_reg_call_cc) {
3308 re_cc_state *cur_call_cc = PL_reg_call_cc;
3309 CURCUR *cctmp = PL_regcc;
3310 regexp *re = PL_reg_re;
3311 CHECKPOINT cp, lastcp;
3313 cp = regcppush(0); /* Save *all* the positions. */
3315 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3317 PL_reginput = locinput; /* Make position available to
3319 cache_re(PL_reg_call_cc->re);
3320 PL_regcc = PL_reg_call_cc->cc;
3321 PL_reg_call_cc = PL_reg_call_cc->prev;
3322 if (regmatch(cur_call_cc->node)) {
3323 PL_reg_call_cc = cur_call_cc;
3327 REGCP_UNWIND(lastcp);
3329 PL_reg_call_cc = cur_call_cc;
3335 PerlIO_printf(Perl_debug_log,
3336 "%*s continuation failed...\n",
3337 REPORT_CODE_OFF+PL_regindent*2, "")
3341 if (locinput < PL_regtill) {
3342 DEBUG_r(PerlIO_printf(Perl_debug_log,
3343 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3345 (long)(locinput - PL_reg_starttry),
3346 (long)(PL_regtill - PL_reg_starttry),
3348 sayNO_FINAL; /* Cannot match: too short. */
3350 PL_reginput = locinput; /* put where regtry can find it */
3351 sayYES_FINAL; /* Success! */
3353 PL_reginput = locinput; /* put where regtry can find it */
3354 sayYES_LOUD; /* Success! */
3357 PL_reginput = locinput;
3362 if (UTF) { /* XXXX This is absolutely
3363 broken, we read before
3365 s = HOPMAYBEc(locinput, -scan->flags);
3371 if (locinput < PL_bostr + scan->flags)
3373 PL_reginput = locinput - scan->flags;
3378 PL_reginput = locinput;
3383 if (UTF) { /* XXXX This is absolutely
3384 broken, we read before
3386 s = HOPMAYBEc(locinput, -scan->flags);
3387 if (!s || s < PL_bostr)
3392 if (locinput < PL_bostr + scan->flags)
3394 PL_reginput = locinput - scan->flags;
3399 PL_reginput = locinput;
3402 inner = NEXTOPER(NEXTOPER(scan));
3403 if (regmatch(inner) != n) {
3418 if (OP(scan) == SUSPEND) {
3419 locinput = PL_reginput;
3420 nextchr = UCHARAT(locinput);
3425 next = scan + ARG(scan);
3430 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3431 PTR2UV(scan), OP(scan));
3432 Perl_croak(aTHX_ "regexp memory corruption");
3439 * We get here only if there's trouble -- normally "case END" is
3440 * the terminating point.
3442 Perl_croak(aTHX_ "corrupted regexp pointers");
3448 PerlIO_printf(Perl_debug_log,
3449 "%*s %scould match...%s\n",
3450 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3454 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3455 PL_colors[4],PL_colors[5]));
3461 #if 0 /* Breaks $^R */
3469 PerlIO_printf(Perl_debug_log,
3470 "%*s %sfailed...%s\n",
3471 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3477 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3480 case RE_UNWIND_BRANCH:
3481 case RE_UNWIND_BRANCHJ:
3483 re_unwind_branch_t *uwb = &(uw->branch);
3484 I32 lastparen = uwb->lastparen;
3486 REGCP_UNWIND(uwb->lastcp);
3487 for (n = *PL_reglastparen; n > lastparen; n--)
3489 *PL_reglastparen = n;
3490 scan = next = uwb->next;
3492 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3493 ? BRANCH : BRANCHJ) ) { /* Failure */
3500 /* Have more choice yet. Reuse the same uwb. */
3502 if ((n = (uwb->type == RE_UNWIND_BRANCH
3503 ? NEXT_OFF(next) : ARG(next))))
3506 next = NULL; /* XXXX Needn't unwinding in this case... */
3508 next = NEXTOPER(scan);
3509 if (uwb->type == RE_UNWIND_BRANCHJ)
3510 next = NEXTOPER(next);
3511 locinput = uwb->locinput;
3512 nextchr = uwb->nextchr;
3514 PL_regindent = uwb->regindent;
3521 Perl_croak(aTHX_ "regexp unwind memory corruption");
3532 - regrepeat - repeatedly match something simple, report how many
3535 * [This routine now assumes that it will only match on things of length 1.
3536 * That was true before, but now we assume scan - reginput is the count,
3537 * rather than incrementing count on every character. [Er, except utf8.]]
3540 S_regrepeat(pTHX_ regnode *p, I32 max)
3542 register char *scan;
3544 register char *loceol = PL_regeol;
3545 register I32 hardcount = 0;
3546 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3549 if (max != REG_INFTY && max < loceol - scan)
3550 loceol = scan + max;
3555 while (scan < loceol && hardcount < max && *scan != '\n') {
3556 scan += UTF8SKIP(scan);
3560 while (scan < loceol && *scan != '\n')
3567 case EXACT: /* length of string is 1 */
3569 while (scan < loceol && UCHARAT(scan) == c)
3572 case EXACTF: /* length of string is 1 */
3574 while (scan < loceol &&
3575 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3578 case EXACTFL: /* length of string is 1 */
3579 PL_reg_flags |= RF_tainted;
3581 while (scan < loceol &&
3582 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3588 while (hardcount < max && scan < loceol &&
3589 reginclass(p, (U8*)scan, do_utf8)) {
3590 scan += UTF8SKIP(scan);
3594 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3601 while (hardcount < max && scan < loceol &&
3602 swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3603 scan += UTF8SKIP(scan);
3607 while (scan < loceol && isALNUM(*scan))
3612 PL_reg_flags |= RF_tainted;
3615 while (hardcount < max && scan < loceol &&
3616 isALNUM_LC_utf8((U8*)scan)) {
3617 scan += UTF8SKIP(scan);
3621 while (scan < loceol && isALNUM_LC(*scan))
3628 while (hardcount < max && scan < loceol &&
3629 !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3630 scan += UTF8SKIP(scan);
3634 while (scan < loceol && !isALNUM(*scan))
3639 PL_reg_flags |= RF_tainted;
3642 while (hardcount < max && scan < loceol &&
3643 !isALNUM_LC_utf8((U8*)scan)) {
3644 scan += UTF8SKIP(scan);
3648 while (scan < loceol && !isALNUM_LC(*scan))
3655 while (hardcount < max && scan < loceol &&
3656 (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3657 scan += UTF8SKIP(scan);
3661 while (scan < loceol && isSPACE(*scan))
3666 PL_reg_flags |= RF_tainted;
3669 while (hardcount < max && scan < loceol &&
3670 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3671 scan += UTF8SKIP(scan);
3675 while (scan < loceol && isSPACE_LC(*scan))
3682 while (hardcount < max && scan < loceol &&
3683 !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3684 scan += UTF8SKIP(scan);
3688 while (scan < loceol && !isSPACE(*scan))
3693 PL_reg_flags |= RF_tainted;
3696 while (hardcount < max && scan < loceol &&
3697 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3698 scan += UTF8SKIP(scan);
3702 while (scan < loceol && !isSPACE_LC(*scan))
3709 while (hardcount < max && scan < loceol &&
3710 swash_fetch(PL_utf8_digit,(U8*)scan)) {
3711 scan += UTF8SKIP(scan);
3715 while (scan < loceol && isDIGIT(*scan))
3722 while (hardcount < max && scan < loceol &&
3723 !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3724 scan += UTF8SKIP(scan);
3728 while (scan < loceol && !isDIGIT(*scan))
3732 default: /* Called on something of 0 width. */
3733 break; /* So match right here or not at all. */
3739 c = scan - PL_reginput;
3744 SV *prop = sv_newmortal();
3747 PerlIO_printf(Perl_debug_log,
3748 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3749 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3756 - regrepeat_hard - repeatedly match something, report total lenth and length
3758 * The repeater is supposed to have constant length.
3762 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3764 register char *scan;
3765 register char *start;
3766 register char *loceol = PL_regeol;
3768 I32 count = 0, res = 1;
3773 start = PL_reginput;
3774 if (DO_UTF8(PL_reg_sv)) {
3775 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3778 while (start < PL_reginput) {
3780 start += UTF8SKIP(start);
3791 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3793 *lp = l = PL_reginput - start;
3794 if (max != REG_INFTY && l*max < loceol - scan)
3795 loceol = scan + l*max;
3808 - regclass_swash - prepare the utf8 swash
3812 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3817 if (PL_regdata && PL_regdata->count) {
3820 if (PL_regdata->what[n] == 's') {
3821 SV *rv = (SV*)PL_regdata->data[n];
3822 AV *av = (AV*)SvRV((SV*)rv);
3825 si = *av_fetch(av, 0, FALSE);
3826 a = av_fetch(av, 1, FALSE);
3830 else if (si && doinit) {
3831 sw = swash_init("utf8", "", si, 1, 0);
3832 (void)av_store(av, 1, sw);
3844 - reginclass - determine if a character falls into a character class
3848 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3850 char flags = ANYOF_FLAGS(n);
3856 c = utf8_to_uv_simple(p, &len);
3860 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3861 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3862 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3865 if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256)
3868 SV *sw = regclass_swash(n, TRUE, 0);
3871 if (swash_fetch(sw, p))
3873 else if (flags & ANYOF_FOLD) {
3874 U8 tmpbuf[UTF8_MAXLEN+1];
3876 if (flags & ANYOF_LOCALE) {
3877 PL_reg_flags |= RF_tainted;
3878 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3881 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3882 if (swash_fetch(sw, tmpbuf))
3888 if (!match && c < 256) {
3889 if (ANYOF_BITMAP_TEST(n, c))
3891 else if (flags & ANYOF_FOLD) {
3894 if (flags & ANYOF_LOCALE) {
3895 PL_reg_flags |= RF_tainted;
3896 f = PL_fold_locale[c];
3900 if (f != c && ANYOF_BITMAP_TEST(n, f))
3904 if (!match && (flags & ANYOF_CLASS)) {
3905 PL_reg_flags |= RF_tainted;
3907 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3908 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3909 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3910 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3911 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3912 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3913 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3914 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3915 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3916 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3917 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3918 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3919 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3920 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3921 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3922 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3923 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3924 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3925 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3926 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3931 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3932 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3933 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3937 ) /* How's that for a conditional? */
3944 return (flags & ANYOF_INVERT) ? !match : match;
3948 S_reghop(pTHX_ U8 *s, I32 off)
3950 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3954 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3957 while (off-- && s < lim) {
3958 /* XXX could check well-formedness here */
3966 if (UTF8_IS_CONTINUED(*s)) {
3967 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3970 /* XXX could check well-formedness here */
3978 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3980 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3984 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
3987 while (off-- && s < lim) {
3988 /* XXX could check well-formedness here */
3998 if (UTF8_IS_CONTINUED(*s)) {
3999 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4002 /* XXX could check well-formedness here */
4018 restore_pos(pTHXo_ void *arg)
4020 if (PL_reg_eval_set) {
4021 if (PL_reg_oldsaved) {
4022 PL_reg_re->subbeg = PL_reg_oldsaved;
4023 PL_reg_re->sublen = PL_reg_oldsavedlen;
4024 RX_MATCH_COPIED_on(PL_reg_re);
4026 PL_reg_magic->mg_len = PL_reg_oldpos;
4027 PL_reg_eval_set = 0;
4028 PL_curpm = PL_reg_oldcurpm;