5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
120 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
121 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
122 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
126 static void restore_pos(pTHXo_ void *arg);
130 S_regcppush(pTHX_ I32 parenfloor)
132 int retval = PL_savestack_ix;
133 #define REGCP_PAREN_ELEMS 4
134 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
137 #define REGCP_OTHER_ELEMS 5
138 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
139 for (p = PL_regsize; p > parenfloor; p--) {
140 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
141 SSPUSHINT(PL_regendp[p]);
142 SSPUSHINT(PL_regstartp[p]);
143 SSPUSHPTR(PL_reg_start_tmp[p]);
146 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
147 SSPUSHINT(PL_regsize);
148 SSPUSHINT(*PL_reglastparen);
149 SSPUSHPTR(PL_reginput);
150 #define REGCP_FRAME_ELEMS 2
151 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
152 * are needed for the regexp context stack bookkeeping. */
153 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
154 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
159 /* These are needed since we do not localize EVAL nodes: */
160 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
161 " Setting an EVAL scope, savestack=%"IVdf"\n", \
162 (IV)PL_savestack_ix)); cp = PL_savestack_ix
164 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
165 PerlIO_printf(Perl_debug_log, \
166 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
167 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
177 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
179 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
180 i = SSPOPINT; /* Parentheses elements to pop. */
181 input = (char *) SSPOPPTR;
182 *PL_reglastparen = SSPOPINT;
183 PL_regsize = SSPOPINT;
185 /* Now restore the parentheses context. */
186 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
187 i > 0; i -= REGCP_PAREN_ELEMS) {
188 paren = (U32)SSPOPINT;
189 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
190 PL_regstartp[paren] = SSPOPINT;
192 if (paren <= *PL_reglastparen)
193 PL_regendp[paren] = tmps;
195 PerlIO_printf(Perl_debug_log,
196 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
197 (UV)paren, (IV)PL_regstartp[paren],
198 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
199 (IV)PL_regendp[paren],
200 (paren > *PL_reglastparen ? "(no)" : ""));
204 if (*PL_reglastparen + 1 <= PL_regnpar) {
205 PerlIO_printf(Perl_debug_log,
206 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
207 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
211 /* It would seem that the similar code in regtry()
212 * already takes care of this, and in fact it is in
213 * a better location to since this code can #if 0-ed out
214 * but the code in regtry() is needed or otherwise tests
215 * requiring null fields (pat.t#187 and split.t#{13,14}
216 * (as of patchlevel 7877) will fail. Then again,
217 * this code seems to be necessary or otherwise
218 * building DynaLoader will fail:
219 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
221 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
222 if (paren > PL_regsize)
223 PL_regstartp[paren] = -1;
224 PL_regendp[paren] = -1;
231 S_regcp_set_to(pTHX_ I32 ss)
233 I32 tmp = PL_savestack_ix;
235 PL_savestack_ix = ss;
237 PL_savestack_ix = tmp;
241 typedef struct re_cc_state
245 struct re_cc_state *prev;
250 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
252 #define TRYPAREN(paren, n, input) { \
255 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
256 PL_regendp[paren] = input - PL_bostr; \
259 PL_regendp[paren] = -1; \
261 if (regmatch(next)) \
264 PL_regendp[paren] = -1; \
269 * pregexec and friends
273 - pregexec - match a regexp against a string
276 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
277 char *strbeg, I32 minend, SV *screamer, U32 nosave)
278 /* strend: pointer to null at end of string */
279 /* strbeg: real beginning of string */
280 /* minend: end of match must be >=minend after stringarg. */
281 /* nosave: For optimizations. */
284 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
285 nosave ? 0 : REXEC_COPY_STR);
289 S_cache_re(pTHX_ regexp *prog)
291 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
293 PL_regprogram = prog->program;
295 PL_regnpar = prog->nparens;
296 PL_regdata = prog->data;
301 * Need to implement the following flags for reg_anch:
303 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
306 * INTUIT_AUTORITATIVE_ML
307 * INTUIT_ONCE_NOML - Intuit can match in one location only.
310 * Another flag for this function: SECOND_TIME (so that float substrs
311 * with giant delta may be not rechecked).
314 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
317 Otherwise, only SvCUR(sv) is used to get strbeg. */
319 /* XXXX We assume that strpos is strbeg unless sv. */
321 /* XXXX Some places assume that there is a fixed substring.
322 An update may be needed if optimizer marks as "INTUITable"
323 RExen without fixed substrings. Similarly, it is assumed that
324 lengths of all the strings are no more than minlen, thus they
325 cannot come from lookahead.
326 (Or minlen should take into account lookahead.) */
328 /* A failure to find a constant substring means that there is no need to make
329 an expensive call to REx engine, thus we celebrate a failure. Similarly,
330 finding a substring too deep into the string means that less calls to
331 regtry() should be needed.
333 REx compiler's optimizer found 4 possible hints:
334 a) Anchored substring;
336 c) Whether we are anchored (beginning-of-line or \G);
337 d) First node (of those at offset 0) which may distingush positions;
338 We use a)b)d) and multiline-part of c), and try to find a position in the
339 string which does not contradict any of them.
342 /* Most of decisions we do here should have been done at compile time.
343 The nodes of the REx which we used for the search should have been
344 deleted from the finite automaton. */
347 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
348 char *strend, U32 flags, re_scream_pos_data *data)
350 register I32 start_shift;
351 /* Should be nonnegative! */
352 register I32 end_shift;
359 register char *other_last = Nullch; /* other substr checked before this */
360 char *check_at; /* check substr found at this pos */
362 char *i_strpos = strpos;
365 DEBUG_r( if (!PL_colorset) reginitcolors() );
366 DEBUG_r(PerlIO_printf(Perl_debug_log,
367 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
368 PL_colors[4],PL_colors[5],PL_colors[0],
371 (strlen(prog->precomp) > 60 ? "..." : ""),
373 (int)(strend - strpos > 60 ? 60 : strend - strpos),
374 strpos, PL_colors[1],
375 (strend - strpos > 60 ? "..." : ""))
378 if (prog->reganch & ROPT_UTF8)
379 PL_reg_flags |= RF_utf8;
381 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
382 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
385 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
387 check = prog->check_substr;
388 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
389 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
390 || ( (prog->reganch & ROPT_ANCH_BOL)
391 && !PL_multiline ) ); /* Check after \n? */
394 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
395 /* SvCUR is not set on references: SvRV and SvPVX overlap */
397 && (strpos != strbeg)) {
398 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
401 if (prog->check_offset_min == prog->check_offset_max) {
402 /* Substring at constant offset from beg-of-str... */
405 s = HOP3c(strpos, prog->check_offset_min, strend);
407 slen = SvCUR(check); /* >= 1 */
409 if ( strend - s > slen || strend - s < slen - 1
410 || (strend - s == slen && strend[-1] != '\n')) {
411 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
414 /* Now should match s[0..slen-2] */
416 if (slen && (*SvPVX(check) != *s
418 && memNE(SvPVX(check), s, slen)))) {
420 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
424 else if (*SvPVX(check) != *s
425 || ((slen = SvCUR(check)) > 1
426 && memNE(SvPVX(check), s, slen)))
428 goto success_at_start;
431 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
433 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
434 end_shift = prog->minlen - start_shift -
435 CHR_SVLEN(check) + (SvTAIL(check) != 0);
437 I32 end = prog->check_offset_max + CHR_SVLEN(check)
438 - (SvTAIL(check) != 0);
439 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
441 if (end_shift < eshift)
445 else { /* Can match at random position */
448 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
449 /* Should be nonnegative! */
450 end_shift = prog->minlen - start_shift -
451 CHR_SVLEN(check) + (SvTAIL(check) != 0);
454 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
456 Perl_croak(aTHX_ "panic: end_shift");
460 /* Find a possible match in the region s..strend by looking for
461 the "check" substring in the region corrected by start/end_shift. */
462 if (flags & REXEC_SCREAM) {
463 I32 p = -1; /* Internal iterator of scream. */
464 I32 *pp = data ? data->scream_pos : &p;
466 if (PL_screamfirst[BmRARE(check)] >= 0
467 || ( BmRARE(check) == '\n'
468 && (BmPREVIOUS(check) == SvCUR(check) - 1)
470 s = screaminstr(sv, check,
471 start_shift + (s - strbeg), end_shift, pp, 0);
475 *data->scream_olds = s;
478 s = fbm_instr(HOP3(s, start_shift, strend),
479 HOP3(strend, -end_shift, strbeg),
480 check, PL_multiline ? FBMrf_MULTILINE : 0);
482 /* Update the count-of-usability, remove useless subpatterns,
485 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
486 (s ? "Found" : "Did not find"),
487 ((check == prog->anchored_substr) ? "anchored" : "floating"),
489 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
491 PL_colors[1], (SvTAIL(check) ? "$" : ""),
492 (s ? " at offset " : "...\n") ) );
499 /* Finish the diagnostic message */
500 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
502 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
503 Start with the other substr.
504 XXXX no SCREAM optimization yet - and a very coarse implementation
505 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
506 *always* match. Probably should be marked during compile...
507 Probably it is right to do no SCREAM here...
510 if (prog->float_substr && prog->anchored_substr) {
511 /* Take into account the "other" substring. */
512 /* XXXX May be hopelessly wrong for UTF... */
515 if (check == prog->float_substr) {
518 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
521 t = s - prog->check_offset_max;
522 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
523 && (!(prog->reganch & ROPT_UTF8)
524 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
529 t = HOP3c(t, prog->anchored_offset, strend);
530 if (t < other_last) /* These positions already checked */
532 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
535 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
536 /* On end-of-str: see comment below. */
537 s = fbm_instr((unsigned char*)t,
538 HOP3(HOP3(last1, prog->anchored_offset, strend)
539 + SvCUR(prog->anchored_substr),
540 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
541 prog->anchored_substr,
542 PL_multiline ? FBMrf_MULTILINE : 0);
543 DEBUG_r(PerlIO_printf(Perl_debug_log,
544 "%s anchored substr `%s%.*s%s'%s",
545 (s ? "Found" : "Contradicts"),
547 (int)(SvCUR(prog->anchored_substr)
548 - (SvTAIL(prog->anchored_substr)!=0)),
549 SvPVX(prog->anchored_substr),
550 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
552 if (last1 >= last2) {
553 DEBUG_r(PerlIO_printf(Perl_debug_log,
554 ", giving up...\n"));
557 DEBUG_r(PerlIO_printf(Perl_debug_log,
558 ", trying floating at offset %ld...\n",
559 (long)(HOP3c(s1, 1, strend) - i_strpos)));
560 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
561 s = HOP3c(last, 1, strend);
565 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
566 (long)(s - i_strpos)));
567 t = HOP3c(s, -prog->anchored_offset, strbeg);
568 other_last = HOP3c(s, 1, strend);
576 else { /* Take into account the floating substring. */
580 t = HOP3c(s, -start_shift, strbeg);
582 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
583 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
584 last = HOP3c(t, prog->float_max_offset, strend);
585 s = HOP3c(t, prog->float_min_offset, strend);
588 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
589 /* fbm_instr() takes into account exact value of end-of-str
590 if the check is SvTAIL(ed). Since false positives are OK,
591 and end-of-str is not later than strend we are OK. */
592 s = fbm_instr((unsigned char*)s,
593 (unsigned char*)last + SvCUR(prog->float_substr)
594 - (SvTAIL(prog->float_substr)!=0),
595 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
596 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
597 (s ? "Found" : "Contradicts"),
599 (int)(SvCUR(prog->float_substr)
600 - (SvTAIL(prog->float_substr)!=0)),
601 SvPVX(prog->float_substr),
602 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
605 DEBUG_r(PerlIO_printf(Perl_debug_log,
606 ", giving up...\n"));
609 DEBUG_r(PerlIO_printf(Perl_debug_log,
610 ", trying anchored starting at offset %ld...\n",
611 (long)(s1 + 1 - i_strpos)));
613 s = HOP3c(t, 1, strend);
617 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
618 (long)(s - i_strpos)));
619 other_last = s; /* Fix this later. --Hugo */
628 t = s - prog->check_offset_max;
629 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
630 && (!(prog->reganch & ROPT_UTF8)
631 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
633 /* Fixed substring is found far enough so that the match
634 cannot start at strpos. */
636 if (ml_anch && t[-1] != '\n') {
637 /* Eventually fbm_*() should handle this, but often
638 anchored_offset is not 0, so this check will not be wasted. */
639 /* XXXX In the code below we prefer to look for "^" even in
640 presence of anchored substrings. And we search even
641 beyond the found float position. These pessimizations
642 are historical artefacts only. */
644 while (t < strend - prog->minlen) {
646 if (t < check_at - prog->check_offset_min) {
647 if (prog->anchored_substr) {
648 /* Since we moved from the found position,
649 we definitely contradict the found anchored
650 substr. Due to the above check we do not
651 contradict "check" substr.
652 Thus we can arrive here only if check substr
653 is float. Redo checking for "other"=="fixed".
656 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
657 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
658 goto do_other_anchored;
660 /* We don't contradict the found floating substring. */
661 /* XXXX Why not check for STCLASS? */
663 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
664 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
667 /* Position contradicts check-string */
668 /* XXXX probably better to look for check-string
669 than for "\n", so one should lower the limit for t? */
670 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
671 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
672 other_last = strpos = s = t + 1;
677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
678 PL_colors[0],PL_colors[1]));
682 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
683 PL_colors[0],PL_colors[1]));
687 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
690 /* The found string does not prohibit matching at strpos,
691 - no optimization of calling REx engine can be performed,
692 unless it was an MBOL and we are not after MBOL,
693 or a future STCLASS check will fail this. */
695 /* Even in this situation we may use MBOL flag if strpos is offset
696 wrt the start of the string. */
697 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
698 && (strpos != strbeg) && strpos[-1] != '\n'
699 /* May be due to an implicit anchor of m{.*foo} */
700 && !(prog->reganch & ROPT_IMPLICIT))
705 DEBUG_r( if (ml_anch)
706 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
707 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
710 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
711 && prog->check_substr /* Could be deleted already */
712 && --BmUSEFUL(prog->check_substr) < 0
713 && prog->check_substr == prog->float_substr)
715 /* If flags & SOMETHING - do not do it many times on the same match */
716 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
717 SvREFCNT_dec(prog->check_substr);
718 prog->check_substr = Nullsv; /* disable */
719 prog->float_substr = Nullsv; /* clear */
720 check = Nullsv; /* abort */
722 /* XXXX This is a remnant of the old implementation. It
723 looks wasteful, since now INTUIT can use many
725 prog->reganch &= ~RE_USE_INTUIT;
732 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
733 if (prog->regstclass) {
734 /* minlen == 0 is possible if regstclass is \b or \B,
735 and the fixed substr is ''$.
736 Since minlen is already taken into account, s+1 is before strend;
737 accidentally, minlen >= 1 guaranties no false positives at s + 1
738 even for \b or \B. But (minlen? 1 : 0) below assumes that
739 regstclass does not come from lookahead... */
740 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
741 This leaves EXACTF only, which is dealt with in find_byclass(). */
742 U8* str = (U8*)STRING(prog->regstclass);
743 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
744 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
746 char *endpos = (prog->anchored_substr || ml_anch)
747 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
748 : (prog->float_substr
749 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
752 char *startpos = strbeg;
755 if (prog->reganch & ROPT_UTF8) {
756 PL_regdata = prog->data;
759 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
764 if (endpos == strend) {
765 DEBUG_r( PerlIO_printf(Perl_debug_log,
766 "Could not match STCLASS...\n") );
769 DEBUG_r( PerlIO_printf(Perl_debug_log,
770 "This position contradicts STCLASS...\n") );
771 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
773 /* Contradict one of substrings */
774 if (prog->anchored_substr) {
775 if (prog->anchored_substr == check) {
776 DEBUG_r( what = "anchored" );
778 s = HOP3c(t, 1, strend);
779 if (s + start_shift + end_shift > strend) {
780 /* XXXX Should be taken into account earlier? */
781 DEBUG_r( PerlIO_printf(Perl_debug_log,
782 "Could not match STCLASS...\n") );
787 DEBUG_r( PerlIO_printf(Perl_debug_log,
788 "Looking for %s substr starting at offset %ld...\n",
789 what, (long)(s + start_shift - i_strpos)) );
792 /* Have both, check_string is floating */
793 if (t + start_shift >= check_at) /* Contradicts floating=check */
794 goto retry_floating_check;
795 /* Recheck anchored substring, but not floating... */
799 DEBUG_r( PerlIO_printf(Perl_debug_log,
800 "Looking for anchored substr starting at offset %ld...\n",
801 (long)(other_last - i_strpos)) );
802 goto do_other_anchored;
804 /* Another way we could have checked stclass at the
805 current position only: */
810 DEBUG_r( PerlIO_printf(Perl_debug_log,
811 "Looking for /%s^%s/m starting at offset %ld...\n",
812 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
815 if (!prog->float_substr) /* Could have been deleted */
817 /* Check is floating subtring. */
818 retry_floating_check:
819 t = check_at - start_shift;
820 DEBUG_r( what = "floating" );
821 goto hop_and_restart;
824 PerlIO_printf(Perl_debug_log,
825 "By STCLASS: moving %ld --> %ld\n",
826 (long)(t - i_strpos), (long)(s - i_strpos));
828 PerlIO_printf(Perl_debug_log,
829 "Does not contradict STCLASS...\n") );
832 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
833 PL_colors[4], (check ? "Guessed" : "Giving up"),
834 PL_colors[5], (long)(s - i_strpos)) );
837 fail_finish: /* Substring not found */
838 if (prog->check_substr) /* could be removed already */
839 BmUSEFUL(prog->check_substr) += 5; /* hooray */
841 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
842 PL_colors[4],PL_colors[5]));
846 /* We know what class REx starts with. Try to find this position... */
848 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
850 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
856 register I32 tmp = 1; /* Scratch variable? */
857 register bool do_utf8 = DO_UTF8(PL_reg_sv);
859 /* We know what class it must start with. */
863 if (reginclass(c, (U8*)s, do_utf8)) {
864 if (tmp && (norun || regtry(prog, s)))
871 s += do_utf8 ? UTF8SKIP(s) : 1;
878 c1 = to_utf8_lower((U8*)m);
879 c2 = to_utf8_upper((U8*)m);
890 c2 = PL_fold_locale[c1];
895 e = s; /* Due to minlen logic of intuit() */
901 if ( utf8_to_uv_simple((U8*)s, &len) == c1
908 UV c = utf8_to_uv_simple((U8*)s, &len);
909 if ( (c == c1 || c == c2) && regtry(prog, s) )
918 && (ln == 1 || !(OP(c) == EXACTF
920 : ibcmp_locale(s, m, ln)))
921 && (norun || regtry(prog, s)) )
927 if ( (*(U8*)s == c1 || *(U8*)s == c2)
928 && (ln == 1 || !(OP(c) == EXACTF
930 : ibcmp_locale(s, m, ln)))
931 && (norun || regtry(prog, s)) )
938 PL_reg_flags |= RF_tainted;
945 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
947 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
949 tmp = ((OP(c) == BOUND ?
950 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
952 if (tmp == !(OP(c) == BOUND ?
953 swash_fetch(PL_utf8_alnum, (U8*)s) :
954 isALNUM_LC_utf8((U8*)s)))
957 if ((norun || regtry(prog, s)))
964 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
965 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
968 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
970 if ((norun || regtry(prog, s)))
976 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
980 PL_reg_flags |= RF_tainted;
987 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
989 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
991 tmp = ((OP(c) == NBOUND ?
992 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
994 if (tmp == !(OP(c) == NBOUND ?
995 swash_fetch(PL_utf8_alnum, (U8*)s) :
996 isALNUM_LC_utf8((U8*)s)))
998 else if ((norun || regtry(prog, s)))
1004 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
1005 tmp = ((OP(c) == NBOUND ?
1006 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1007 while (s < strend) {
1009 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1011 else if ((norun || regtry(prog, s)))
1016 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1021 while (s < strend) {
1022 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1023 if (tmp && (norun || regtry(prog, s)))
1034 while (s < strend) {
1036 if (tmp && (norun || regtry(prog, s)))
1048 PL_reg_flags |= RF_tainted;
1050 while (s < strend) {
1051 if (isALNUM_LC_utf8((U8*)s)) {
1052 if (tmp && (norun || regtry(prog, s)))
1063 while (s < strend) {
1064 if (isALNUM_LC(*s)) {
1065 if (tmp && (norun || regtry(prog, s)))
1078 while (s < strend) {
1079 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1080 if (tmp && (norun || regtry(prog, s)))
1091 while (s < strend) {
1093 if (tmp && (norun || regtry(prog, s)))
1105 PL_reg_flags |= RF_tainted;
1107 while (s < strend) {
1108 if (!isALNUM_LC_utf8((U8*)s)) {
1109 if (tmp && (norun || regtry(prog, s)))
1120 while (s < strend) {
1121 if (!isALNUM_LC(*s)) {
1122 if (tmp && (norun || regtry(prog, s)))
1135 while (s < strend) {
1136 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1137 if (tmp && (norun || regtry(prog, s)))
1148 while (s < strend) {
1150 if (tmp && (norun || regtry(prog, s)))
1162 PL_reg_flags |= RF_tainted;
1164 while (s < strend) {
1165 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1166 if (tmp && (norun || regtry(prog, s)))
1177 while (s < strend) {
1178 if (isSPACE_LC(*s)) {
1179 if (tmp && (norun || regtry(prog, s)))
1192 while (s < strend) {
1193 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1194 if (tmp && (norun || regtry(prog, s)))
1205 while (s < strend) {
1207 if (tmp && (norun || regtry(prog, s)))
1219 PL_reg_flags |= RF_tainted;
1221 while (s < strend) {
1222 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1223 if (tmp && (norun || regtry(prog, s)))
1234 while (s < strend) {
1235 if (!isSPACE_LC(*s)) {
1236 if (tmp && (norun || regtry(prog, s)))
1249 while (s < strend) {
1250 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1251 if (tmp && (norun || regtry(prog, s)))
1262 while (s < strend) {
1264 if (tmp && (norun || regtry(prog, s)))
1276 PL_reg_flags |= RF_tainted;
1278 while (s < strend) {
1279 if (isDIGIT_LC_utf8((U8*)s)) {
1280 if (tmp && (norun || regtry(prog, s)))
1291 while (s < strend) {
1292 if (isDIGIT_LC(*s)) {
1293 if (tmp && (norun || regtry(prog, s)))
1306 while (s < strend) {
1307 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1308 if (tmp && (norun || regtry(prog, s)))
1319 while (s < strend) {
1321 if (tmp && (norun || regtry(prog, s)))
1333 PL_reg_flags |= RF_tainted;
1335 while (s < strend) {
1336 if (!isDIGIT_LC_utf8((U8*)s)) {
1337 if (tmp && (norun || regtry(prog, s)))
1348 while (s < strend) {
1349 if (!isDIGIT_LC(*s)) {
1350 if (tmp && (norun || regtry(prog, s)))
1362 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1371 - regexec_flags - match a regexp against a string
1374 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1375 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1376 /* strend: pointer to null at end of string */
1377 /* strbeg: real beginning of string */
1378 /* minend: end of match must be >=minend after stringarg. */
1379 /* data: May be used for some additional optimizations. */
1380 /* nosave: For optimizations. */
1383 register regnode *c;
1384 register char *startpos = stringarg;
1385 I32 minlen; /* must match at least this many chars */
1386 I32 dontbother = 0; /* how many characters not to try at end */
1387 /* I32 start_shift = 0; */ /* Offset of the start to find
1388 constant substr. */ /* CC */
1389 I32 end_shift = 0; /* Same for the end. */ /* CC */
1390 I32 scream_pos = -1; /* Internal iterator of scream. */
1392 SV* oreplsv = GvSV(PL_replgv);
1393 bool do_utf8 = DO_UTF8(sv);
1399 PL_regnarrate = PL_debug & 512;
1402 /* Be paranoid... */
1403 if (prog == NULL || startpos == NULL) {
1404 Perl_croak(aTHX_ "NULL regexp parameter");
1408 minlen = prog->minlen;
1410 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1413 if (strend - startpos < minlen) goto phooey;
1416 if (startpos == strbeg) /* is ^ valid at stringarg? */
1419 if (prog->reganch & ROPT_UTF8 && do_utf8) {
1420 U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
1421 PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0);
1424 PL_regprev = (U32)stringarg[-1];
1425 if (!PL_multiline && PL_regprev == '\n')
1426 PL_regprev = '\0'; /* force ^ to NOT match */
1429 /* Check validity of program. */
1430 if (UCHARAT(prog->program) != REG_MAGIC) {
1431 Perl_croak(aTHX_ "corrupted regexp program");
1435 PL_reg_eval_set = 0;
1438 if (prog->reganch & ROPT_UTF8)
1439 PL_reg_flags |= RF_utf8;
1441 /* Mark beginning of line for ^ and lookbehind. */
1442 PL_regbol = startpos;
1446 /* Mark end of line for $ (and such) */
1449 /* see how far we have to get to not match where we matched before */
1450 PL_regtill = startpos+minend;
1452 /* We start without call_cc context. */
1455 /* If there is a "must appear" string, look for it. */
1458 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1461 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1462 PL_reg_ganch = startpos;
1463 else if (sv && SvTYPE(sv) >= SVt_PVMG
1465 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1466 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1467 if (prog->reganch & ROPT_ANCH_GPOS) {
1468 if (s > PL_reg_ganch)
1473 else /* pos() not defined */
1474 PL_reg_ganch = strbeg;
1477 if (do_utf8 == (UTF!=0) &&
1478 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1479 re_scream_pos_data d;
1481 d.scream_olds = &scream_olds;
1482 d.scream_pos = &scream_pos;
1483 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1485 goto phooey; /* not present */
1488 DEBUG_r( if (!PL_colorset) reginitcolors() );
1489 DEBUG_r(PerlIO_printf(Perl_debug_log,
1490 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1491 PL_colors[4],PL_colors[5],PL_colors[0],
1494 (strlen(prog->precomp) > 60 ? "..." : ""),
1496 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1497 startpos, PL_colors[1],
1498 (strend - startpos > 60 ? "..." : ""))
1501 /* Simplest case: anchored match need be tried only once. */
1502 /* [unless only anchor is BOL and multiline is set] */
1503 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1504 if (s == startpos && regtry(prog, startpos))
1506 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1507 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1512 dontbother = minlen - 1;
1513 end = HOP3c(strend, -dontbother, strbeg) - 1;
1514 /* for multiline we only have to try after newlines */
1515 if (prog->check_substr) {
1519 if (regtry(prog, s))
1524 if (prog->reganch & RE_USE_INTUIT) {
1525 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1536 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1537 if (regtry(prog, s))
1544 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1545 if (regtry(prog, PL_reg_ganch))
1550 /* Messy cases: unanchored match. */
1551 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1552 /* we have /x+whatever/ */
1553 /* it must be a one character string (XXXX Except UTF?) */
1554 char ch = SvPVX(prog->anchored_substr)[0];
1560 while (s < strend) {
1562 DEBUG_r( did_match = 1 );
1563 if (regtry(prog, s)) goto got_it;
1565 while (s < strend && *s == ch)
1572 while (s < strend) {
1574 DEBUG_r( did_match = 1 );
1575 if (regtry(prog, s)) goto got_it;
1577 while (s < strend && *s == ch)
1583 DEBUG_r(did_match ||
1584 PerlIO_printf(Perl_debug_log,
1585 "Did not find anchored character...\n"));
1588 else if (do_utf8 == (UTF!=0) &&
1589 (prog->anchored_substr != Nullsv
1590 || (prog->float_substr != Nullsv
1591 && prog->float_max_offset < strend - s))) {
1592 SV *must = prog->anchored_substr
1593 ? prog->anchored_substr : prog->float_substr;
1595 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1597 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1598 char *last = HOP3c(strend, /* Cannot start after this */
1599 -(I32)(CHR_SVLEN(must)
1600 - (SvTAIL(must) != 0) + back_min), strbeg);
1601 char *last1; /* Last position checked before */
1607 last1 = HOPc(s, -1);
1609 last1 = s - 1; /* bogus */
1611 /* XXXX check_substr already used to find `s', can optimize if
1612 check_substr==must. */
1614 dontbother = end_shift;
1615 strend = HOPc(strend, -dontbother);
1616 while ( (s <= last) &&
1617 ((flags & REXEC_SCREAM)
1618 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1619 end_shift, &scream_pos, 0))
1620 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1621 (unsigned char*)strend, must,
1622 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1623 DEBUG_r( did_match = 1 );
1624 if (HOPc(s, -back_max) > last1) {
1625 last1 = HOPc(s, -back_min);
1626 s = HOPc(s, -back_max);
1629 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1631 last1 = HOPc(s, -back_min);
1635 while (s <= last1) {
1636 if (regtry(prog, s))
1642 while (s <= last1) {
1643 if (regtry(prog, s))
1649 DEBUG_r(did_match ||
1650 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1651 ((must == prog->anchored_substr)
1652 ? "anchored" : "floating"),
1654 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1656 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1659 else if ((c = prog->regstclass)) {
1660 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1661 /* don't bother with what can't match */
1662 strend = HOPc(strend, -(minlen - 1));
1664 SV *prop = sv_newmortal();
1666 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1668 if (find_byclass(prog, c, s, strend, startpos, 0))
1670 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1674 if (prog->float_substr != Nullsv) { /* Trim the end. */
1677 if (flags & REXEC_SCREAM) {
1678 last = screaminstr(sv, prog->float_substr, s - strbeg,
1679 end_shift, &scream_pos, 1); /* last one */
1681 last = scream_olds; /* Only one occurrence. */
1685 char *little = SvPV(prog->float_substr, len);
1687 if (SvTAIL(prog->float_substr)) {
1688 if (memEQ(strend - len + 1, little, len - 1))
1689 last = strend - len + 1;
1690 else if (!PL_multiline)
1691 last = memEQ(strend - len, little, len)
1692 ? strend - len : Nullch;
1698 last = rninstr(s, strend, little, little + len);
1700 last = strend; /* matching `$' */
1704 DEBUG_r(PerlIO_printf(Perl_debug_log,
1705 "%sCan't trim the tail, match fails (should not happen)%s\n",
1706 PL_colors[4],PL_colors[5]));
1707 goto phooey; /* Should not happen! */
1709 dontbother = strend - last + prog->float_min_offset;
1711 if (minlen && (dontbother < minlen))
1712 dontbother = minlen - 1;
1713 strend -= dontbother; /* this one's always in bytes! */
1714 /* We don't know much -- general case. */
1717 if (regtry(prog, s))
1726 if (regtry(prog, s))
1728 } while (s++ < strend);
1736 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1738 if (PL_reg_eval_set) {
1739 /* Preserve the current value of $^R */
1740 if (oreplsv != GvSV(PL_replgv))
1741 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1742 restored, the value remains
1744 restore_pos(aTHXo_ 0);
1747 /* make sure $`, $&, $', and $digit will work later */
1748 if ( !(flags & REXEC_NOT_FIRST) ) {
1749 if (RX_MATCH_COPIED(prog)) {
1750 Safefree(prog->subbeg);
1751 RX_MATCH_COPIED_off(prog);
1753 if (flags & REXEC_COPY_STR) {
1754 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1756 s = savepvn(strbeg, i);
1759 RX_MATCH_COPIED_on(prog);
1762 prog->subbeg = strbeg;
1763 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1770 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1771 PL_colors[4],PL_colors[5]));
1772 if (PL_reg_eval_set)
1773 restore_pos(aTHXo_ 0);
1778 - regtry - try match at specific point
1780 STATIC I32 /* 0 failure, 1 success */
1781 S_regtry(pTHX_ regexp *prog, char *startpos)
1789 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1791 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1794 PL_reg_eval_set = RS_init;
1796 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1797 (IV)(PL_stack_sp - PL_stack_base));
1799 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1800 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1801 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1803 /* Apparently this is not needed, judging by wantarray. */
1804 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1805 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1808 /* Make $_ available to executed code. */
1809 if (PL_reg_sv != DEFSV) {
1810 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1815 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1816 && (mg = mg_find(PL_reg_sv, 'g')))) {
1817 /* prepare for quick setting of pos */
1818 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1819 mg = mg_find(PL_reg_sv, 'g');
1823 PL_reg_oldpos = mg->mg_len;
1824 SAVEDESTRUCTOR_X(restore_pos, 0);
1827 Newz(22,PL_reg_curpm, 1, PMOP);
1828 PL_reg_curpm->op_pmregexp = prog;
1829 PL_reg_oldcurpm = PL_curpm;
1830 PL_curpm = PL_reg_curpm;
1831 if (RX_MATCH_COPIED(prog)) {
1832 /* Here is a serious problem: we cannot rewrite subbeg,
1833 since it may be needed if this match fails. Thus
1834 $` inside (?{}) could fail... */
1835 PL_reg_oldsaved = prog->subbeg;
1836 PL_reg_oldsavedlen = prog->sublen;
1837 RX_MATCH_COPIED_off(prog);
1840 PL_reg_oldsaved = Nullch;
1841 prog->subbeg = PL_bostr;
1842 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1844 prog->startp[0] = startpos - PL_bostr;
1845 PL_reginput = startpos;
1846 PL_regstartp = prog->startp;
1847 PL_regendp = prog->endp;
1848 PL_reglastparen = &prog->lastparen;
1849 prog->lastparen = 0;
1851 DEBUG_r(PL_reg_starttry = startpos);
1852 if (PL_reg_start_tmpl <= prog->nparens) {
1853 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1854 if(PL_reg_start_tmp)
1855 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1857 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1860 /* XXXX What this code is doing here?!!! There should be no need
1861 to do this again and again, PL_reglastparen should take care of
1864 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1865 * Actually, the code in regcppop() (which Ilya may be meaning by
1866 * PL_reglastparen), is not needed at all by the test suite
1867 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1868 * enough, for building DynaLoader, or otherwise this
1869 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1870 * will happen. Meanwhile, this code *is* needed for the
1871 * above-mentioned test suite tests to succeed. The common theme
1872 * on those tests seems to be returning null fields from matches.
1877 if (prog->nparens) {
1878 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1885 if (regmatch(prog->program + 1)) {
1886 prog->endp[0] = PL_reginput - PL_bostr;
1889 REGCP_UNWIND(lastcp);
1893 #define RE_UNWIND_BRANCH 1
1894 #define RE_UNWIND_BRANCHJ 2
1898 typedef struct { /* XX: makes sense to enlarge it... */
1902 } re_unwind_generic_t;
1915 } re_unwind_branch_t;
1917 typedef union re_unwind_t {
1919 re_unwind_generic_t generic;
1920 re_unwind_branch_t branch;
1924 - regmatch - main matching routine
1926 * Conceptually the strategy is simple: check to see whether the current
1927 * node matches, call self recursively to see whether the rest matches,
1928 * and then act accordingly. In practice we make some effort to avoid
1929 * recursion, in particular by going through "ordinary" nodes (that don't
1930 * need to know whether the rest of the match failed) by a loop instead of
1933 /* [lwall] I've hoisted the register declarations to the outer block in order to
1934 * maybe save a little bit of pushing and popping on the stack. It also takes
1935 * advantage of machines that use a register save mask on subroutine entry.
1937 STATIC I32 /* 0 failure, 1 success */
1938 S_regmatch(pTHX_ regnode *prog)
1940 register regnode *scan; /* Current node. */
1941 regnode *next; /* Next node. */
1942 regnode *inner; /* Next node in internal branch. */
1943 register I32 nextchr; /* renamed nextchr - nextchar colides with
1944 function of same name */
1945 register I32 n; /* no or next */
1946 register I32 ln; /* len or last */
1947 register char *s; /* operand or save */
1948 register char *locinput = PL_reginput;
1949 register I32 c1, c2, paren; /* case fold search, parenth */
1950 int minmod = 0, sw = 0, logical = 0;
1952 I32 firstcp = PL_savestack_ix;
1953 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1959 /* Note that nextchr is a byte even in UTF */
1960 nextchr = UCHARAT(locinput);
1962 while (scan != NULL) {
1963 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1965 # define sayYES goto yes
1966 # define sayNO goto no
1967 # define sayYES_FINAL goto yes_final
1968 # define sayYES_LOUD goto yes_loud
1969 # define sayNO_FINAL goto no_final
1970 # define sayNO_SILENT goto do_no
1971 # define saySAME(x) if (x) goto yes; else goto no
1972 # define REPORT_CODE_OFF 24
1974 # define sayYES return 1
1975 # define sayNO return 0
1976 # define sayYES_FINAL return 1
1977 # define sayYES_LOUD return 1
1978 # define sayNO_FINAL return 0
1979 # define sayNO_SILENT return 0
1980 # define saySAME(x) return x
1983 SV *prop = sv_newmortal();
1984 int docolor = *PL_colors[0];
1985 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1986 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
1987 /* The part of the string before starttry has one color
1988 (pref0_len chars), between starttry and current
1989 position another one (pref_len - pref0_len chars),
1990 after the current position the third one.
1991 We assume that pref0_len <= pref_len, otherwise we
1992 decrease pref0_len. */
1993 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1994 ? (5 + taill) - l : locinput - PL_bostr;
1997 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1999 pref0_len = pref_len - (locinput - PL_reg_starttry);
2000 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2001 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2002 ? (5 + taill) - pref_len : PL_regeol - locinput);
2003 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2007 if (pref0_len > pref_len)
2008 pref0_len = pref_len;
2009 regprop(prop, scan);
2010 PerlIO_printf(Perl_debug_log,
2011 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2012 (IV)(locinput - PL_bostr),
2013 PL_colors[4], pref0_len,
2014 locinput - pref_len, PL_colors[5],
2015 PL_colors[2], pref_len - pref0_len,
2016 locinput - pref_len + pref0_len, PL_colors[3],
2017 (docolor ? "" : "> <"),
2018 PL_colors[0], l, locinput, PL_colors[1],
2019 15 - l - pref_len + 1,
2021 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2025 next = scan + NEXT_OFF(scan);
2031 if (locinput == PL_bostr
2032 ? PL_regprev == '\n'
2034 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2036 /* regtill = regbol; */
2041 if (locinput == PL_bostr
2042 ? PL_regprev == '\n'
2043 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2049 if (locinput == PL_bostr)
2053 if (locinput == PL_reg_ganch)
2063 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2068 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2070 if (PL_regeol - locinput > 1)
2074 if (PL_regeol != locinput)
2078 if (!nextchr && locinput >= PL_regeol)
2080 nextchr = UCHARAT(++locinput);
2083 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2086 locinput += PL_utf8skip[nextchr];
2087 if (locinput > PL_regeol)
2089 nextchr = UCHARAT(locinput);
2092 nextchr = UCHARAT(++locinput);
2097 if (do_utf8 != (UTF!=0)) {
2105 if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len))
2114 if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len))
2120 nextchr = UCHARAT(locinput);
2123 /* Inline the first character, for speed. */
2124 if (UCHARAT(s) != nextchr)
2126 if (PL_regeol - locinput < ln)
2128 if (ln > 1 && memNE(s, locinput, ln))
2131 nextchr = UCHARAT(locinput);
2134 PL_reg_flags |= RF_tainted;
2144 c1 = OP(scan) == EXACTF;
2146 if (l >= PL_regeol) {
2149 if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2150 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2152 s += UTF ? UTF8SKIP(s) : 1;
2156 nextchr = UCHARAT(locinput);
2160 /* Inline the first character, for speed. */
2161 if (UCHARAT(s) != nextchr &&
2162 UCHARAT(s) != ((OP(scan) == EXACTF)
2163 ? PL_fold : PL_fold_locale)[nextchr])
2165 if (PL_regeol - locinput < ln)
2167 if (ln > 1 && (OP(scan) == EXACTF
2168 ? ibcmp(s, locinput, ln)
2169 : ibcmp_locale(s, locinput, ln)))
2172 nextchr = UCHARAT(locinput);
2176 if (!reginclass(scan, (U8*)locinput, do_utf8))
2178 if (locinput >= PL_regeol)
2180 locinput += PL_utf8skip[nextchr];
2181 nextchr = UCHARAT(locinput);
2185 nextchr = UCHARAT(locinput);
2186 if (!reginclass(scan, (U8*)locinput, do_utf8))
2188 if (!nextchr && locinput >= PL_regeol)
2190 nextchr = UCHARAT(++locinput);
2194 PL_reg_flags |= RF_tainted;
2200 if (!(OP(scan) == ALNUM
2201 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2202 : isALNUM_LC_utf8((U8*)locinput)))
2206 locinput += PL_utf8skip[nextchr];
2207 nextchr = UCHARAT(locinput);
2210 if (!(OP(scan) == ALNUM
2211 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2213 nextchr = UCHARAT(++locinput);
2216 PL_reg_flags |= RF_tainted;
2219 if (!nextchr && locinput >= PL_regeol)
2222 if (OP(scan) == NALNUM
2223 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2224 : isALNUM_LC_utf8((U8*)locinput))
2228 locinput += PL_utf8skip[nextchr];
2229 nextchr = UCHARAT(locinput);
2232 if (OP(scan) == NALNUM
2233 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2235 nextchr = UCHARAT(++locinput);
2239 PL_reg_flags |= RF_tainted;
2243 /* was last char in word? */
2245 if (locinput == PL_regbol)
2248 U8 *r = reghop((U8*)locinput, -1);
2250 ln = utf8_to_uv(r, s - (char*)r, 0, 0);
2252 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2253 ln = isALNUM_uni(ln);
2254 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2257 ln = isALNUM_LC_uni(ln);
2258 n = isALNUM_LC_utf8((U8*)locinput);
2262 ln = (locinput != PL_regbol) ?
2263 UCHARAT(locinput - 1) : PL_regprev;
2264 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2266 n = isALNUM(nextchr);
2269 ln = isALNUM_LC(ln);
2270 n = isALNUM_LC(nextchr);
2273 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2274 OP(scan) == BOUNDL))
2278 PL_reg_flags |= RF_tainted;
2284 if (UTF8_IS_CONTINUED(nextchr)) {
2285 if (!(OP(scan) == SPACE
2286 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2287 : isSPACE_LC_utf8((U8*)locinput)))
2291 locinput += PL_utf8skip[nextchr];
2292 nextchr = UCHARAT(locinput);
2295 if (!(OP(scan) == SPACE
2296 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2298 nextchr = UCHARAT(++locinput);
2301 if (!(OP(scan) == SPACE
2302 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2304 nextchr = UCHARAT(++locinput);
2308 PL_reg_flags |= RF_tainted;
2311 if (!nextchr && locinput >= PL_regeol)
2314 if (OP(scan) == NSPACE
2315 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2316 : isSPACE_LC_utf8((U8*)locinput))
2320 locinput += PL_utf8skip[nextchr];
2321 nextchr = UCHARAT(locinput);
2324 if (OP(scan) == NSPACE
2325 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2327 nextchr = UCHARAT(++locinput);
2330 PL_reg_flags |= RF_tainted;
2336 if (!(OP(scan) == DIGIT
2337 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2338 : isDIGIT_LC_utf8((U8*)locinput)))
2342 locinput += PL_utf8skip[nextchr];
2343 nextchr = UCHARAT(locinput);
2346 if (!(OP(scan) == DIGIT
2347 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2349 nextchr = UCHARAT(++locinput);
2352 PL_reg_flags |= RF_tainted;
2355 if (!nextchr && locinput >= PL_regeol)
2358 if (OP(scan) == NDIGIT
2359 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2360 : isDIGIT_LC_utf8((U8*)locinput))
2364 locinput += PL_utf8skip[nextchr];
2365 nextchr = UCHARAT(locinput);
2368 if (OP(scan) == NDIGIT
2369 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2371 nextchr = UCHARAT(++locinput);
2374 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2376 locinput += PL_utf8skip[nextchr];
2377 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2378 locinput += UTF8SKIP(locinput);
2379 if (locinput > PL_regeol)
2381 nextchr = UCHARAT(locinput);
2384 PL_reg_flags |= RF_tainted;
2388 n = ARG(scan); /* which paren pair */
2389 ln = PL_regstartp[n];
2390 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2391 if (*PL_reglastparen < n || ln == -1)
2392 sayNO; /* Do not match unless seen CLOSEn. */
2393 if (ln == PL_regendp[n])
2397 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2399 char *e = PL_bostr + PL_regendp[n];
2401 * Note that we can't do the "other character" lookup trick as
2402 * in the 8-bit case (no pun intended) because in Unicode we
2403 * have to map both upper and title case to lower case.
2405 if (OP(scan) == REFF) {
2409 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2419 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2426 nextchr = UCHARAT(locinput);
2430 /* Inline the first character, for speed. */
2431 if (UCHARAT(s) != nextchr &&
2433 (UCHARAT(s) != ((OP(scan) == REFF
2434 ? PL_fold : PL_fold_locale)[nextchr]))))
2436 ln = PL_regendp[n] - ln;
2437 if (locinput + ln > PL_regeol)
2439 if (ln > 1 && (OP(scan) == REF
2440 ? memNE(s, locinput, ln)
2442 ? ibcmp(s, locinput, ln)
2443 : ibcmp_locale(s, locinput, ln))))
2446 nextchr = UCHARAT(locinput);
2457 OP_4tree *oop = PL_op;
2458 COP *ocurcop = PL_curcop;
2459 SV **ocurpad = PL_curpad;
2463 PL_op = (OP_4tree*)PL_regdata->data[n];
2464 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2465 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2466 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2468 CALLRUNOPS(aTHX); /* Scalar context. */
2474 PL_curpad = ocurpad;
2475 PL_curcop = ocurcop;
2477 if (logical == 2) { /* Postponed subexpression. */
2479 MAGIC *mg = Null(MAGIC*);
2481 CHECKPOINT cp, lastcp;
2483 if(SvROK(ret) || SvRMAGICAL(ret)) {
2484 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2487 mg = mg_find(sv, 'r');
2490 re = (regexp *)mg->mg_obj;
2491 (void)ReREFCNT_inc(re);
2495 char *t = SvPV(ret, len);
2497 char *oprecomp = PL_regprecomp;
2498 I32 osize = PL_regsize;
2499 I32 onpar = PL_regnpar;
2502 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2504 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2505 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2506 PL_regprecomp = oprecomp;
2511 PerlIO_printf(Perl_debug_log,
2512 "Entering embedded `%s%.60s%s%s'\n",
2516 (strlen(re->precomp) > 60 ? "..." : ""))
2519 state.prev = PL_reg_call_cc;
2520 state.cc = PL_regcc;
2521 state.re = PL_reg_re;
2525 cp = regcppush(0); /* Save *all* the positions. */
2528 state.ss = PL_savestack_ix;
2529 *PL_reglastparen = 0;
2530 PL_reg_call_cc = &state;
2531 PL_reginput = locinput;
2533 /* XXXX This is too dramatic a measure... */
2536 if (regmatch(re->program + 1)) {
2537 /* Even though we succeeded, we need to restore
2538 global variables, since we may be wrapped inside
2539 SUSPEND, thus the match may be not finished yet. */
2541 /* XXXX Do this only if SUSPENDed? */
2542 PL_reg_call_cc = state.prev;
2543 PL_regcc = state.cc;
2544 PL_reg_re = state.re;
2545 cache_re(PL_reg_re);
2547 /* XXXX This is too dramatic a measure... */
2550 /* These are needed even if not SUSPEND. */
2556 REGCP_UNWIND(lastcp);
2558 PL_reg_call_cc = state.prev;
2559 PL_regcc = state.cc;
2560 PL_reg_re = state.re;
2561 cache_re(PL_reg_re);
2563 /* XXXX This is too dramatic a measure... */
2572 sv_setsv(save_scalar(PL_replgv), ret);
2576 n = ARG(scan); /* which paren pair */
2577 PL_reg_start_tmp[n] = locinput;
2582 n = ARG(scan); /* which paren pair */
2583 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2584 PL_regendp[n] = locinput - PL_bostr;
2585 if (n > *PL_reglastparen)
2586 *PL_reglastparen = n;
2589 n = ARG(scan); /* which paren pair */
2590 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2593 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2595 next = NEXTOPER(NEXTOPER(scan));
2597 next = scan + ARG(scan);
2598 if (OP(next) == IFTHEN) /* Fake one. */
2599 next = NEXTOPER(NEXTOPER(next));
2603 logical = scan->flags;
2605 /*******************************************************************
2606 PL_regcc contains infoblock about the innermost (...)* loop, and
2607 a pointer to the next outer infoblock.
2609 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2611 1) After matching X, regnode for CURLYX is processed;
2613 2) This regnode creates infoblock on the stack, and calls
2614 regmatch() recursively with the starting point at WHILEM node;
2616 3) Each hit of WHILEM node tries to match A and Z (in the order
2617 depending on the current iteration, min/max of {min,max} and
2618 greediness). The information about where are nodes for "A"
2619 and "Z" is read from the infoblock, as is info on how many times "A"
2620 was already matched, and greediness.
2622 4) After A matches, the same WHILEM node is hit again.
2624 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2625 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2626 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2627 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2628 of the external loop.
2630 Currently present infoblocks form a tree with a stem formed by PL_curcc
2631 and whatever it mentions via ->next, and additional attached trees
2632 corresponding to temporarily unset infoblocks as in "5" above.
2634 In the following picture infoblocks for outer loop of
2635 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2636 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2637 infoblocks are drawn below the "reset" infoblock.
2639 In fact in the picture below we do not show failed matches for Z and T
2640 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2641 more obvious *why* one needs to *temporary* unset infoblocks.]
2643 Matched REx position InfoBlocks Comment
2647 Y A)*?Z)*?T x <- O <- I
2648 YA )*?Z)*?T x <- O <- I
2649 YA A)*?Z)*?T x <- O <- I
2650 YAA )*?Z)*?T x <- O <- I
2651 YAA Z)*?T x <- O # Temporary unset I
2654 YAAZ Y(A)*?Z)*?T x <- O
2657 YAAZY (A)*?Z)*?T x <- O
2660 YAAZY A)*?Z)*?T x <- O <- I
2663 YAAZYA )*?Z)*?T x <- O <- I
2666 YAAZYA Z)*?T x <- O # Temporary unset I
2672 YAAZYAZ T x # Temporary unset O
2679 *******************************************************************/
2682 CHECKPOINT cp = PL_savestack_ix;
2683 /* No need to save/restore up to this paren */
2684 I32 parenfloor = scan->flags;
2686 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2688 cc.oldcc = PL_regcc;
2690 /* XXXX Probably it is better to teach regpush to support
2691 parenfloor > PL_regsize... */
2692 if (parenfloor > *PL_reglastparen)
2693 parenfloor = *PL_reglastparen; /* Pessimization... */
2694 cc.parenfloor = parenfloor;
2696 cc.min = ARG1(scan);
2697 cc.max = ARG2(scan);
2698 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2702 PL_reginput = locinput;
2703 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2705 PL_regcc = cc.oldcc;
2711 * This is really hard to understand, because after we match
2712 * what we're trying to match, we must make sure the rest of
2713 * the REx is going to match for sure, and to do that we have
2714 * to go back UP the parse tree by recursing ever deeper. And
2715 * if it fails, we have to reset our parent's current state
2716 * that we can try again after backing off.
2719 CHECKPOINT cp, lastcp;
2720 CURCUR* cc = PL_regcc;
2721 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2723 n = cc->cur + 1; /* how many we know we matched */
2724 PL_reginput = locinput;
2727 PerlIO_printf(Perl_debug_log,
2728 "%*s %ld out of %ld..%ld cc=%lx\n",
2729 REPORT_CODE_OFF+PL_regindent*2, "",
2730 (long)n, (long)cc->min,
2731 (long)cc->max, (long)cc)
2734 /* If degenerate scan matches "", assume scan done. */
2736 if (locinput == cc->lastloc && n >= cc->min) {
2737 PL_regcc = cc->oldcc;
2741 PerlIO_printf(Perl_debug_log,
2742 "%*s empty match detected, try continuation...\n",
2743 REPORT_CODE_OFF+PL_regindent*2, "")
2745 if (regmatch(cc->next))
2753 /* First just match a string of min scans. */
2757 cc->lastloc = locinput;
2758 if (regmatch(cc->scan))
2761 cc->lastloc = lastloc;
2766 /* Check whether we already were at this position.
2767 Postpone detection until we know the match is not
2768 *that* much linear. */
2769 if (!PL_reg_maxiter) {
2770 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2771 PL_reg_leftiter = PL_reg_maxiter;
2773 if (PL_reg_leftiter-- == 0) {
2774 I32 size = (PL_reg_maxiter + 7)/8;
2775 if (PL_reg_poscache) {
2776 if (PL_reg_poscache_size < size) {
2777 Renew(PL_reg_poscache, size, char);
2778 PL_reg_poscache_size = size;
2780 Zero(PL_reg_poscache, size, char);
2783 PL_reg_poscache_size = size;
2784 Newz(29, PL_reg_poscache, size, char);
2787 PerlIO_printf(Perl_debug_log,
2788 "%sDetected a super-linear match, switching on caching%s...\n",
2789 PL_colors[4], PL_colors[5])
2792 if (PL_reg_leftiter < 0) {
2793 I32 o = locinput - PL_bostr, b;
2795 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2798 if (PL_reg_poscache[o] & (1<<b)) {
2800 PerlIO_printf(Perl_debug_log,
2801 "%*s already tried at this position...\n",
2802 REPORT_CODE_OFF+PL_regindent*2, "")
2806 PL_reg_poscache[o] |= (1<<b);
2810 /* Prefer next over scan for minimal matching. */
2813 PL_regcc = cc->oldcc;
2816 cp = regcppush(cc->parenfloor);
2818 if (regmatch(cc->next)) {
2820 sayYES; /* All done. */
2822 REGCP_UNWIND(lastcp);
2828 if (n >= cc->max) { /* Maximum greed exceeded? */
2829 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2830 && !(PL_reg_flags & RF_warned)) {
2831 PL_reg_flags |= RF_warned;
2832 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2833 "Complex regular subexpression recursion",
2840 PerlIO_printf(Perl_debug_log,
2841 "%*s trying longer...\n",
2842 REPORT_CODE_OFF+PL_regindent*2, "")
2844 /* Try scanning more and see if it helps. */
2845 PL_reginput = locinput;
2847 cc->lastloc = locinput;
2848 cp = regcppush(cc->parenfloor);
2850 if (regmatch(cc->scan)) {
2854 REGCP_UNWIND(lastcp);
2857 cc->lastloc = lastloc;
2861 /* Prefer scan over next for maximal matching. */
2863 if (n < cc->max) { /* More greed allowed? */
2864 cp = regcppush(cc->parenfloor);
2866 cc->lastloc = locinput;
2868 if (regmatch(cc->scan)) {
2872 REGCP_UNWIND(lastcp);
2873 regcppop(); /* Restore some previous $<digit>s? */
2874 PL_reginput = locinput;
2876 PerlIO_printf(Perl_debug_log,
2877 "%*s failed, try continuation...\n",
2878 REPORT_CODE_OFF+PL_regindent*2, "")
2881 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2882 && !(PL_reg_flags & RF_warned)) {
2883 PL_reg_flags |= RF_warned;
2884 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2885 "Complex regular subexpression recursion",
2889 /* Failed deeper matches of scan, so see if this one works. */
2890 PL_regcc = cc->oldcc;
2893 if (regmatch(cc->next))
2899 cc->lastloc = lastloc;
2904 next = scan + ARG(scan);
2907 inner = NEXTOPER(NEXTOPER(scan));
2910 inner = NEXTOPER(scan);
2915 if (OP(next) != c1) /* No choice. */
2916 next = inner; /* Avoid recursion. */
2918 I32 lastparen = *PL_reglastparen;
2920 re_unwind_branch_t *uw;
2922 /* Put unwinding data on stack */
2923 unwind1 = SSNEWt(1,re_unwind_branch_t);
2924 uw = SSPTRt(unwind1,re_unwind_branch_t);
2927 uw->type = ((c1 == BRANCH)
2929 : RE_UNWIND_BRANCHJ);
2930 uw->lastparen = lastparen;
2932 uw->locinput = locinput;
2933 uw->nextchr = nextchr;
2935 uw->regindent = ++PL_regindent;
2938 REGCP_SET(uw->lastcp);
2940 /* Now go into the first branch */
2953 /* We suppose that the next guy does not need
2954 backtracking: in particular, it is of constant length,
2955 and has no parenths to influence future backrefs. */
2956 ln = ARG1(scan); /* min to match */
2957 n = ARG2(scan); /* max to match */
2958 paren = scan->flags;
2960 if (paren > PL_regsize)
2962 if (paren > *PL_reglastparen)
2963 *PL_reglastparen = paren;
2965 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2967 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2968 PL_reginput = locinput;
2971 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2973 if (ln && l == 0 && n >= ln
2974 /* In fact, this is tricky. If paren, then the
2975 fact that we did/didnot match may influence
2976 future execution. */
2977 && !(paren && ln == 0))
2979 locinput = PL_reginput;
2980 if (PL_regkind[(U8)OP(next)] == EXACT) {
2981 c1 = (U8)*STRING(next);
2982 if (OP(next) == EXACTF)
2984 else if (OP(next) == EXACTFL)
2985 c2 = PL_fold_locale[c1];
2992 /* This may be improved if l == 0. */
2993 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2994 /* If it could work, try it. */
2996 UCHARAT(PL_reginput) == c1 ||
2997 UCHARAT(PL_reginput) == c2)
3001 PL_regstartp[paren] =
3002 HOPc(PL_reginput, -l) - PL_bostr;
3003 PL_regendp[paren] = PL_reginput - PL_bostr;
3006 PL_regendp[paren] = -1;
3010 REGCP_UNWIND(lastcp);
3012 /* Couldn't or didn't -- move forward. */
3013 PL_reginput = locinput;
3014 if (regrepeat_hard(scan, 1, &l)) {
3016 locinput = PL_reginput;
3023 n = regrepeat_hard(scan, n, &l);
3024 if (n != 0 && l == 0
3025 /* In fact, this is tricky. If paren, then the
3026 fact that we did/didnot match may influence
3027 future execution. */
3028 && !(paren && ln == 0))
3030 locinput = PL_reginput;
3032 PerlIO_printf(Perl_debug_log,
3033 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3034 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3038 if (PL_regkind[(U8)OP(next)] == EXACT) {
3039 c1 = (U8)*STRING(next);
3040 if (OP(next) == EXACTF)
3042 else if (OP(next) == EXACTFL)
3043 c2 = PL_fold_locale[c1];
3052 /* If it could work, try it. */
3054 UCHARAT(PL_reginput) == c1 ||
3055 UCHARAT(PL_reginput) == c2)
3058 PerlIO_printf(Perl_debug_log,
3059 "%*s trying tail with n=%"IVdf"...\n",
3060 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3064 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3065 PL_regendp[paren] = PL_reginput - PL_bostr;
3068 PL_regendp[paren] = -1;
3072 REGCP_UNWIND(lastcp);
3074 /* Couldn't or didn't -- back up. */
3076 locinput = HOPc(locinput, -l);
3077 PL_reginput = locinput;
3084 paren = scan->flags; /* Which paren to set */
3085 if (paren > PL_regsize)
3087 if (paren > *PL_reglastparen)
3088 *PL_reglastparen = paren;
3089 ln = ARG1(scan); /* min to match */
3090 n = ARG2(scan); /* max to match */
3091 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3095 ln = ARG1(scan); /* min to match */
3096 n = ARG2(scan); /* max to match */
3097 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3102 scan = NEXTOPER(scan);
3108 scan = NEXTOPER(scan);
3112 * Lookahead to avoid useless match attempts
3113 * when we know what character comes next.
3115 if (PL_regkind[(U8)OP(next)] == EXACT) {
3116 U8 *s = (U8*)STRING(next);
3119 if (OP(next) == EXACTF)
3121 else if (OP(next) == EXACTFL)
3122 c2 = PL_fold_locale[c1];
3125 if (OP(next) == EXACTF) {
3126 c1 = to_utf8_lower(s);
3127 c2 = to_utf8_upper(s);
3130 c2 = c1 = utf8_to_uv_simple(s, NULL);
3136 PL_reginput = locinput;
3140 if (ln && regrepeat(scan, ln) < ln)
3142 locinput = PL_reginput;
3145 char *e; /* Should not check after this */
3146 char *old = locinput;
3148 if (n == REG_INFTY) {
3151 while (UTF8_IS_CONTINUATION(*(U8*)e))
3157 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3161 e = locinput + n - ln;
3167 /* Find place 'next' could work */
3170 while (locinput <= e && *locinput != c1)
3173 while (locinput <= e
3178 count = locinput - old;
3185 utf8_to_uv_simple((U8*)locinput, &len) != c1;
3190 for (count = 0; locinput <= e; count++) {
3191 UV c = utf8_to_uv_simple((U8*)locinput, &len);
3192 if (c == c1 || c == c2)
3200 /* PL_reginput == old now */
3201 if (locinput != old) {
3202 ln = 1; /* Did some */
3203 if (regrepeat(scan, count) < count)
3206 /* PL_reginput == locinput now */
3207 TRYPAREN(paren, ln, locinput);
3208 PL_reginput = locinput; /* Could be reset... */
3209 REGCP_UNWIND(lastcp);
3210 /* Couldn't or didn't -- move forward. */
3213 locinput += UTF8SKIP(locinput);
3219 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3223 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3225 c = UCHARAT(PL_reginput);
3227 /* If it could work, try it. */
3228 if (c1 == -1000 || c == c1 || c == c2)
3230 TRYPAREN(paren, n, PL_reginput);
3231 REGCP_UNWIND(lastcp);
3233 /* Couldn't or didn't -- move forward. */
3234 PL_reginput = locinput;
3235 if (regrepeat(scan, 1)) {
3237 locinput = PL_reginput;
3245 n = regrepeat(scan, n);
3246 locinput = PL_reginput;
3247 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3248 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3249 ln = n; /* why back off? */
3250 /* ...because $ and \Z can match before *and* after
3251 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3252 We should back off by one in this case. */
3253 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3262 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3264 c = UCHARAT(PL_reginput);
3266 /* If it could work, try it. */
3267 if (c1 == -1000 || c == c1 || c == c2)
3269 TRYPAREN(paren, n, PL_reginput);
3270 REGCP_UNWIND(lastcp);
3272 /* Couldn't or didn't -- back up. */
3274 PL_reginput = locinput = HOPc(locinput, -1);
3282 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3284 c = UCHARAT(PL_reginput);
3286 /* If it could work, try it. */
3287 if (c1 == -1000 || c == c1 || c == c2)
3289 TRYPAREN(paren, n, PL_reginput);
3290 REGCP_UNWIND(lastcp);
3292 /* Couldn't or didn't -- back up. */
3294 PL_reginput = locinput = HOPc(locinput, -1);
3301 if (PL_reg_call_cc) {
3302 re_cc_state *cur_call_cc = PL_reg_call_cc;
3303 CURCUR *cctmp = PL_regcc;
3304 regexp *re = PL_reg_re;
3305 CHECKPOINT cp, lastcp;
3307 cp = regcppush(0); /* Save *all* the positions. */
3309 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3311 PL_reginput = locinput; /* Make position available to
3313 cache_re(PL_reg_call_cc->re);
3314 PL_regcc = PL_reg_call_cc->cc;
3315 PL_reg_call_cc = PL_reg_call_cc->prev;
3316 if (regmatch(cur_call_cc->node)) {
3317 PL_reg_call_cc = cur_call_cc;
3321 REGCP_UNWIND(lastcp);
3323 PL_reg_call_cc = cur_call_cc;
3329 PerlIO_printf(Perl_debug_log,
3330 "%*s continuation failed...\n",
3331 REPORT_CODE_OFF+PL_regindent*2, "")
3335 if (locinput < PL_regtill) {
3336 DEBUG_r(PerlIO_printf(Perl_debug_log,
3337 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3339 (long)(locinput - PL_reg_starttry),
3340 (long)(PL_regtill - PL_reg_starttry),
3342 sayNO_FINAL; /* Cannot match: too short. */
3344 PL_reginput = locinput; /* put where regtry can find it */
3345 sayYES_FINAL; /* Success! */
3347 PL_reginput = locinput; /* put where regtry can find it */
3348 sayYES_LOUD; /* Success! */
3351 PL_reginput = locinput;
3356 if (UTF) { /* XXXX This is absolutely
3357 broken, we read before
3359 s = HOPMAYBEc(locinput, -scan->flags);
3365 if (locinput < PL_bostr + scan->flags)
3367 PL_reginput = locinput - scan->flags;
3372 PL_reginput = locinput;
3377 if (UTF) { /* XXXX This is absolutely
3378 broken, we read before
3380 s = HOPMAYBEc(locinput, -scan->flags);
3381 if (!s || s < PL_bostr)
3386 if (locinput < PL_bostr + scan->flags)
3388 PL_reginput = locinput - scan->flags;
3393 PL_reginput = locinput;
3396 inner = NEXTOPER(NEXTOPER(scan));
3397 if (regmatch(inner) != n) {
3412 if (OP(scan) == SUSPEND) {
3413 locinput = PL_reginput;
3414 nextchr = UCHARAT(locinput);
3419 next = scan + ARG(scan);
3424 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3425 PTR2UV(scan), OP(scan));
3426 Perl_croak(aTHX_ "regexp memory corruption");
3433 * We get here only if there's trouble -- normally "case END" is
3434 * the terminating point.
3436 Perl_croak(aTHX_ "corrupted regexp pointers");
3442 PerlIO_printf(Perl_debug_log,
3443 "%*s %scould match...%s\n",
3444 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3448 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3449 PL_colors[4],PL_colors[5]));
3455 #if 0 /* Breaks $^R */
3463 PerlIO_printf(Perl_debug_log,
3464 "%*s %sfailed...%s\n",
3465 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3471 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3474 case RE_UNWIND_BRANCH:
3475 case RE_UNWIND_BRANCHJ:
3477 re_unwind_branch_t *uwb = &(uw->branch);
3478 I32 lastparen = uwb->lastparen;
3480 REGCP_UNWIND(uwb->lastcp);
3481 for (n = *PL_reglastparen; n > lastparen; n--)
3483 *PL_reglastparen = n;
3484 scan = next = uwb->next;
3486 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3487 ? BRANCH : BRANCHJ) ) { /* Failure */
3494 /* Have more choice yet. Reuse the same uwb. */
3496 if ((n = (uwb->type == RE_UNWIND_BRANCH
3497 ? NEXT_OFF(next) : ARG(next))))
3500 next = NULL; /* XXXX Needn't unwinding in this case... */
3502 next = NEXTOPER(scan);
3503 if (uwb->type == RE_UNWIND_BRANCHJ)
3504 next = NEXTOPER(next);
3505 locinput = uwb->locinput;
3506 nextchr = uwb->nextchr;
3508 PL_regindent = uwb->regindent;
3515 Perl_croak(aTHX_ "regexp unwind memory corruption");
3526 - regrepeat - repeatedly match something simple, report how many
3529 * [This routine now assumes that it will only match on things of length 1.
3530 * That was true before, but now we assume scan - reginput is the count,
3531 * rather than incrementing count on every character. [Er, except utf8.]]
3534 S_regrepeat(pTHX_ regnode *p, I32 max)
3536 register char *scan;
3538 register char *loceol = PL_regeol;
3539 register I32 hardcount = 0;
3540 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3543 if (max != REG_INFTY && max < loceol - scan)
3544 loceol = scan + max;
3549 while (scan < loceol && hardcount < max && *scan != '\n') {
3550 scan += UTF8SKIP(scan);
3554 while (scan < loceol && *scan != '\n')
3561 case EXACT: /* length of string is 1 */
3563 while (scan < loceol && UCHARAT(scan) == c)
3566 case EXACTF: /* length of string is 1 */
3568 while (scan < loceol &&
3569 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3572 case EXACTFL: /* length of string is 1 */
3573 PL_reg_flags |= RF_tainted;
3575 while (scan < loceol &&
3576 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3582 while (hardcount < max && scan < loceol &&
3583 reginclass(p, (U8*)scan, do_utf8)) {
3584 scan += UTF8SKIP(scan);
3588 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3595 while (hardcount < max && scan < loceol &&
3596 swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3597 scan += UTF8SKIP(scan);
3601 while (scan < loceol && isALNUM(*scan))
3606 PL_reg_flags |= RF_tainted;
3609 while (hardcount < max && scan < loceol &&
3610 isALNUM_LC_utf8((U8*)scan)) {
3611 scan += UTF8SKIP(scan);
3615 while (scan < loceol && isALNUM_LC(*scan))
3622 while (hardcount < max && scan < loceol &&
3623 !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3624 scan += UTF8SKIP(scan);
3628 while (scan < loceol && !isALNUM(*scan))
3633 PL_reg_flags |= RF_tainted;
3636 while (hardcount < max && scan < loceol &&
3637 !isALNUM_LC_utf8((U8*)scan)) {
3638 scan += UTF8SKIP(scan);
3642 while (scan < loceol && !isALNUM_LC(*scan))
3649 while (hardcount < max && scan < loceol &&
3650 (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3651 scan += UTF8SKIP(scan);
3655 while (scan < loceol && isSPACE(*scan))
3660 PL_reg_flags |= RF_tainted;
3663 while (hardcount < max && scan < loceol &&
3664 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3665 scan += UTF8SKIP(scan);
3669 while (scan < loceol && isSPACE_LC(*scan))
3676 while (hardcount < max && scan < loceol &&
3677 !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3678 scan += UTF8SKIP(scan);
3682 while (scan < loceol && !isSPACE(*scan))
3687 PL_reg_flags |= RF_tainted;
3690 while (hardcount < max && scan < loceol &&
3691 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3692 scan += UTF8SKIP(scan);
3696 while (scan < loceol && !isSPACE_LC(*scan))
3703 while (hardcount < max && scan < loceol &&
3704 swash_fetch(PL_utf8_digit,(U8*)scan)) {
3705 scan += UTF8SKIP(scan);
3709 while (scan < loceol && isDIGIT(*scan))
3716 while (hardcount < max && scan < loceol &&
3717 !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3718 scan += UTF8SKIP(scan);
3722 while (scan < loceol && !isDIGIT(*scan))
3726 default: /* Called on something of 0 width. */
3727 break; /* So match right here or not at all. */
3733 c = scan - PL_reginput;
3738 SV *prop = sv_newmortal();
3741 PerlIO_printf(Perl_debug_log,
3742 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3743 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3750 - regrepeat_hard - repeatedly match something, report total lenth and length
3752 * The repeater is supposed to have constant length.
3756 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3758 register char *scan;
3759 register char *start;
3760 register char *loceol = PL_regeol;
3762 I32 count = 0, res = 1;
3767 start = PL_reginput;
3768 if (DO_UTF8(PL_reg_sv)) {
3769 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3772 while (start < PL_reginput) {
3774 start += UTF8SKIP(start);
3785 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3787 *lp = l = PL_reginput - start;
3788 if (max != REG_INFTY && l*max < loceol - scan)
3789 loceol = scan + l*max;
3802 - regclass_swash - prepare the utf8 swash
3806 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3811 if (PL_regdata && PL_regdata->count) {
3814 if (PL_regdata->what[n] == 's') {
3815 SV *rv = (SV*)PL_regdata->data[n];
3816 AV *av = (AV*)SvRV((SV*)rv);
3819 si = *av_fetch(av, 0, FALSE);
3820 a = av_fetch(av, 1, FALSE);
3824 else if (si && doinit) {
3825 sw = swash_init("utf8", "", si, 1, 0);
3826 (void)av_store(av, 1, sw);
3838 - reginclass - determine if a character falls into a character class
3842 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3844 char flags = ANYOF_FLAGS(n);
3850 c = utf8_to_uv_simple(p, &len);
3854 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3855 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3856 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3859 if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256)
3862 SV *sw = regclass_swash(n, TRUE, 0);
3865 if (swash_fetch(sw, p))
3867 else if (flags & ANYOF_FOLD) {
3868 U8 tmpbuf[UTF8_MAXLEN+1];
3870 if (flags & ANYOF_LOCALE) {
3871 PL_reg_flags |= RF_tainted;
3872 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3875 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3876 if (swash_fetch(sw, tmpbuf))
3882 if (!match && c < 256) {
3883 if (ANYOF_BITMAP_TEST(n, c))
3885 else if (flags & ANYOF_FOLD) {
3888 if (flags & ANYOF_LOCALE) {
3889 PL_reg_flags |= RF_tainted;
3890 f = PL_fold_locale[c];
3894 if (f != c && ANYOF_BITMAP_TEST(n, f))
3898 if (!match && (flags & ANYOF_CLASS)) {
3899 PL_reg_flags |= RF_tainted;
3901 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3902 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3903 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3904 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3905 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3906 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3907 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3908 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3909 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3910 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3911 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3912 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3913 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3914 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3915 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3916 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3917 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3918 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3919 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3920 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3921 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3922 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3923 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3924 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3925 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3926 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3931 ) /* How's that for a conditional? */
3938 return (flags & ANYOF_INVERT) ? !match : match;
3942 S_reghop(pTHX_ U8 *s, I32 off)
3944 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3948 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3951 while (off-- && s < lim) {
3952 /* XXX could check well-formedness here */
3960 if (UTF8_IS_CONTINUED(*s)) {
3961 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3964 /* XXX could check well-formedness here */
3972 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3974 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3978 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
3981 while (off-- && s < lim) {
3982 /* XXX could check well-formedness here */
3992 if (UTF8_IS_CONTINUED(*s)) {
3993 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3996 /* XXX could check well-formedness here */
4012 restore_pos(pTHXo_ void *arg)
4014 if (PL_reg_eval_set) {
4015 if (PL_reg_oldsaved) {
4016 PL_reg_re->subbeg = PL_reg_oldsaved;
4017 PL_reg_re->sublen = PL_reg_oldsavedlen;
4018 RX_MATCH_COPIED_on(PL_reg_re);
4020 PL_reg_magic->mg_len = PL_reg_oldpos;
4021 PL_reg_eval_set = 0;
4022 PL_curpm = PL_reg_oldcurpm;