5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
120 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
121 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
122 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
126 static void restore_pos(pTHXo_ void *arg);
130 S_regcppush(pTHX_ I32 parenfloor)
132 int retval = PL_savestack_ix;
133 #define REGCP_PAREN_ELEMS 4
134 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
137 #define REGCP_OTHER_ELEMS 5
138 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
139 for (p = PL_regsize; p > parenfloor; p--) {
140 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
141 SSPUSHINT(PL_regendp[p]);
142 SSPUSHINT(PL_regstartp[p]);
143 SSPUSHPTR(PL_reg_start_tmp[p]);
146 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
147 SSPUSHINT(PL_regsize);
148 SSPUSHINT(*PL_reglastparen);
149 SSPUSHPTR(PL_reginput);
150 SSPUSHINT(paren_elems_to_push + (REGCP_PAREN_ELEMS - 1));
151 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
155 /* These are needed since we do not localize EVAL nodes: */
156 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
157 " Setting an EVAL scope, savestack=%"IVdf"\n", \
158 (IV)PL_savestack_ix)); cp = PL_savestack_ix
160 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
161 PerlIO_printf(Perl_debug_log, \
162 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
163 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
173 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
175 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
176 i = SSPOPINT; /* Parentheses elements to pop. */
177 input = (char *) SSPOPPTR;
178 *PL_reglastparen = SSPOPINT;
179 PL_regsize = SSPOPINT;
181 /* Now restore the parentheses context. */
182 for (i -= (REGCP_PAREN_ELEMS - 1); i > 0; i -= REGCP_PAREN_ELEMS) {
183 paren = (U32)SSPOPINT;
184 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
185 PL_regstartp[paren] = SSPOPINT;
187 if (paren <= *PL_reglastparen)
188 PL_regendp[paren] = tmps;
190 PerlIO_printf(Perl_debug_log,
191 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
192 (UV)paren, (IV)PL_regstartp[paren],
193 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
194 (IV)PL_regendp[paren],
195 (paren > *PL_reglastparen ? "(no)" : ""));
199 if (*PL_reglastparen + 1 <= PL_regnpar) {
200 PerlIO_printf(Perl_debug_log,
201 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
202 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
206 /* It would seem that the similar code in regtry()
207 * already takes care of this, and in fact it is in
208 * a better location to since this code can #if 0-ed out
209 * but the code in regtry() is needed or otherwise tests
210 * requiring null fields (pat.t#187 and split.t#{13,14}
211 * (as of patchlevel 7877) will fail. Then again,
212 * this code seems to be necessary or otherwise
213 * building DynaLoader will fail:
214 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
216 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
217 if (paren > PL_regsize)
218 PL_regstartp[paren] = -1;
219 PL_regendp[paren] = -1;
226 S_regcp_set_to(pTHX_ I32 ss)
228 I32 tmp = PL_savestack_ix;
230 PL_savestack_ix = ss;
232 PL_savestack_ix = tmp;
236 typedef struct re_cc_state
240 struct re_cc_state *prev;
245 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
247 #define TRYPAREN(paren, n, input) { \
250 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
251 PL_regendp[paren] = input - PL_bostr; \
254 PL_regendp[paren] = -1; \
256 if (regmatch(next)) \
259 PL_regendp[paren] = -1; \
264 * pregexec and friends
268 - pregexec - match a regexp against a string
271 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
272 char *strbeg, I32 minend, SV *screamer, U32 nosave)
273 /* strend: pointer to null at end of string */
274 /* strbeg: real beginning of string */
275 /* minend: end of match must be >=minend after stringarg. */
276 /* nosave: For optimizations. */
279 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
280 nosave ? 0 : REXEC_COPY_STR);
284 S_cache_re(pTHX_ regexp *prog)
286 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
288 PL_regprogram = prog->program;
290 PL_regnpar = prog->nparens;
291 PL_regdata = prog->data;
296 * Need to implement the following flags for reg_anch:
298 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
300 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
301 * INTUIT_AUTORITATIVE_ML
302 * INTUIT_ONCE_NOML - Intuit can match in one location only.
305 * Another flag for this function: SECOND_TIME (so that float substrs
306 * with giant delta may be not rechecked).
309 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
311 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
312 Otherwise, only SvCUR(sv) is used to get strbeg. */
314 /* XXXX We assume that strpos is strbeg unless sv. */
316 /* XXXX Some places assume that there is a fixed substring.
317 An update may be needed if optimizer marks as "INTUITable"
318 RExen without fixed substrings. Similarly, it is assumed that
319 lengths of all the strings are no more than minlen, thus they
320 cannot come from lookahead.
321 (Or minlen should take into account lookahead.) */
323 /* A failure to find a constant substring means that there is no need to make
324 an expensive call to REx engine, thus we celebrate a failure. Similarly,
325 finding a substring too deep into the string means that less calls to
326 regtry() should be needed.
328 REx compiler's optimizer found 4 possible hints:
329 a) Anchored substring;
331 c) Whether we are anchored (beginning-of-line or \G);
332 d) First node (of those at offset 0) which may distingush positions;
333 We use a)b)d) and multiline-part of c), and try to find a position in the
334 string which does not contradict any of them.
337 /* Most of decisions we do here should have been done at compile time.
338 The nodes of the REx which we used for the search should have been
339 deleted from the finite automaton. */
342 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
343 char *strend, U32 flags, re_scream_pos_data *data)
345 register I32 start_shift;
346 /* Should be nonnegative! */
347 register I32 end_shift;
354 register char *other_last = Nullch; /* other substr checked before this */
355 char *check_at; /* check substr found at this pos */
357 char *i_strpos = strpos;
360 DEBUG_r( if (!PL_colorset) reginitcolors() );
361 DEBUG_r(PerlIO_printf(Perl_debug_log,
362 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
363 PL_colors[4],PL_colors[5],PL_colors[0],
366 (strlen(prog->precomp) > 60 ? "..." : ""),
368 (int)(strend - strpos > 60 ? 60 : strend - strpos),
369 strpos, PL_colors[1],
370 (strend - strpos > 60 ? "..." : ""))
373 if (prog->reganch & ROPT_UTF8)
374 PL_reg_flags |= RF_utf8;
376 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
377 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
380 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
382 check = prog->check_substr;
383 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
384 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
385 || ( (prog->reganch & ROPT_ANCH_BOL)
386 && !PL_multiline ) ); /* Check after \n? */
389 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
390 /* SvCUR is not set on references: SvRV and SvPVX overlap */
392 && (strpos != strbeg)) {
393 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
396 if (prog->check_offset_min == prog->check_offset_max) {
397 /* Substring at constant offset from beg-of-str... */
400 s = HOP3c(strpos, prog->check_offset_min, strend);
402 slen = SvCUR(check); /* >= 1 */
404 if ( strend - s > slen || strend - s < slen - 1
405 || (strend - s == slen && strend[-1] != '\n')) {
406 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
409 /* Now should match s[0..slen-2] */
411 if (slen && (*SvPVX(check) != *s
413 && memNE(SvPVX(check), s, slen)))) {
415 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
419 else if (*SvPVX(check) != *s
420 || ((slen = SvCUR(check)) > 1
421 && memNE(SvPVX(check), s, slen)))
423 goto success_at_start;
426 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
428 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
429 end_shift = prog->minlen - start_shift -
430 CHR_SVLEN(check) + (SvTAIL(check) != 0);
432 I32 end = prog->check_offset_max + CHR_SVLEN(check)
433 - (SvTAIL(check) != 0);
434 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
436 if (end_shift < eshift)
440 else { /* Can match at random position */
443 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
444 /* Should be nonnegative! */
445 end_shift = prog->minlen - start_shift -
446 CHR_SVLEN(check) + (SvTAIL(check) != 0);
449 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
451 Perl_croak(aTHX_ "panic: end_shift");
455 /* Find a possible match in the region s..strend by looking for
456 the "check" substring in the region corrected by start/end_shift. */
457 if (flags & REXEC_SCREAM) {
458 I32 p = -1; /* Internal iterator of scream. */
459 I32 *pp = data ? data->scream_pos : &p;
461 if (PL_screamfirst[BmRARE(check)] >= 0
462 || ( BmRARE(check) == '\n'
463 && (BmPREVIOUS(check) == SvCUR(check) - 1)
465 s = screaminstr(sv, check,
466 start_shift + (s - strbeg), end_shift, pp, 0);
470 *data->scream_olds = s;
473 s = fbm_instr(HOP3(s, start_shift, strend),
474 HOP3(strend, -end_shift, strbeg),
475 check, PL_multiline ? FBMrf_MULTILINE : 0);
477 /* Update the count-of-usability, remove useless subpatterns,
480 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
481 (s ? "Found" : "Did not find"),
482 ((check == prog->anchored_substr) ? "anchored" : "floating"),
484 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
486 PL_colors[1], (SvTAIL(check) ? "$" : ""),
487 (s ? " at offset " : "...\n") ) );
494 /* Finish the diagnostic message */
495 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
497 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
498 Start with the other substr.
499 XXXX no SCREAM optimization yet - and a very coarse implementation
500 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
501 *always* match. Probably should be marked during compile...
502 Probably it is right to do no SCREAM here...
505 if (prog->float_substr && prog->anchored_substr) {
506 /* Take into account the "other" substring. */
507 /* XXXX May be hopelessly wrong for UTF... */
510 if (check == prog->float_substr) {
513 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
516 t = s - prog->check_offset_max;
517 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
518 && (!(prog->reganch & ROPT_UTF8)
519 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
524 t = HOP3c(t, prog->anchored_offset, strend);
525 if (t < other_last) /* These positions already checked */
527 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
530 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
531 /* On end-of-str: see comment below. */
532 s = fbm_instr((unsigned char*)t,
533 HOP3(HOP3(last1, prog->anchored_offset, strend)
534 + SvCUR(prog->anchored_substr),
535 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
536 prog->anchored_substr,
537 PL_multiline ? FBMrf_MULTILINE : 0);
538 DEBUG_r(PerlIO_printf(Perl_debug_log,
539 "%s anchored substr `%s%.*s%s'%s",
540 (s ? "Found" : "Contradicts"),
542 (int)(SvCUR(prog->anchored_substr)
543 - (SvTAIL(prog->anchored_substr)!=0)),
544 SvPVX(prog->anchored_substr),
545 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
547 if (last1 >= last2) {
548 DEBUG_r(PerlIO_printf(Perl_debug_log,
549 ", giving up...\n"));
552 DEBUG_r(PerlIO_printf(Perl_debug_log,
553 ", trying floating at offset %ld...\n",
554 (long)(HOP3c(s1, 1, strend) - i_strpos)));
555 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
556 s = HOP3c(last, 1, strend);
560 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
561 (long)(s - i_strpos)));
562 t = HOP3c(s, -prog->anchored_offset, strbeg);
563 other_last = HOP3c(s, 1, strend);
571 else { /* Take into account the floating substring. */
575 t = HOP3c(s, -start_shift, strbeg);
577 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
578 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
579 last = HOP3c(t, prog->float_max_offset, strend);
580 s = HOP3c(t, prog->float_min_offset, strend);
583 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
584 /* fbm_instr() takes into account exact value of end-of-str
585 if the check is SvTAIL(ed). Since false positives are OK,
586 and end-of-str is not later than strend we are OK. */
587 s = fbm_instr((unsigned char*)s,
588 (unsigned char*)last + SvCUR(prog->float_substr)
589 - (SvTAIL(prog->float_substr)!=0),
590 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
591 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
592 (s ? "Found" : "Contradicts"),
594 (int)(SvCUR(prog->float_substr)
595 - (SvTAIL(prog->float_substr)!=0)),
596 SvPVX(prog->float_substr),
597 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
600 DEBUG_r(PerlIO_printf(Perl_debug_log,
601 ", giving up...\n"));
604 DEBUG_r(PerlIO_printf(Perl_debug_log,
605 ", trying anchored starting at offset %ld...\n",
606 (long)(s1 + 1 - i_strpos)));
608 s = HOP3c(t, 1, strend);
612 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
613 (long)(s - i_strpos)));
614 other_last = s; /* Fix this later. --Hugo */
623 t = s - prog->check_offset_max;
624 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
625 && (!(prog->reganch & ROPT_UTF8)
626 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
628 /* Fixed substring is found far enough so that the match
629 cannot start at strpos. */
631 if (ml_anch && t[-1] != '\n') {
632 /* Eventually fbm_*() should handle this, but often
633 anchored_offset is not 0, so this check will not be wasted. */
634 /* XXXX In the code below we prefer to look for "^" even in
635 presence of anchored substrings. And we search even
636 beyond the found float position. These pessimizations
637 are historical artefacts only. */
639 while (t < strend - prog->minlen) {
641 if (t < check_at - prog->check_offset_min) {
642 if (prog->anchored_substr) {
643 /* Since we moved from the found position,
644 we definitely contradict the found anchored
645 substr. Due to the above check we do not
646 contradict "check" substr.
647 Thus we can arrive here only if check substr
648 is float. Redo checking for "other"=="fixed".
651 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
652 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
653 goto do_other_anchored;
655 /* We don't contradict the found floating substring. */
656 /* XXXX Why not check for STCLASS? */
658 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
659 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
662 /* Position contradicts check-string */
663 /* XXXX probably better to look for check-string
664 than for "\n", so one should lower the limit for t? */
665 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
666 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
667 other_last = strpos = s = t + 1;
672 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
673 PL_colors[0],PL_colors[1]));
677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
678 PL_colors[0],PL_colors[1]));
682 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
685 /* The found string does not prohibit matching at strpos,
686 - no optimization of calling REx engine can be performed,
687 unless it was an MBOL and we are not after MBOL,
688 or a future STCLASS check will fail this. */
690 /* Even in this situation we may use MBOL flag if strpos is offset
691 wrt the start of the string. */
692 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
693 && (strpos != strbeg) && strpos[-1] != '\n'
694 /* May be due to an implicit anchor of m{.*foo} */
695 && !(prog->reganch & ROPT_IMPLICIT))
700 DEBUG_r( if (ml_anch)
701 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
702 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
705 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
706 && prog->check_substr /* Could be deleted already */
707 && --BmUSEFUL(prog->check_substr) < 0
708 && prog->check_substr == prog->float_substr)
710 /* If flags & SOMETHING - do not do it many times on the same match */
711 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
712 SvREFCNT_dec(prog->check_substr);
713 prog->check_substr = Nullsv; /* disable */
714 prog->float_substr = Nullsv; /* clear */
715 check = Nullsv; /* abort */
717 /* XXXX This is a remnant of the old implementation. It
718 looks wasteful, since now INTUIT can use many
720 prog->reganch &= ~RE_USE_INTUIT;
727 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
728 if (prog->regstclass) {
729 /* minlen == 0 is possible if regstclass is \b or \B,
730 and the fixed substr is ''$.
731 Since minlen is already taken into account, s+1 is before strend;
732 accidentally, minlen >= 1 guaranties no false positives at s + 1
733 even for \b or \B. But (minlen? 1 : 0) below assumes that
734 regstclass does not come from lookahead... */
735 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
736 This leaves EXACTF only, which is dealt with in find_byclass(). */
737 U8* str = (U8*)STRING(prog->regstclass);
738 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
739 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
741 char *endpos = (prog->anchored_substr || ml_anch)
742 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
743 : (prog->float_substr
744 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
747 char *startpos = strbeg;
750 if (prog->reganch & ROPT_UTF8) {
751 PL_regdata = prog->data;
754 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
759 if (endpos == strend) {
760 DEBUG_r( PerlIO_printf(Perl_debug_log,
761 "Could not match STCLASS...\n") );
764 DEBUG_r( PerlIO_printf(Perl_debug_log,
765 "This position contradicts STCLASS...\n") );
766 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
768 /* Contradict one of substrings */
769 if (prog->anchored_substr) {
770 if (prog->anchored_substr == check) {
771 DEBUG_r( what = "anchored" );
773 s = HOP3c(t, 1, strend);
774 if (s + start_shift + end_shift > strend) {
775 /* XXXX Should be taken into account earlier? */
776 DEBUG_r( PerlIO_printf(Perl_debug_log,
777 "Could not match STCLASS...\n") );
782 DEBUG_r( PerlIO_printf(Perl_debug_log,
783 "Looking for %s substr starting at offset %ld...\n",
784 what, (long)(s + start_shift - i_strpos)) );
787 /* Have both, check_string is floating */
788 if (t + start_shift >= check_at) /* Contradicts floating=check */
789 goto retry_floating_check;
790 /* Recheck anchored substring, but not floating... */
794 DEBUG_r( PerlIO_printf(Perl_debug_log,
795 "Looking for anchored substr starting at offset %ld...\n",
796 (long)(other_last - i_strpos)) );
797 goto do_other_anchored;
799 /* Another way we could have checked stclass at the
800 current position only: */
805 DEBUG_r( PerlIO_printf(Perl_debug_log,
806 "Looking for /%s^%s/m starting at offset %ld...\n",
807 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
810 if (!prog->float_substr) /* Could have been deleted */
812 /* Check is floating subtring. */
813 retry_floating_check:
814 t = check_at - start_shift;
815 DEBUG_r( what = "floating" );
816 goto hop_and_restart;
819 PerlIO_printf(Perl_debug_log,
820 "By STCLASS: moving %ld --> %ld\n",
821 (long)(t - i_strpos), (long)(s - i_strpos));
823 PerlIO_printf(Perl_debug_log,
824 "Does not contradict STCLASS...\n") );
827 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
828 PL_colors[4], (check ? "Guessed" : "Giving up"),
829 PL_colors[5], (long)(s - i_strpos)) );
832 fail_finish: /* Substring not found */
833 if (prog->check_substr) /* could be removed already */
834 BmUSEFUL(prog->check_substr) += 5; /* hooray */
836 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
837 PL_colors[4],PL_colors[5]));
841 /* We know what class REx starts with. Try to find this position... */
843 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
845 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
851 register I32 tmp = 1; /* Scratch variable? */
852 register bool do_utf8 = DO_UTF8(PL_reg_sv);
854 /* We know what class it must start with. */
858 if (reginclass(c, (U8*)s, do_utf8)) {
859 if (tmp && (norun || regtry(prog, s)))
866 s += do_utf8 ? UTF8SKIP(s) : 1;
873 c1 = to_utf8_lower((U8*)m);
874 c2 = to_utf8_upper((U8*)m);
885 c2 = PL_fold_locale[c1];
890 e = s; /* Due to minlen logic of intuit() */
896 if ( utf8_to_uv_simple((U8*)s, &len) == c1
903 UV c = utf8_to_uv_simple((U8*)s, &len);
904 if ( (c == c1 || c == c2) && regtry(prog, s) )
913 && (ln == 1 || !(OP(c) == EXACTF
915 : ibcmp_locale(s, m, ln)))
916 && (norun || regtry(prog, s)) )
922 if ( (*(U8*)s == c1 || *(U8*)s == c2)
923 && (ln == 1 || !(OP(c) == EXACTF
925 : ibcmp_locale(s, m, ln)))
926 && (norun || regtry(prog, s)) )
933 PL_reg_flags |= RF_tainted;
940 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
942 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
944 tmp = ((OP(c) == BOUND ?
945 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
947 if (tmp == !(OP(c) == BOUND ?
948 swash_fetch(PL_utf8_alnum, (U8*)s) :
949 isALNUM_LC_utf8((U8*)s)))
952 if ((norun || regtry(prog, s)))
959 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
960 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
963 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
965 if ((norun || regtry(prog, s)))
971 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
975 PL_reg_flags |= RF_tainted;
982 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
984 tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
986 tmp = ((OP(c) == NBOUND ?
987 isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
989 if (tmp == !(OP(c) == NBOUND ?
990 swash_fetch(PL_utf8_alnum, (U8*)s) :
991 isALNUM_LC_utf8((U8*)s)))
993 else if ((norun || regtry(prog, s)))
999 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
1000 tmp = ((OP(c) == NBOUND ?
1001 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1002 while (s < strend) {
1004 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1006 else if ((norun || regtry(prog, s)))
1011 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1016 while (s < strend) {
1017 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1018 if (tmp && (norun || regtry(prog, s)))
1029 while (s < strend) {
1031 if (tmp && (norun || regtry(prog, s)))
1043 PL_reg_flags |= RF_tainted;
1045 while (s < strend) {
1046 if (isALNUM_LC_utf8((U8*)s)) {
1047 if (tmp && (norun || regtry(prog, s)))
1058 while (s < strend) {
1059 if (isALNUM_LC(*s)) {
1060 if (tmp && (norun || regtry(prog, s)))
1073 while (s < strend) {
1074 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1075 if (tmp && (norun || regtry(prog, s)))
1086 while (s < strend) {
1088 if (tmp && (norun || regtry(prog, s)))
1100 PL_reg_flags |= RF_tainted;
1102 while (s < strend) {
1103 if (!isALNUM_LC_utf8((U8*)s)) {
1104 if (tmp && (norun || regtry(prog, s)))
1115 while (s < strend) {
1116 if (!isALNUM_LC(*s)) {
1117 if (tmp && (norun || regtry(prog, s)))
1130 while (s < strend) {
1131 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1132 if (tmp && (norun || regtry(prog, s)))
1143 while (s < strend) {
1145 if (tmp && (norun || regtry(prog, s)))
1157 PL_reg_flags |= RF_tainted;
1159 while (s < strend) {
1160 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1161 if (tmp && (norun || regtry(prog, s)))
1172 while (s < strend) {
1173 if (isSPACE_LC(*s)) {
1174 if (tmp && (norun || regtry(prog, s)))
1187 while (s < strend) {
1188 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1189 if (tmp && (norun || regtry(prog, s)))
1200 while (s < strend) {
1202 if (tmp && (norun || regtry(prog, s)))
1214 PL_reg_flags |= RF_tainted;
1216 while (s < strend) {
1217 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1218 if (tmp && (norun || regtry(prog, s)))
1229 while (s < strend) {
1230 if (!isSPACE_LC(*s)) {
1231 if (tmp && (norun || regtry(prog, s)))
1244 while (s < strend) {
1245 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1246 if (tmp && (norun || regtry(prog, s)))
1257 while (s < strend) {
1259 if (tmp && (norun || regtry(prog, s)))
1271 PL_reg_flags |= RF_tainted;
1273 while (s < strend) {
1274 if (isDIGIT_LC_utf8((U8*)s)) {
1275 if (tmp && (norun || regtry(prog, s)))
1286 while (s < strend) {
1287 if (isDIGIT_LC(*s)) {
1288 if (tmp && (norun || regtry(prog, s)))
1301 while (s < strend) {
1302 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1303 if (tmp && (norun || regtry(prog, s)))
1314 while (s < strend) {
1316 if (tmp && (norun || regtry(prog, s)))
1328 PL_reg_flags |= RF_tainted;
1330 while (s < strend) {
1331 if (!isDIGIT_LC_utf8((U8*)s)) {
1332 if (tmp && (norun || regtry(prog, s)))
1343 while (s < strend) {
1344 if (!isDIGIT_LC(*s)) {
1345 if (tmp && (norun || regtry(prog, s)))
1357 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1366 - regexec_flags - match a regexp against a string
1369 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1370 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1371 /* strend: pointer to null at end of string */
1372 /* strbeg: real beginning of string */
1373 /* minend: end of match must be >=minend after stringarg. */
1374 /* data: May be used for some additional optimizations. */
1375 /* nosave: For optimizations. */
1378 register regnode *c;
1379 register char *startpos = stringarg;
1380 I32 minlen; /* must match at least this many chars */
1381 I32 dontbother = 0; /* how many characters not to try at end */
1382 /* I32 start_shift = 0; */ /* Offset of the start to find
1383 constant substr. */ /* CC */
1384 I32 end_shift = 0; /* Same for the end. */ /* CC */
1385 I32 scream_pos = -1; /* Internal iterator of scream. */
1387 SV* oreplsv = GvSV(PL_replgv);
1388 bool do_utf8 = DO_UTF8(sv);
1394 PL_regnarrate = PL_debug & 512;
1397 /* Be paranoid... */
1398 if (prog == NULL || startpos == NULL) {
1399 Perl_croak(aTHX_ "NULL regexp parameter");
1403 minlen = prog->minlen;
1405 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1408 if (strend - startpos < minlen) goto phooey;
1411 if (startpos == strbeg) /* is ^ valid at stringarg? */
1414 if (prog->reganch & ROPT_UTF8 && do_utf8) {
1415 U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
1416 PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0);
1419 PL_regprev = (U32)stringarg[-1];
1420 if (!PL_multiline && PL_regprev == '\n')
1421 PL_regprev = '\0'; /* force ^ to NOT match */
1424 /* Check validity of program. */
1425 if (UCHARAT(prog->program) != REG_MAGIC) {
1426 Perl_croak(aTHX_ "corrupted regexp program");
1430 PL_reg_eval_set = 0;
1433 if (prog->reganch & ROPT_UTF8)
1434 PL_reg_flags |= RF_utf8;
1436 /* Mark beginning of line for ^ and lookbehind. */
1437 PL_regbol = startpos;
1441 /* Mark end of line for $ (and such) */
1444 /* see how far we have to get to not match where we matched before */
1445 PL_regtill = startpos+minend;
1447 /* We start without call_cc context. */
1450 /* If there is a "must appear" string, look for it. */
1453 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1456 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1457 PL_reg_ganch = startpos;
1458 else if (sv && SvTYPE(sv) >= SVt_PVMG
1460 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1461 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1462 if (prog->reganch & ROPT_ANCH_GPOS) {
1463 if (s > PL_reg_ganch)
1468 else /* pos() not defined */
1469 PL_reg_ganch = strbeg;
1472 if (do_utf8 == (UTF!=0) &&
1473 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1474 re_scream_pos_data d;
1476 d.scream_olds = &scream_olds;
1477 d.scream_pos = &scream_pos;
1478 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1480 goto phooey; /* not present */
1483 DEBUG_r( if (!PL_colorset) reginitcolors() );
1484 DEBUG_r(PerlIO_printf(Perl_debug_log,
1485 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1486 PL_colors[4],PL_colors[5],PL_colors[0],
1489 (strlen(prog->precomp) > 60 ? "..." : ""),
1491 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1492 startpos, PL_colors[1],
1493 (strend - startpos > 60 ? "..." : ""))
1496 /* Simplest case: anchored match need be tried only once. */
1497 /* [unless only anchor is BOL and multiline is set] */
1498 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1499 if (s == startpos && regtry(prog, startpos))
1501 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1502 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1507 dontbother = minlen - 1;
1508 end = HOP3c(strend, -dontbother, strbeg) - 1;
1509 /* for multiline we only have to try after newlines */
1510 if (prog->check_substr) {
1514 if (regtry(prog, s))
1519 if (prog->reganch & RE_USE_INTUIT) {
1520 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1531 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1532 if (regtry(prog, s))
1539 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1540 if (regtry(prog, PL_reg_ganch))
1545 /* Messy cases: unanchored match. */
1546 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1547 /* we have /x+whatever/ */
1548 /* it must be a one character string (XXXX Except UTF?) */
1549 char ch = SvPVX(prog->anchored_substr)[0];
1555 while (s < strend) {
1557 DEBUG_r( did_match = 1 );
1558 if (regtry(prog, s)) goto got_it;
1560 while (s < strend && *s == ch)
1567 while (s < strend) {
1569 DEBUG_r( did_match = 1 );
1570 if (regtry(prog, s)) goto got_it;
1572 while (s < strend && *s == ch)
1578 DEBUG_r(did_match ||
1579 PerlIO_printf(Perl_debug_log,
1580 "Did not find anchored character...\n"));
1583 else if (do_utf8 == (UTF!=0) &&
1584 (prog->anchored_substr != Nullsv
1585 || (prog->float_substr != Nullsv
1586 && prog->float_max_offset < strend - s))) {
1587 SV *must = prog->anchored_substr
1588 ? prog->anchored_substr : prog->float_substr;
1590 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1592 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1593 char *last = HOP3c(strend, /* Cannot start after this */
1594 -(I32)(CHR_SVLEN(must)
1595 - (SvTAIL(must) != 0) + back_min), strbeg);
1596 char *last1; /* Last position checked before */
1602 last1 = HOPc(s, -1);
1604 last1 = s - 1; /* bogus */
1606 /* XXXX check_substr already used to find `s', can optimize if
1607 check_substr==must. */
1609 dontbother = end_shift;
1610 strend = HOPc(strend, -dontbother);
1611 while ( (s <= last) &&
1612 ((flags & REXEC_SCREAM)
1613 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1614 end_shift, &scream_pos, 0))
1615 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1616 (unsigned char*)strend, must,
1617 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1618 DEBUG_r( did_match = 1 );
1619 if (HOPc(s, -back_max) > last1) {
1620 last1 = HOPc(s, -back_min);
1621 s = HOPc(s, -back_max);
1624 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1626 last1 = HOPc(s, -back_min);
1630 while (s <= last1) {
1631 if (regtry(prog, s))
1637 while (s <= last1) {
1638 if (regtry(prog, s))
1644 DEBUG_r(did_match ||
1645 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1646 ((must == prog->anchored_substr)
1647 ? "anchored" : "floating"),
1649 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1651 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1654 else if ((c = prog->regstclass)) {
1655 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1656 /* don't bother with what can't match */
1657 strend = HOPc(strend, -(minlen - 1));
1659 SV *prop = sv_newmortal();
1661 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1663 if (find_byclass(prog, c, s, strend, startpos, 0))
1665 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1669 if (prog->float_substr != Nullsv) { /* Trim the end. */
1672 if (flags & REXEC_SCREAM) {
1673 last = screaminstr(sv, prog->float_substr, s - strbeg,
1674 end_shift, &scream_pos, 1); /* last one */
1676 last = scream_olds; /* Only one occurrence. */
1680 char *little = SvPV(prog->float_substr, len);
1682 if (SvTAIL(prog->float_substr)) {
1683 if (memEQ(strend - len + 1, little, len - 1))
1684 last = strend - len + 1;
1685 else if (!PL_multiline)
1686 last = memEQ(strend - len, little, len)
1687 ? strend - len : Nullch;
1693 last = rninstr(s, strend, little, little + len);
1695 last = strend; /* matching `$' */
1699 DEBUG_r(PerlIO_printf(Perl_debug_log,
1700 "%sCan't trim the tail, match fails (should not happen)%s\n",
1701 PL_colors[4],PL_colors[5]));
1702 goto phooey; /* Should not happen! */
1704 dontbother = strend - last + prog->float_min_offset;
1706 if (minlen && (dontbother < minlen))
1707 dontbother = minlen - 1;
1708 strend -= dontbother; /* this one's always in bytes! */
1709 /* We don't know much -- general case. */
1712 if (regtry(prog, s))
1721 if (regtry(prog, s))
1723 } while (s++ < strend);
1731 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1733 if (PL_reg_eval_set) {
1734 /* Preserve the current value of $^R */
1735 if (oreplsv != GvSV(PL_replgv))
1736 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1737 restored, the value remains
1739 restore_pos(aTHXo_ 0);
1742 /* make sure $`, $&, $', and $digit will work later */
1743 if ( !(flags & REXEC_NOT_FIRST) ) {
1744 if (RX_MATCH_COPIED(prog)) {
1745 Safefree(prog->subbeg);
1746 RX_MATCH_COPIED_off(prog);
1748 if (flags & REXEC_COPY_STR) {
1749 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1751 s = savepvn(strbeg, i);
1754 RX_MATCH_COPIED_on(prog);
1757 prog->subbeg = strbeg;
1758 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1765 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1766 PL_colors[4],PL_colors[5]));
1767 if (PL_reg_eval_set)
1768 restore_pos(aTHXo_ 0);
1773 - regtry - try match at specific point
1775 STATIC I32 /* 0 failure, 1 success */
1776 S_regtry(pTHX_ regexp *prog, char *startpos)
1784 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1786 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1789 PL_reg_eval_set = RS_init;
1791 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1792 (IV)(PL_stack_sp - PL_stack_base));
1794 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1795 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1796 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1798 /* Apparently this is not needed, judging by wantarray. */
1799 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1800 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1803 /* Make $_ available to executed code. */
1804 if (PL_reg_sv != DEFSV) {
1805 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1810 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1811 && (mg = mg_find(PL_reg_sv, 'g')))) {
1812 /* prepare for quick setting of pos */
1813 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1814 mg = mg_find(PL_reg_sv, 'g');
1818 PL_reg_oldpos = mg->mg_len;
1819 SAVEDESTRUCTOR_X(restore_pos, 0);
1822 Newz(22,PL_reg_curpm, 1, PMOP);
1823 PL_reg_curpm->op_pmregexp = prog;
1824 PL_reg_oldcurpm = PL_curpm;
1825 PL_curpm = PL_reg_curpm;
1826 if (RX_MATCH_COPIED(prog)) {
1827 /* Here is a serious problem: we cannot rewrite subbeg,
1828 since it may be needed if this match fails. Thus
1829 $` inside (?{}) could fail... */
1830 PL_reg_oldsaved = prog->subbeg;
1831 PL_reg_oldsavedlen = prog->sublen;
1832 RX_MATCH_COPIED_off(prog);
1835 PL_reg_oldsaved = Nullch;
1836 prog->subbeg = PL_bostr;
1837 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1839 prog->startp[0] = startpos - PL_bostr;
1840 PL_reginput = startpos;
1841 PL_regstartp = prog->startp;
1842 PL_regendp = prog->endp;
1843 PL_reglastparen = &prog->lastparen;
1844 prog->lastparen = 0;
1846 DEBUG_r(PL_reg_starttry = startpos);
1847 if (PL_reg_start_tmpl <= prog->nparens) {
1848 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1849 if(PL_reg_start_tmp)
1850 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1852 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1855 /* XXXX What this code is doing here?!!! There should be no need
1856 to do this again and again, PL_reglastparen should take care of
1859 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1860 * Actually, the code in regcppop() (which Ilya may be meaning by
1861 * PL_reglastparen), is not needed at all by the test suite
1862 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1863 * enough, for building DynaLoader, or otherwise this
1864 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1865 * will happen. Meanwhile, this code *is* needed for the
1866 * above-mentioned test suite tests to succeed. The common theme
1867 * on those tests seems to be returning null fields from matches.
1872 if (prog->nparens) {
1873 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1880 if (regmatch(prog->program + 1)) {
1881 prog->endp[0] = PL_reginput - PL_bostr;
1884 REGCP_UNWIND(lastcp);
1888 #define RE_UNWIND_BRANCH 1
1889 #define RE_UNWIND_BRANCHJ 2
1893 typedef struct { /* XX: makes sense to enlarge it... */
1897 } re_unwind_generic_t;
1910 } re_unwind_branch_t;
1912 typedef union re_unwind_t {
1914 re_unwind_generic_t generic;
1915 re_unwind_branch_t branch;
1919 - regmatch - main matching routine
1921 * Conceptually the strategy is simple: check to see whether the current
1922 * node matches, call self recursively to see whether the rest matches,
1923 * and then act accordingly. In practice we make some effort to avoid
1924 * recursion, in particular by going through "ordinary" nodes (that don't
1925 * need to know whether the rest of the match failed) by a loop instead of
1928 /* [lwall] I've hoisted the register declarations to the outer block in order to
1929 * maybe save a little bit of pushing and popping on the stack. It also takes
1930 * advantage of machines that use a register save mask on subroutine entry.
1932 STATIC I32 /* 0 failure, 1 success */
1933 S_regmatch(pTHX_ regnode *prog)
1935 register regnode *scan; /* Current node. */
1936 regnode *next; /* Next node. */
1937 regnode *inner; /* Next node in internal branch. */
1938 register I32 nextchr; /* renamed nextchr - nextchar colides with
1939 function of same name */
1940 register I32 n; /* no or next */
1941 register I32 ln; /* len or last */
1942 register char *s; /* operand or save */
1943 register char *locinput = PL_reginput;
1944 register I32 c1, c2, paren; /* case fold search, parenth */
1945 int minmod = 0, sw = 0, logical = 0;
1947 I32 firstcp = PL_savestack_ix;
1948 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1954 /* Note that nextchr is a byte even in UTF */
1955 nextchr = UCHARAT(locinput);
1957 while (scan != NULL) {
1958 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1960 # define sayYES goto yes
1961 # define sayNO goto no
1962 # define sayYES_FINAL goto yes_final
1963 # define sayYES_LOUD goto yes_loud
1964 # define sayNO_FINAL goto no_final
1965 # define sayNO_SILENT goto do_no
1966 # define saySAME(x) if (x) goto yes; else goto no
1967 # define REPORT_CODE_OFF 24
1969 # define sayYES return 1
1970 # define sayNO return 0
1971 # define sayYES_FINAL return 1
1972 # define sayYES_LOUD return 1
1973 # define sayNO_FINAL return 0
1974 # define sayNO_SILENT return 0
1975 # define saySAME(x) return x
1978 SV *prop = sv_newmortal();
1979 int docolor = *PL_colors[0];
1980 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1981 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
1982 /* The part of the string before starttry has one color
1983 (pref0_len chars), between starttry and current
1984 position another one (pref_len - pref0_len chars),
1985 after the current position the third one.
1986 We assume that pref0_len <= pref_len, otherwise we
1987 decrease pref0_len. */
1988 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1989 ? (5 + taill) - l : locinput - PL_bostr;
1992 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1994 pref0_len = pref_len - (locinput - PL_reg_starttry);
1995 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1996 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1997 ? (5 + taill) - pref_len : PL_regeol - locinput);
1998 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2002 if (pref0_len > pref_len)
2003 pref0_len = pref_len;
2004 regprop(prop, scan);
2005 PerlIO_printf(Perl_debug_log,
2006 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2007 (IV)(locinput - PL_bostr),
2008 PL_colors[4], pref0_len,
2009 locinput - pref_len, PL_colors[5],
2010 PL_colors[2], pref_len - pref0_len,
2011 locinput - pref_len + pref0_len, PL_colors[3],
2012 (docolor ? "" : "> <"),
2013 PL_colors[0], l, locinput, PL_colors[1],
2014 15 - l - pref_len + 1,
2016 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2020 next = scan + NEXT_OFF(scan);
2026 if (locinput == PL_bostr
2027 ? PL_regprev == '\n'
2029 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2031 /* regtill = regbol; */
2036 if (locinput == PL_bostr
2037 ? PL_regprev == '\n'
2038 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2044 if (locinput == PL_bostr)
2048 if (locinput == PL_reg_ganch)
2058 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2063 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2065 if (PL_regeol - locinput > 1)
2069 if (PL_regeol != locinput)
2074 locinput += PL_utf8skip[nextchr];
2075 if (locinput > PL_regeol)
2077 nextchr = UCHARAT(locinput);
2080 if (!nextchr && locinput >= PL_regeol)
2082 nextchr = UCHARAT(++locinput);
2085 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2088 locinput += PL_utf8skip[nextchr];
2089 if (locinput > PL_regeol)
2091 nextchr = UCHARAT(locinput);
2094 nextchr = UCHARAT(++locinput);
2099 if (do_utf8 != (UTF!=0)) {
2107 if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len))
2116 if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len))
2122 nextchr = UCHARAT(locinput);
2125 /* Inline the first character, for speed. */
2126 if (UCHARAT(s) != nextchr)
2128 if (PL_regeol - locinput < ln)
2130 if (ln > 1 && memNE(s, locinput, ln))
2133 nextchr = UCHARAT(locinput);
2136 PL_reg_flags |= RF_tainted;
2146 c1 = OP(scan) == EXACTF;
2148 if (l >= PL_regeol) {
2151 if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2152 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2154 s += UTF ? UTF8SKIP(s) : 1;
2158 nextchr = UCHARAT(locinput);
2162 /* Inline the first character, for speed. */
2163 if (UCHARAT(s) != nextchr &&
2164 UCHARAT(s) != ((OP(scan) == EXACTF)
2165 ? PL_fold : PL_fold_locale)[nextchr])
2167 if (PL_regeol - locinput < ln)
2169 if (ln > 1 && (OP(scan) == EXACTF
2170 ? ibcmp(s, locinput, ln)
2171 : ibcmp_locale(s, locinput, ln)))
2174 nextchr = UCHARAT(locinput);
2178 if (!reginclass(scan, (U8*)locinput, do_utf8))
2180 if (locinput >= PL_regeol)
2182 locinput += PL_utf8skip[nextchr];
2183 nextchr = UCHARAT(locinput);
2187 nextchr = UCHARAT(locinput);
2188 if (!reginclass(scan, (U8*)locinput, do_utf8))
2190 if (!nextchr && locinput >= PL_regeol)
2192 nextchr = UCHARAT(++locinput);
2196 PL_reg_flags |= RF_tainted;
2202 if (!(OP(scan) == ALNUM
2203 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2204 : isALNUM_LC_utf8((U8*)locinput)))
2208 locinput += PL_utf8skip[nextchr];
2209 nextchr = UCHARAT(locinput);
2212 if (!(OP(scan) == ALNUM
2213 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2215 nextchr = UCHARAT(++locinput);
2218 PL_reg_flags |= RF_tainted;
2221 if (!nextchr && locinput >= PL_regeol)
2224 if (OP(scan) == NALNUM
2225 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2226 : isALNUM_LC_utf8((U8*)locinput))
2230 locinput += PL_utf8skip[nextchr];
2231 nextchr = UCHARAT(locinput);
2234 if (OP(scan) == NALNUM
2235 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2237 nextchr = UCHARAT(++locinput);
2241 PL_reg_flags |= RF_tainted;
2245 /* was last char in word? */
2247 if (locinput == PL_regbol)
2250 U8 *r = reghop((U8*)locinput, -1);
2252 ln = utf8_to_uv(r, s - (char*)r, 0, 0);
2254 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2255 ln = isALNUM_uni(ln);
2256 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2259 ln = isALNUM_LC_uni(ln);
2260 n = isALNUM_LC_utf8((U8*)locinput);
2264 ln = (locinput != PL_regbol) ?
2265 UCHARAT(locinput - 1) : PL_regprev;
2266 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2268 n = isALNUM(nextchr);
2271 ln = isALNUM_LC(ln);
2272 n = isALNUM_LC(nextchr);
2275 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2276 OP(scan) == BOUNDL))
2280 PL_reg_flags |= RF_tainted;
2286 if (UTF8_IS_CONTINUED(nextchr)) {
2287 if (!(OP(scan) == SPACE
2288 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2289 : isSPACE_LC_utf8((U8*)locinput)))
2293 locinput += PL_utf8skip[nextchr];
2294 nextchr = UCHARAT(locinput);
2297 if (!(OP(scan) == SPACE
2298 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2300 nextchr = UCHARAT(++locinput);
2303 if (!(OP(scan) == SPACE
2304 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2306 nextchr = UCHARAT(++locinput);
2310 PL_reg_flags |= RF_tainted;
2313 if (!nextchr && locinput >= PL_regeol)
2316 if (OP(scan) == NSPACE
2317 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2318 : isSPACE_LC_utf8((U8*)locinput))
2322 locinput += PL_utf8skip[nextchr];
2323 nextchr = UCHARAT(locinput);
2326 if (OP(scan) == NSPACE
2327 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2329 nextchr = UCHARAT(++locinput);
2332 PL_reg_flags |= RF_tainted;
2338 if (!(OP(scan) == DIGIT
2339 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2340 : isDIGIT_LC_utf8((U8*)locinput)))
2344 locinput += PL_utf8skip[nextchr];
2345 nextchr = UCHARAT(locinput);
2348 if (!(OP(scan) == DIGIT
2349 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2351 nextchr = UCHARAT(++locinput);
2354 PL_reg_flags |= RF_tainted;
2357 if (!nextchr && locinput >= PL_regeol)
2360 if (OP(scan) == NDIGIT
2361 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2362 : isDIGIT_LC_utf8((U8*)locinput))
2366 locinput += PL_utf8skip[nextchr];
2367 nextchr = UCHARAT(locinput);
2370 if (OP(scan) == NDIGIT
2371 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2373 nextchr = UCHARAT(++locinput);
2376 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2378 locinput += PL_utf8skip[nextchr];
2379 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2380 locinput += UTF8SKIP(locinput);
2381 if (locinput > PL_regeol)
2383 nextchr = UCHARAT(locinput);
2386 PL_reg_flags |= RF_tainted;
2390 n = ARG(scan); /* which paren pair */
2391 ln = PL_regstartp[n];
2392 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2393 if (*PL_reglastparen < n || ln == -1)
2394 sayNO; /* Do not match unless seen CLOSEn. */
2395 if (ln == PL_regendp[n])
2399 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2401 char *e = PL_bostr + PL_regendp[n];
2403 * Note that we can't do the "other character" lookup trick as
2404 * in the 8-bit case (no pun intended) because in Unicode we
2405 * have to map both upper and title case to lower case.
2407 if (OP(scan) == REFF) {
2411 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2421 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2428 nextchr = UCHARAT(locinput);
2432 /* Inline the first character, for speed. */
2433 if (UCHARAT(s) != nextchr &&
2435 (UCHARAT(s) != ((OP(scan) == REFF
2436 ? PL_fold : PL_fold_locale)[nextchr]))))
2438 ln = PL_regendp[n] - ln;
2439 if (locinput + ln > PL_regeol)
2441 if (ln > 1 && (OP(scan) == REF
2442 ? memNE(s, locinput, ln)
2444 ? ibcmp(s, locinput, ln)
2445 : ibcmp_locale(s, locinput, ln))))
2448 nextchr = UCHARAT(locinput);
2459 OP_4tree *oop = PL_op;
2460 COP *ocurcop = PL_curcop;
2461 SV **ocurpad = PL_curpad;
2465 PL_op = (OP_4tree*)PL_regdata->data[n];
2466 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2467 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2468 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2470 CALLRUNOPS(aTHX); /* Scalar context. */
2476 PL_curpad = ocurpad;
2477 PL_curcop = ocurcop;
2479 if (logical == 2) { /* Postponed subexpression. */
2481 MAGIC *mg = Null(MAGIC*);
2483 CHECKPOINT cp, lastcp;
2485 if(SvROK(ret) || SvRMAGICAL(ret)) {
2486 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2489 mg = mg_find(sv, 'r');
2492 re = (regexp *)mg->mg_obj;
2493 (void)ReREFCNT_inc(re);
2497 char *t = SvPV(ret, len);
2499 char *oprecomp = PL_regprecomp;
2500 I32 osize = PL_regsize;
2501 I32 onpar = PL_regnpar;
2504 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2506 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2507 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2508 PL_regprecomp = oprecomp;
2513 PerlIO_printf(Perl_debug_log,
2514 "Entering embedded `%s%.60s%s%s'\n",
2518 (strlen(re->precomp) > 60 ? "..." : ""))
2521 state.prev = PL_reg_call_cc;
2522 state.cc = PL_regcc;
2523 state.re = PL_reg_re;
2527 cp = regcppush(0); /* Save *all* the positions. */
2530 state.ss = PL_savestack_ix;
2531 *PL_reglastparen = 0;
2532 PL_reg_call_cc = &state;
2533 PL_reginput = locinput;
2535 /* XXXX This is too dramatic a measure... */
2538 if (regmatch(re->program + 1)) {
2539 /* Even though we succeeded, we need to restore
2540 global variables, since we may be wrapped inside
2541 SUSPEND, thus the match may be not finished yet. */
2543 /* XXXX Do this only if SUSPENDed? */
2544 PL_reg_call_cc = state.prev;
2545 PL_regcc = state.cc;
2546 PL_reg_re = state.re;
2547 cache_re(PL_reg_re);
2549 /* XXXX This is too dramatic a measure... */
2552 /* These are needed even if not SUSPEND. */
2558 REGCP_UNWIND(lastcp);
2560 PL_reg_call_cc = state.prev;
2561 PL_regcc = state.cc;
2562 PL_reg_re = state.re;
2563 cache_re(PL_reg_re);
2565 /* XXXX This is too dramatic a measure... */
2574 sv_setsv(save_scalar(PL_replgv), ret);
2578 n = ARG(scan); /* which paren pair */
2579 PL_reg_start_tmp[n] = locinput;
2584 n = ARG(scan); /* which paren pair */
2585 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2586 PL_regendp[n] = locinput - PL_bostr;
2587 if (n > *PL_reglastparen)
2588 *PL_reglastparen = n;
2591 n = ARG(scan); /* which paren pair */
2592 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2595 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2597 next = NEXTOPER(NEXTOPER(scan));
2599 next = scan + ARG(scan);
2600 if (OP(next) == IFTHEN) /* Fake one. */
2601 next = NEXTOPER(NEXTOPER(next));
2605 logical = scan->flags;
2607 /*******************************************************************
2608 PL_regcc contains infoblock about the innermost (...)* loop, and
2609 a pointer to the next outer infoblock.
2611 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2613 1) After matching X, regnode for CURLYX is processed;
2615 2) This regnode creates infoblock on the stack, and calls
2616 regmatch() recursively with the starting point at WHILEM node;
2618 3) Each hit of WHILEM node tries to match A and Z (in the order
2619 depending on the current iteration, min/max of {min,max} and
2620 greediness). The information about where are nodes for "A"
2621 and "Z" is read from the infoblock, as is info on how many times "A"
2622 was already matched, and greediness.
2624 4) After A matches, the same WHILEM node is hit again.
2626 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2627 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2628 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2629 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2630 of the external loop.
2632 Currently present infoblocks form a tree with a stem formed by PL_curcc
2633 and whatever it mentions via ->next, and additional attached trees
2634 corresponding to temporarily unset infoblocks as in "5" above.
2636 In the following picture infoblocks for outer loop of
2637 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2638 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2639 infoblocks are drawn below the "reset" infoblock.
2641 In fact in the picture below we do not show failed matches for Z and T
2642 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2643 more obvious *why* one needs to *temporary* unset infoblocks.]
2645 Matched REx position InfoBlocks Comment
2649 Y A)*?Z)*?T x <- O <- I
2650 YA )*?Z)*?T x <- O <- I
2651 YA A)*?Z)*?T x <- O <- I
2652 YAA )*?Z)*?T x <- O <- I
2653 YAA Z)*?T x <- O # Temporary unset I
2656 YAAZ Y(A)*?Z)*?T x <- O
2659 YAAZY (A)*?Z)*?T x <- O
2662 YAAZY A)*?Z)*?T x <- O <- I
2665 YAAZYA )*?Z)*?T x <- O <- I
2668 YAAZYA Z)*?T x <- O # Temporary unset I
2674 YAAZYAZ T x # Temporary unset O
2681 *******************************************************************/
2684 CHECKPOINT cp = PL_savestack_ix;
2685 /* No need to save/restore up to this paren */
2686 I32 parenfloor = scan->flags;
2688 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2690 cc.oldcc = PL_regcc;
2692 /* XXXX Probably it is better to teach regpush to support
2693 parenfloor > PL_regsize... */
2694 if (parenfloor > *PL_reglastparen)
2695 parenfloor = *PL_reglastparen; /* Pessimization... */
2696 cc.parenfloor = parenfloor;
2698 cc.min = ARG1(scan);
2699 cc.max = ARG2(scan);
2700 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2704 PL_reginput = locinput;
2705 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2707 PL_regcc = cc.oldcc;
2713 * This is really hard to understand, because after we match
2714 * what we're trying to match, we must make sure the rest of
2715 * the REx is going to match for sure, and to do that we have
2716 * to go back UP the parse tree by recursing ever deeper. And
2717 * if it fails, we have to reset our parent's current state
2718 * that we can try again after backing off.
2721 CHECKPOINT cp, lastcp;
2722 CURCUR* cc = PL_regcc;
2723 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2725 n = cc->cur + 1; /* how many we know we matched */
2726 PL_reginput = locinput;
2729 PerlIO_printf(Perl_debug_log,
2730 "%*s %ld out of %ld..%ld cc=%lx\n",
2731 REPORT_CODE_OFF+PL_regindent*2, "",
2732 (long)n, (long)cc->min,
2733 (long)cc->max, (long)cc)
2736 /* If degenerate scan matches "", assume scan done. */
2738 if (locinput == cc->lastloc && n >= cc->min) {
2739 PL_regcc = cc->oldcc;
2743 PerlIO_printf(Perl_debug_log,
2744 "%*s empty match detected, try continuation...\n",
2745 REPORT_CODE_OFF+PL_regindent*2, "")
2747 if (regmatch(cc->next))
2755 /* First just match a string of min scans. */
2759 cc->lastloc = locinput;
2760 if (regmatch(cc->scan))
2763 cc->lastloc = lastloc;
2768 /* Check whether we already were at this position.
2769 Postpone detection until we know the match is not
2770 *that* much linear. */
2771 if (!PL_reg_maxiter) {
2772 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2773 PL_reg_leftiter = PL_reg_maxiter;
2775 if (PL_reg_leftiter-- == 0) {
2776 I32 size = (PL_reg_maxiter + 7)/8;
2777 if (PL_reg_poscache) {
2778 if (PL_reg_poscache_size < size) {
2779 Renew(PL_reg_poscache, size, char);
2780 PL_reg_poscache_size = size;
2782 Zero(PL_reg_poscache, size, char);
2785 PL_reg_poscache_size = size;
2786 Newz(29, PL_reg_poscache, size, char);
2789 PerlIO_printf(Perl_debug_log,
2790 "%sDetected a super-linear match, switching on caching%s...\n",
2791 PL_colors[4], PL_colors[5])
2794 if (PL_reg_leftiter < 0) {
2795 I32 o = locinput - PL_bostr, b;
2797 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2800 if (PL_reg_poscache[o] & (1<<b)) {
2802 PerlIO_printf(Perl_debug_log,
2803 "%*s already tried at this position...\n",
2804 REPORT_CODE_OFF+PL_regindent*2, "")
2808 PL_reg_poscache[o] |= (1<<b);
2812 /* Prefer next over scan for minimal matching. */
2815 PL_regcc = cc->oldcc;
2818 cp = regcppush(cc->parenfloor);
2820 if (regmatch(cc->next)) {
2822 sayYES; /* All done. */
2824 REGCP_UNWIND(lastcp);
2830 if (n >= cc->max) { /* Maximum greed exceeded? */
2831 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2832 && !(PL_reg_flags & RF_warned)) {
2833 PL_reg_flags |= RF_warned;
2834 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2835 "Complex regular subexpression recursion",
2842 PerlIO_printf(Perl_debug_log,
2843 "%*s trying longer...\n",
2844 REPORT_CODE_OFF+PL_regindent*2, "")
2846 /* Try scanning more and see if it helps. */
2847 PL_reginput = locinput;
2849 cc->lastloc = locinput;
2850 cp = regcppush(cc->parenfloor);
2852 if (regmatch(cc->scan)) {
2856 REGCP_UNWIND(lastcp);
2859 cc->lastloc = lastloc;
2863 /* Prefer scan over next for maximal matching. */
2865 if (n < cc->max) { /* More greed allowed? */
2866 cp = regcppush(cc->parenfloor);
2868 cc->lastloc = locinput;
2870 if (regmatch(cc->scan)) {
2874 REGCP_UNWIND(lastcp);
2875 regcppop(); /* Restore some previous $<digit>s? */
2876 PL_reginput = locinput;
2878 PerlIO_printf(Perl_debug_log,
2879 "%*s failed, try continuation...\n",
2880 REPORT_CODE_OFF+PL_regindent*2, "")
2883 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2884 && !(PL_reg_flags & RF_warned)) {
2885 PL_reg_flags |= RF_warned;
2886 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2887 "Complex regular subexpression recursion",
2891 /* Failed deeper matches of scan, so see if this one works. */
2892 PL_regcc = cc->oldcc;
2895 if (regmatch(cc->next))
2901 cc->lastloc = lastloc;
2906 next = scan + ARG(scan);
2909 inner = NEXTOPER(NEXTOPER(scan));
2912 inner = NEXTOPER(scan);
2917 if (OP(next) != c1) /* No choice. */
2918 next = inner; /* Avoid recursion. */
2920 I32 lastparen = *PL_reglastparen;
2922 re_unwind_branch_t *uw;
2924 /* Put unwinding data on stack */
2925 unwind1 = SSNEWt(1,re_unwind_branch_t);
2926 uw = SSPTRt(unwind1,re_unwind_branch_t);
2929 uw->type = ((c1 == BRANCH)
2931 : RE_UNWIND_BRANCHJ);
2932 uw->lastparen = lastparen;
2934 uw->locinput = locinput;
2935 uw->nextchr = nextchr;
2937 uw->regindent = ++PL_regindent;
2940 REGCP_SET(uw->lastcp);
2942 /* Now go into the first branch */
2955 /* We suppose that the next guy does not need
2956 backtracking: in particular, it is of constant length,
2957 and has no parenths to influence future backrefs. */
2958 ln = ARG1(scan); /* min to match */
2959 n = ARG2(scan); /* max to match */
2960 paren = scan->flags;
2962 if (paren > PL_regsize)
2964 if (paren > *PL_reglastparen)
2965 *PL_reglastparen = paren;
2967 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2969 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2970 PL_reginput = locinput;
2973 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2975 if (ln && l == 0 && n >= ln
2976 /* In fact, this is tricky. If paren, then the
2977 fact that we did/didnot match may influence
2978 future execution. */
2979 && !(paren && ln == 0))
2981 locinput = PL_reginput;
2982 if (PL_regkind[(U8)OP(next)] == EXACT) {
2983 c1 = (U8)*STRING(next);
2984 if (OP(next) == EXACTF)
2986 else if (OP(next) == EXACTFL)
2987 c2 = PL_fold_locale[c1];
2994 /* This may be improved if l == 0. */
2995 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2996 /* If it could work, try it. */
2998 UCHARAT(PL_reginput) == c1 ||
2999 UCHARAT(PL_reginput) == c2)
3003 PL_regstartp[paren] =
3004 HOPc(PL_reginput, -l) - PL_bostr;
3005 PL_regendp[paren] = PL_reginput - PL_bostr;
3008 PL_regendp[paren] = -1;
3012 REGCP_UNWIND(lastcp);
3014 /* Couldn't or didn't -- move forward. */
3015 PL_reginput = locinput;
3016 if (regrepeat_hard(scan, 1, &l)) {
3018 locinput = PL_reginput;
3025 n = regrepeat_hard(scan, n, &l);
3026 if (n != 0 && l == 0
3027 /* In fact, this is tricky. If paren, then the
3028 fact that we did/didnot match may influence
3029 future execution. */
3030 && !(paren && ln == 0))
3032 locinput = PL_reginput;
3034 PerlIO_printf(Perl_debug_log,
3035 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3036 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3040 if (PL_regkind[(U8)OP(next)] == EXACT) {
3041 c1 = (U8)*STRING(next);
3042 if (OP(next) == EXACTF)
3044 else if (OP(next) == EXACTFL)
3045 c2 = PL_fold_locale[c1];
3054 /* If it could work, try it. */
3056 UCHARAT(PL_reginput) == c1 ||
3057 UCHARAT(PL_reginput) == c2)
3060 PerlIO_printf(Perl_debug_log,
3061 "%*s trying tail with n=%"IVdf"...\n",
3062 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3066 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3067 PL_regendp[paren] = PL_reginput - PL_bostr;
3070 PL_regendp[paren] = -1;
3074 REGCP_UNWIND(lastcp);
3076 /* Couldn't or didn't -- back up. */
3078 locinput = HOPc(locinput, -l);
3079 PL_reginput = locinput;
3086 paren = scan->flags; /* Which paren to set */
3087 if (paren > PL_regsize)
3089 if (paren > *PL_reglastparen)
3090 *PL_reglastparen = paren;
3091 ln = ARG1(scan); /* min to match */
3092 n = ARG2(scan); /* max to match */
3093 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3097 ln = ARG1(scan); /* min to match */
3098 n = ARG2(scan); /* max to match */
3099 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3104 scan = NEXTOPER(scan);
3110 scan = NEXTOPER(scan);
3114 * Lookahead to avoid useless match attempts
3115 * when we know what character comes next.
3117 if (PL_regkind[(U8)OP(next)] == EXACT) {
3118 U8 *s = (U8*)STRING(next);
3121 if (OP(next) == EXACTF)
3123 else if (OP(next) == EXACTFL)
3124 c2 = PL_fold_locale[c1];
3127 if (OP(next) == EXACTF) {
3128 c1 = to_utf8_lower(s);
3129 c2 = to_utf8_upper(s);
3132 c2 = c1 = utf8_to_uv_simple(s, NULL);
3138 PL_reginput = locinput;
3142 if (ln && regrepeat(scan, ln) < ln)
3144 locinput = PL_reginput;
3147 char *e; /* Should not check after this */
3148 char *old = locinput;
3150 if (n == REG_INFTY) {
3153 while (UTF8_IS_CONTINUATION(*(U8*)e))
3159 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3163 e = locinput + n - ln;
3169 /* Find place 'next' could work */
3172 while (locinput <= e && *locinput != c1)
3175 while (locinput <= e
3180 count = locinput - old;
3187 utf8_to_uv_simple((U8*)locinput, &len) != c1;
3192 for (count = 0; locinput <= e; count++) {
3193 UV c = utf8_to_uv_simple((U8*)locinput, &len);
3194 if (c == c1 || c == c2)
3202 /* PL_reginput == old now */
3203 if (locinput != old) {
3204 ln = 1; /* Did some */
3205 if (regrepeat(scan, count) < count)
3208 /* PL_reginput == locinput now */
3209 TRYPAREN(paren, ln, locinput);
3210 PL_reginput = locinput; /* Could be reset... */
3211 REGCP_UNWIND(lastcp);
3212 /* Couldn't or didn't -- move forward. */
3215 locinput += UTF8SKIP(locinput);
3221 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3225 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3227 c = UCHARAT(PL_reginput);
3229 /* If it could work, try it. */
3230 if (c1 == -1000 || c == c1 || c == c2)
3232 TRYPAREN(paren, n, PL_reginput);
3233 REGCP_UNWIND(lastcp);
3235 /* Couldn't or didn't -- move forward. */
3236 PL_reginput = locinput;
3237 if (regrepeat(scan, 1)) {
3239 locinput = PL_reginput;
3247 n = regrepeat(scan, n);
3248 locinput = PL_reginput;
3249 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3250 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3251 ln = n; /* why back off? */
3252 /* ...because $ and \Z can match before *and* after
3253 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3254 We should back off by one in this case. */
3255 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3264 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3266 c = UCHARAT(PL_reginput);
3268 /* If it could work, try it. */
3269 if (c1 == -1000 || c == c1 || c == c2)
3271 TRYPAREN(paren, n, PL_reginput);
3272 REGCP_UNWIND(lastcp);
3274 /* Couldn't or didn't -- back up. */
3276 PL_reginput = locinput = HOPc(locinput, -1);
3284 c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
3286 c = UCHARAT(PL_reginput);
3288 /* If it could work, try it. */
3289 if (c1 == -1000 || c == c1 || c == c2)
3291 TRYPAREN(paren, n, PL_reginput);
3292 REGCP_UNWIND(lastcp);
3294 /* Couldn't or didn't -- back up. */
3296 PL_reginput = locinput = HOPc(locinput, -1);
3303 if (PL_reg_call_cc) {
3304 re_cc_state *cur_call_cc = PL_reg_call_cc;
3305 CURCUR *cctmp = PL_regcc;
3306 regexp *re = PL_reg_re;
3307 CHECKPOINT cp, lastcp;
3309 cp = regcppush(0); /* Save *all* the positions. */
3311 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3313 PL_reginput = locinput; /* Make position available to
3315 cache_re(PL_reg_call_cc->re);
3316 PL_regcc = PL_reg_call_cc->cc;
3317 PL_reg_call_cc = PL_reg_call_cc->prev;
3318 if (regmatch(cur_call_cc->node)) {
3319 PL_reg_call_cc = cur_call_cc;
3323 REGCP_UNWIND(lastcp);
3325 PL_reg_call_cc = cur_call_cc;
3331 PerlIO_printf(Perl_debug_log,
3332 "%*s continuation failed...\n",
3333 REPORT_CODE_OFF+PL_regindent*2, "")
3337 if (locinput < PL_regtill) {
3338 DEBUG_r(PerlIO_printf(Perl_debug_log,
3339 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3341 (long)(locinput - PL_reg_starttry),
3342 (long)(PL_regtill - PL_reg_starttry),
3344 sayNO_FINAL; /* Cannot match: too short. */
3346 PL_reginput = locinput; /* put where regtry can find it */
3347 sayYES_FINAL; /* Success! */
3349 PL_reginput = locinput; /* put where regtry can find it */
3350 sayYES_LOUD; /* Success! */
3353 PL_reginput = locinput;
3358 if (UTF) { /* XXXX This is absolutely
3359 broken, we read before
3361 s = HOPMAYBEc(locinput, -scan->flags);
3367 if (locinput < PL_bostr + scan->flags)
3369 PL_reginput = locinput - scan->flags;
3374 PL_reginput = locinput;
3379 if (UTF) { /* XXXX This is absolutely
3380 broken, we read before
3382 s = HOPMAYBEc(locinput, -scan->flags);
3383 if (!s || s < PL_bostr)
3388 if (locinput < PL_bostr + scan->flags)
3390 PL_reginput = locinput - scan->flags;
3395 PL_reginput = locinput;
3398 inner = NEXTOPER(NEXTOPER(scan));
3399 if (regmatch(inner) != n) {
3414 if (OP(scan) == SUSPEND) {
3415 locinput = PL_reginput;
3416 nextchr = UCHARAT(locinput);
3421 next = scan + ARG(scan);
3426 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3427 PTR2UV(scan), OP(scan));
3428 Perl_croak(aTHX_ "regexp memory corruption");
3435 * We get here only if there's trouble -- normally "case END" is
3436 * the terminating point.
3438 Perl_croak(aTHX_ "corrupted regexp pointers");
3444 PerlIO_printf(Perl_debug_log,
3445 "%*s %scould match...%s\n",
3446 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3450 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3451 PL_colors[4],PL_colors[5]));
3457 #if 0 /* Breaks $^R */
3465 PerlIO_printf(Perl_debug_log,
3466 "%*s %sfailed...%s\n",
3467 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3473 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3476 case RE_UNWIND_BRANCH:
3477 case RE_UNWIND_BRANCHJ:
3479 re_unwind_branch_t *uwb = &(uw->branch);
3480 I32 lastparen = uwb->lastparen;
3482 REGCP_UNWIND(uwb->lastcp);
3483 for (n = *PL_reglastparen; n > lastparen; n--)
3485 *PL_reglastparen = n;
3486 scan = next = uwb->next;
3488 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3489 ? BRANCH : BRANCHJ) ) { /* Failure */
3496 /* Have more choice yet. Reuse the same uwb. */
3498 if ((n = (uwb->type == RE_UNWIND_BRANCH
3499 ? NEXT_OFF(next) : ARG(next))))
3502 next = NULL; /* XXXX Needn't unwinding in this case... */
3504 next = NEXTOPER(scan);
3505 if (uwb->type == RE_UNWIND_BRANCHJ)
3506 next = NEXTOPER(next);
3507 locinput = uwb->locinput;
3508 nextchr = uwb->nextchr;
3510 PL_regindent = uwb->regindent;
3517 Perl_croak(aTHX_ "regexp unwind memory corruption");
3528 - regrepeat - repeatedly match something simple, report how many
3531 * [This routine now assumes that it will only match on things of length 1.
3532 * That was true before, but now we assume scan - reginput is the count,
3533 * rather than incrementing count on every character. [Er, except utf8.]]
3536 S_regrepeat(pTHX_ regnode *p, I32 max)
3538 register char *scan;
3540 register char *loceol = PL_regeol;
3541 register I32 hardcount = 0;
3542 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3545 if (max != REG_INFTY && max < loceol - scan)
3546 loceol = scan + max;
3551 while (scan < loceol && hardcount < max && *scan != '\n') {
3552 scan += UTF8SKIP(scan);
3556 while (scan < loceol && *scan != '\n')
3563 while (hardcount < max && scan < loceol) {
3564 scan += UTF8SKIP(scan);
3571 case EXACT: /* length of string is 1 */
3573 while (scan < loceol && UCHARAT(scan) == c)
3576 case EXACTF: /* length of string is 1 */
3578 while (scan < loceol &&
3579 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3582 case EXACTFL: /* length of string is 1 */
3583 PL_reg_flags |= RF_tainted;
3585 while (scan < loceol &&
3586 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3592 while (hardcount < max && scan < loceol &&
3593 reginclass(p, (U8*)scan, do_utf8)) {
3594 scan += UTF8SKIP(scan);
3598 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3605 while (hardcount < max && scan < loceol &&
3606 swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3607 scan += UTF8SKIP(scan);
3611 while (scan < loceol && isALNUM(*scan))
3616 PL_reg_flags |= RF_tainted;
3619 while (hardcount < max && scan < loceol &&
3620 isALNUM_LC_utf8((U8*)scan)) {
3621 scan += UTF8SKIP(scan);
3625 while (scan < loceol && isALNUM_LC(*scan))
3632 while (hardcount < max && scan < loceol &&
3633 !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3634 scan += UTF8SKIP(scan);
3638 while (scan < loceol && !isALNUM(*scan))
3643 PL_reg_flags |= RF_tainted;
3646 while (hardcount < max && scan < loceol &&
3647 !isALNUM_LC_utf8((U8*)scan)) {
3648 scan += UTF8SKIP(scan);
3652 while (scan < loceol && !isALNUM_LC(*scan))
3659 while (hardcount < max && scan < loceol &&
3660 (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3661 scan += UTF8SKIP(scan);
3665 while (scan < loceol && isSPACE(*scan))
3670 PL_reg_flags |= RF_tainted;
3673 while (hardcount < max && scan < loceol &&
3674 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3675 scan += UTF8SKIP(scan);
3679 while (scan < loceol && isSPACE_LC(*scan))
3686 while (hardcount < max && scan < loceol &&
3687 !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3688 scan += UTF8SKIP(scan);
3692 while (scan < loceol && !isSPACE(*scan))
3697 PL_reg_flags |= RF_tainted;
3700 while (hardcount < max && scan < loceol &&
3701 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3702 scan += UTF8SKIP(scan);
3706 while (scan < loceol && !isSPACE_LC(*scan))
3713 while (hardcount < max && scan < loceol &&
3714 swash_fetch(PL_utf8_digit,(U8*)scan)) {
3715 scan += UTF8SKIP(scan);
3719 while (scan < loceol && isDIGIT(*scan))
3726 while (hardcount < max && scan < loceol &&
3727 !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3728 scan += UTF8SKIP(scan);
3732 while (scan < loceol && !isDIGIT(*scan))
3736 default: /* Called on something of 0 width. */
3737 break; /* So match right here or not at all. */
3743 c = scan - PL_reginput;
3748 SV *prop = sv_newmortal();
3751 PerlIO_printf(Perl_debug_log,
3752 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3753 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3760 - regrepeat_hard - repeatedly match something, report total lenth and length
3762 * The repeater is supposed to have constant length.
3766 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3768 register char *scan;
3769 register char *start;
3770 register char *loceol = PL_regeol;
3772 I32 count = 0, res = 1;
3777 start = PL_reginput;
3778 if (DO_UTF8(PL_reg_sv)) {
3779 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3782 while (start < PL_reginput) {
3784 start += UTF8SKIP(start);
3795 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3797 *lp = l = PL_reginput - start;
3798 if (max != REG_INFTY && l*max < loceol - scan)
3799 loceol = scan + l*max;
3812 - regclass_swash - prepare the utf8 swash
3816 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3821 if (PL_regdata && PL_regdata->count) {
3824 if (PL_regdata->what[n] == 's') {
3825 SV *rv = (SV*)PL_regdata->data[n];
3826 AV *av = (AV*)SvRV((SV*)rv);
3829 si = *av_fetch(av, 0, FALSE);
3830 a = av_fetch(av, 1, FALSE);
3834 else if (si && doinit) {
3835 sw = swash_init("utf8", "", si, 1, 0);
3836 (void)av_store(av, 1, sw);
3848 - reginclass - determine if a character falls into a character class
3852 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3854 char flags = ANYOF_FLAGS(n);
3860 c = utf8_to_uv_simple(p, &len);
3864 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3865 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3866 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3869 if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256)
3872 SV *sw = regclass_swash(n, TRUE, 0);
3875 if (swash_fetch(sw, p))
3877 else if (flags & ANYOF_FOLD) {
3878 U8 tmpbuf[UTF8_MAXLEN+1];
3880 if (flags & ANYOF_LOCALE) {
3881 PL_reg_flags |= RF_tainted;
3882 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3885 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3886 if (swash_fetch(sw, tmpbuf))
3892 if (!match && c < 256) {
3893 if (ANYOF_BITMAP_TEST(n, c))
3895 else if (flags & ANYOF_FOLD) {
3898 if (flags & ANYOF_LOCALE) {
3899 PL_reg_flags |= RF_tainted;
3900 f = PL_fold_locale[c];
3904 if (f != c && ANYOF_BITMAP_TEST(n, f))
3908 if (!match && (flags & ANYOF_CLASS)) {
3909 PL_reg_flags |= RF_tainted;
3911 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3912 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3913 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3914 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3915 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3916 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3917 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3918 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3919 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3920 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3921 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3922 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3923 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3924 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3925 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3926 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3931 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3932 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3933 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3937 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3938 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3939 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3940 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3941 ) /* How's that for a conditional? */
3948 return (flags & ANYOF_INVERT) ? !match : match;
3952 S_reghop(pTHX_ U8 *s, I32 off)
3954 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3958 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3961 while (off-- && s < lim) {
3962 /* XXX could check well-formedness here */
3970 if (UTF8_IS_CONTINUED(*s)) {
3971 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3974 /* XXX could check well-formedness here */
3982 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3984 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3988 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
3991 while (off-- && s < lim) {
3992 /* XXX could check well-formedness here */
4002 if (UTF8_IS_CONTINUED(*s)) {
4003 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4006 /* XXX could check well-formedness here */
4022 restore_pos(pTHXo_ void *arg)
4024 if (PL_reg_eval_set) {
4025 if (PL_reg_oldsaved) {
4026 PL_reg_re->subbeg = PL_reg_oldsaved;
4027 PL_reg_re->sublen = PL_reg_oldsavedlen;
4028 RX_MATCH_COPIED_on(PL_reg_re);
4030 PL_reg_magic->mg_len = PL_reg_oldpos;
4031 PL_reg_eval_set = 0;
4032 PL_curpm = PL_reg_oldcurpm;