5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
92 #define PERL_IN_REGEXEC_C
97 #define RF_tainted 1 /* tainted information used? */
98 #define RF_warned 2 /* warned about big count? */
99 #define RF_evaled 4 /* Did an EVAL with setting? */
100 #define RF_utf8 8 /* String contains multibyte chars? */
101 #define RF_false 16 /* odd number of nested negatives */
103 #define UTF ((PL_reg_flags & RF_utf8) != 0)
105 #define RS_init 1 /* eval environment created */
106 #define RS_set 2 /* replsv value is set */
109 #define STATIC static
112 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
118 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
119 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
121 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
122 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
123 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
124 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
125 #define HOPc(pos,off) ((char*)HOP(pos,off))
126 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
128 #define HOPBACK(pos, off) ( \
129 (PL_reg_match_utf8) \
130 ? reghopmaybe((U8*)pos, -off) \
131 : (pos - off >= PL_bostr) \
135 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
137 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
138 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
139 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
140 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
141 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
142 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
144 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((const U8*)b); LEAVE; } } STMT_END
146 /* for use after a quantifier and before an EXACT-like node -- japhy */
147 #define JUMPABLE(rn) ( \
148 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
149 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
150 OP(rn) == PLUS || OP(rn) == MINMOD || \
151 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
154 #define HAS_TEXT(rn) ( \
155 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159 Search for mandatory following text node; for lookahead, the text must
160 follow but for lookbehind (rn->flags != 0) we skip to the next step.
162 #define FIND_NEXT_IMPT(rn) STMT_START { \
163 while (JUMPABLE(rn)) \
164 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
165 rn = NEXTOPER(NEXTOPER(rn)); \
166 else if (OP(rn) == PLUS) \
168 else if (OP(rn) == IFMATCH) \
169 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
170 else rn += NEXT_OFF(rn); \
173 static void restore_pos(pTHX_ void *arg);
176 S_regcppush(pTHX_ I32 parenfloor)
178 int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
183 if (paren_elems_to_push < 0)
184 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
186 #define REGCP_OTHER_ELEMS 6
187 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
188 for (p = PL_regsize; p > parenfloor; p--) {
189 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
190 SSPUSHINT(PL_regendp[p]);
191 SSPUSHINT(PL_regstartp[p]);
192 SSPUSHPTR(PL_reg_start_tmp[p]);
195 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
196 SSPUSHINT(PL_regsize);
197 SSPUSHINT(*PL_reglastparen);
198 SSPUSHINT(*PL_reglastcloseparen);
199 SSPUSHPTR(PL_reginput);
200 #define REGCP_FRAME_ELEMS 2
201 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
202 * are needed for the regexp context stack bookkeeping. */
203 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
204 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
209 /* These are needed since we do not localize EVAL nodes: */
210 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
211 " Setting an EVAL scope, savestack=%"IVdf"\n", \
212 (IV)PL_savestack_ix)); cp = PL_savestack_ix
214 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
215 PerlIO_printf(Perl_debug_log, \
216 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
217 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
227 GET_RE_DEBUG_FLAGS_DECL;
229 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
231 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
232 i = SSPOPINT; /* Parentheses elements to pop. */
233 input = (char *) SSPOPPTR;
234 *PL_reglastcloseparen = SSPOPINT;
235 *PL_reglastparen = SSPOPINT;
236 PL_regsize = SSPOPINT;
238 /* Now restore the parentheses context. */
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
241 paren = (U32)SSPOPINT;
242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
243 PL_regstartp[paren] = SSPOPINT;
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
248 PerlIO_printf(Perl_debug_log,
249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
250 (UV)paren, (IV)PL_regstartp[paren],
251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
252 (IV)PL_regendp[paren],
253 (paren > *PL_reglastparen ? "(no)" : ""));
257 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
258 PerlIO_printf(Perl_debug_log,
259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
260 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
275 if ((I32)paren > PL_regsize)
276 PL_regstartp[paren] = -1;
277 PL_regendp[paren] = -1;
284 S_regcp_set_to(pTHX_ I32 ss)
286 I32 tmp = PL_savestack_ix;
288 PL_savestack_ix = ss;
290 PL_savestack_ix = tmp;
294 typedef struct re_cc_state
298 struct re_cc_state *prev;
303 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
305 #define TRYPAREN(paren, n, input) { \
308 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
309 PL_regendp[paren] = input - PL_bostr; \
312 PL_regendp[paren] = -1; \
314 if (regmatch(next)) \
317 PL_regendp[paren] = -1; \
322 * pregexec and friends
326 - pregexec - match a regexp against a string
329 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
330 char *strbeg, I32 minend, SV *screamer, U32 nosave)
331 /* strend: pointer to null at end of string */
332 /* strbeg: real beginning of string */
333 /* minend: end of match must be >=minend after stringarg. */
334 /* nosave: For optimizations. */
337 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
338 nosave ? 0 : REXEC_COPY_STR);
342 S_cache_re(pTHX_ regexp *prog)
344 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
346 PL_regprogram = prog->program;
348 PL_regnpar = prog->nparens;
349 PL_regdata = prog->data;
354 * Need to implement the following flags for reg_anch:
356 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
358 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
359 * INTUIT_AUTORITATIVE_ML
360 * INTUIT_ONCE_NOML - Intuit can match in one location only.
363 * Another flag for this function: SECOND_TIME (so that float substrs
364 * with giant delta may be not rechecked).
367 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
369 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
370 Otherwise, only SvCUR(sv) is used to get strbeg. */
372 /* XXXX We assume that strpos is strbeg unless sv. */
374 /* XXXX Some places assume that there is a fixed substring.
375 An update may be needed if optimizer marks as "INTUITable"
376 RExen without fixed substrings. Similarly, it is assumed that
377 lengths of all the strings are no more than minlen, thus they
378 cannot come from lookahead.
379 (Or minlen should take into account lookahead.) */
381 /* A failure to find a constant substring means that there is no need to make
382 an expensive call to REx engine, thus we celebrate a failure. Similarly,
383 finding a substring too deep into the string means that less calls to
384 regtry() should be needed.
386 REx compiler's optimizer found 4 possible hints:
387 a) Anchored substring;
389 c) Whether we are anchored (beginning-of-line or \G);
390 d) First node (of those at offset 0) which may distingush positions;
391 We use a)b)d) and multiline-part of c), and try to find a position in the
392 string which does not contradict any of them.
395 /* Most of decisions we do here should have been done at compile time.
396 The nodes of the REx which we used for the search should have been
397 deleted from the finite automaton. */
400 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
401 char *strend, U32 flags, re_scream_pos_data *data)
403 register I32 start_shift = 0;
404 /* Should be nonnegative! */
405 register I32 end_shift = 0;
410 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
412 register char *other_last = Nullch; /* other substr checked before this */
413 char *check_at = Nullch; /* check substr found at this pos */
414 I32 multiline = prog->reganch & PMf_MULTILINE;
416 char *i_strpos = strpos;
417 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
420 GET_RE_DEBUG_FLAGS_DECL;
422 RX_MATCH_UTF8_set(prog,do_utf8);
424 if (prog->reganch & ROPT_UTF8) {
425 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
426 "UTF-8 regex...\n"));
427 PL_reg_flags |= RF_utf8;
431 char *s = PL_reg_match_utf8 ?
432 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
434 int len = PL_reg_match_utf8 ?
435 strlen(s) : strend - strpos;
438 if (PL_reg_match_utf8)
439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
440 "UTF-8 target...\n"));
441 PerlIO_printf(Perl_debug_log,
442 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
443 PL_colors[4],PL_colors[5],PL_colors[0],
446 (strlen(prog->precomp) > 60 ? "..." : ""),
448 (int)(len > 60 ? 60 : len),
450 (len > 60 ? "..." : "")
454 /* CHR_DIST() would be more correct here but it makes things slow. */
455 if (prog->minlen > strend - strpos) {
456 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
457 "String too short... [re_intuit_start]\n"));
460 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
463 if (!prog->check_utf8 && prog->check_substr)
464 to_utf8_substr(prog);
465 check = prog->check_utf8;
467 if (!prog->check_substr && prog->check_utf8)
468 to_byte_substr(prog);
469 check = prog->check_substr;
471 if (check == &PL_sv_undef) {
472 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
473 "Non-utf string cannot match utf check string\n"));
476 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
477 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
478 || ( (prog->reganch & ROPT_ANCH_BOL)
479 && !multiline ) ); /* Check after \n? */
482 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
483 | ROPT_IMPLICIT)) /* not a real BOL */
484 /* SvCUR is not set on references: SvRV and SvPVX overlap */
486 && (strpos != strbeg)) {
487 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
490 if (prog->check_offset_min == prog->check_offset_max &&
491 !(prog->reganch & ROPT_CANY_SEEN)) {
492 /* Substring at constant offset from beg-of-str... */
495 s = HOP3c(strpos, prog->check_offset_min, strend);
497 slen = SvCUR(check); /* >= 1 */
499 if ( strend - s > slen || strend - s < slen - 1
500 || (strend - s == slen && strend[-1] != '\n')) {
501 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
504 /* Now should match s[0..slen-2] */
506 if (slen && (*SvPVX(check) != *s
508 && memNE(SvPVX(check), s, slen)))) {
510 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
514 else if (*SvPVX(check) != *s
515 || ((slen = SvCUR(check)) > 1
516 && memNE(SvPVX(check), s, slen)))
518 goto success_at_start;
521 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
523 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
524 end_shift = prog->minlen - start_shift -
525 CHR_SVLEN(check) + (SvTAIL(check) != 0);
527 I32 end = prog->check_offset_max + CHR_SVLEN(check)
528 - (SvTAIL(check) != 0);
529 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
531 if (end_shift < eshift)
535 else { /* Can match at random position */
538 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
539 /* Should be nonnegative! */
540 end_shift = prog->minlen - start_shift -
541 CHR_SVLEN(check) + (SvTAIL(check) != 0);
544 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
546 Perl_croak(aTHX_ "panic: end_shift");
550 /* Find a possible match in the region s..strend by looking for
551 the "check" substring in the region corrected by start/end_shift. */
552 if (flags & REXEC_SCREAM) {
553 I32 p = -1; /* Internal iterator of scream. */
554 I32 *pp = data ? data->scream_pos : &p;
556 if (PL_screamfirst[BmRARE(check)] >= 0
557 || ( BmRARE(check) == '\n'
558 && (BmPREVIOUS(check) == SvCUR(check) - 1)
560 s = screaminstr(sv, check,
561 start_shift + (s - strbeg), end_shift, pp, 0);
564 /* we may be pointing at the wrong string */
565 if (s && RX_MATCH_COPIED(prog))
566 s = strbeg + (s - SvPVX(sv));
568 *data->scream_olds = s;
570 else if (prog->reganch & ROPT_CANY_SEEN)
571 s = fbm_instr((U8*)(s + start_shift),
572 (U8*)(strend - end_shift),
573 check, multiline ? FBMrf_MULTILINE : 0);
575 s = fbm_instr(HOP3(s, start_shift, strend),
576 HOP3(strend, -end_shift, strbeg),
577 check, multiline ? FBMrf_MULTILINE : 0);
579 /* Update the count-of-usability, remove useless subpatterns,
582 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
583 (s ? "Found" : "Did not find"),
584 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
586 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
588 PL_colors[1], (SvTAIL(check) ? "$" : ""),
589 (s ? " at offset " : "...\n") ) );
596 /* Finish the diagnostic message */
597 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
602 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
608 /* Take into account the "other" substring. */
609 /* XXXX May be hopelessly wrong for UTF... */
612 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
615 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
619 t = s - prog->check_offset_max;
620 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
622 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
627 t = HOP3c(t, prog->anchored_offset, strend);
628 if (t < other_last) /* These positions already checked */
630 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
633 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
634 /* On end-of-str: see comment below. */
635 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
636 if (must == &PL_sv_undef) {
638 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
643 HOP3(HOP3(last1, prog->anchored_offset, strend)
644 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
646 multiline ? FBMrf_MULTILINE : 0
648 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
649 "%s anchored substr `%s%.*s%s'%s",
650 (s ? "Found" : "Contradicts"),
653 - (SvTAIL(must)!=0)),
655 PL_colors[1], (SvTAIL(must) ? "$" : "")));
657 if (last1 >= last2) {
658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
659 ", giving up...\n"));
662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
663 ", trying floating at offset %ld...\n",
664 (long)(HOP3c(s1, 1, strend) - i_strpos)));
665 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
666 s = HOP3c(last, 1, strend);
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
671 (long)(s - i_strpos)));
672 t = HOP3c(s, -prog->anchored_offset, strbeg);
673 other_last = HOP3c(s, 1, strend);
681 else { /* Take into account the floating substring. */
686 t = HOP3c(s, -start_shift, strbeg);
688 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
689 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
690 last = HOP3c(t, prog->float_max_offset, strend);
691 s = HOP3c(t, prog->float_min_offset, strend);
694 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
695 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
696 /* fbm_instr() takes into account exact value of end-of-str
697 if the check is SvTAIL(ed). Since false positives are OK,
698 and end-of-str is not later than strend we are OK. */
699 if (must == &PL_sv_undef) {
701 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
704 s = fbm_instr((unsigned char*)s,
705 (unsigned char*)last + SvCUR(must)
707 must, multiline ? FBMrf_MULTILINE : 0);
708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
709 (s ? "Found" : "Contradicts"),
711 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
713 PL_colors[1], (SvTAIL(must) ? "$" : "")));
716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
717 ", giving up...\n"));
720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
721 ", trying anchored starting at offset %ld...\n",
722 (long)(s1 + 1 - i_strpos)));
724 s = HOP3c(t, 1, strend);
728 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
729 (long)(s - i_strpos)));
730 other_last = s; /* Fix this later. --Hugo */
739 t = s - prog->check_offset_max;
740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
742 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
744 /* Fixed substring is found far enough so that the match
745 cannot start at strpos. */
747 if (ml_anch && t[-1] != '\n') {
748 /* Eventually fbm_*() should handle this, but often
749 anchored_offset is not 0, so this check will not be wasted. */
750 /* XXXX In the code below we prefer to look for "^" even in
751 presence of anchored substrings. And we search even
752 beyond the found float position. These pessimizations
753 are historical artefacts only. */
755 while (t < strend - prog->minlen) {
757 if (t < check_at - prog->check_offset_min) {
758 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
759 /* Since we moved from the found position,
760 we definitely contradict the found anchored
761 substr. Due to the above check we do not
762 contradict "check" substr.
763 Thus we can arrive here only if check substr
764 is float. Redo checking for "other"=="fixed".
767 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
768 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
769 goto do_other_anchored;
771 /* We don't contradict the found floating substring. */
772 /* XXXX Why not check for STCLASS? */
774 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
775 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
778 /* Position contradicts check-string */
779 /* XXXX probably better to look for check-string
780 than for "\n", so one should lower the limit for t? */
781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
782 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
783 other_last = strpos = s = t + 1;
788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
789 PL_colors[0],PL_colors[1]));
793 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
794 PL_colors[0],PL_colors[1]));
798 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
801 /* The found string does not prohibit matching at strpos,
802 - no optimization of calling REx engine can be performed,
803 unless it was an MBOL and we are not after MBOL,
804 or a future STCLASS check will fail this. */
806 /* Even in this situation we may use MBOL flag if strpos is offset
807 wrt the start of the string. */
808 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
809 && (strpos != strbeg) && strpos[-1] != '\n'
810 /* May be due to an implicit anchor of m{.*foo} */
811 && !(prog->reganch & ROPT_IMPLICIT))
816 DEBUG_EXECUTE_r( if (ml_anch)
817 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
818 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
821 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
823 prog->check_utf8 /* Could be deleted already */
824 && --BmUSEFUL(prog->check_utf8) < 0
825 && (prog->check_utf8 == prog->float_utf8)
827 prog->check_substr /* Could be deleted already */
828 && --BmUSEFUL(prog->check_substr) < 0
829 && (prog->check_substr == prog->float_substr)
832 /* If flags & SOMETHING - do not do it many times on the same match */
833 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
834 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
835 if (do_utf8 ? prog->check_substr : prog->check_utf8)
836 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
837 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
838 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
839 check = Nullsv; /* abort */
841 /* XXXX This is a remnant of the old implementation. It
842 looks wasteful, since now INTUIT can use many
844 prog->reganch &= ~RE_USE_INTUIT;
851 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
852 if (prog->regstclass) {
853 /* minlen == 0 is possible if regstclass is \b or \B,
854 and the fixed substr is ''$.
855 Since minlen is already taken into account, s+1 is before strend;
856 accidentally, minlen >= 1 guaranties no false positives at s + 1
857 even for \b or \B. But (minlen? 1 : 0) below assumes that
858 regstclass does not come from lookahead... */
859 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
860 This leaves EXACTF only, which is dealt with in find_byclass(). */
861 U8* str = (U8*)STRING(prog->regstclass);
862 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
863 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
865 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
866 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
867 : (prog->float_substr || prog->float_utf8
868 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
871 char *startpos = strbeg;
875 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
880 if (endpos == strend) {
881 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
882 "Could not match STCLASS...\n") );
885 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
886 "This position contradicts STCLASS...\n") );
887 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
889 /* Contradict one of substrings */
890 if (prog->anchored_substr || prog->anchored_utf8) {
891 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
892 DEBUG_EXECUTE_r( what = "anchored" );
894 s = HOP3c(t, 1, strend);
895 if (s + start_shift + end_shift > strend) {
896 /* XXXX Should be taken into account earlier? */
897 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
898 "Could not match STCLASS...\n") );
903 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
904 "Looking for %s substr starting at offset %ld...\n",
905 what, (long)(s + start_shift - i_strpos)) );
908 /* Have both, check_string is floating */
909 if (t + start_shift >= check_at) /* Contradicts floating=check */
910 goto retry_floating_check;
911 /* Recheck anchored substring, but not floating... */
915 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
916 "Looking for anchored substr starting at offset %ld...\n",
917 (long)(other_last - i_strpos)) );
918 goto do_other_anchored;
920 /* Another way we could have checked stclass at the
921 current position only: */
926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
927 "Looking for /%s^%s/m starting at offset %ld...\n",
928 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
931 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
933 /* Check is floating subtring. */
934 retry_floating_check:
935 t = check_at - start_shift;
936 DEBUG_EXECUTE_r( what = "floating" );
937 goto hop_and_restart;
940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
941 "By STCLASS: moving %ld --> %ld\n",
942 (long)(t - i_strpos), (long)(s - i_strpos))
946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
947 "Does not contradict STCLASS...\n");
952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
953 PL_colors[4], (check ? "Guessed" : "Giving up"),
954 PL_colors[5], (long)(s - i_strpos)) );
957 fail_finish: /* Substring not found */
958 if (prog->check_substr || prog->check_utf8) /* could be removed already */
959 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
961 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
962 PL_colors[4],PL_colors[5]));
966 /* We know what class REx starts with. Try to find this position... */
968 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
970 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
974 register STRLEN uskip;
978 register I32 tmp = 1; /* Scratch variable? */
979 register bool do_utf8 = PL_reg_match_utf8;
981 /* We know what class it must start with. */
985 while (s + (uskip = UTF8SKIP(s)) <= strend) {
986 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
987 !UTF8_IS_INVARIANT((U8)s[0]) ?
988 reginclass(c, (U8*)s, 0, do_utf8) :
989 REGINCLASS(c, (U8*)s)) {
990 if (tmp && (norun || regtry(prog, s)))
1001 while (s < strend) {
1004 if (REGINCLASS(c, (U8*)s) ||
1005 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1006 /* The assignment of 2 is intentional:
1007 * for the folded sharp s, the skip is 2. */
1008 (skip = SHARP_S_SKIP))) {
1009 if (tmp && (norun || regtry(prog, s)))
1021 while (s < strend) {
1022 if (tmp && (norun || regtry(prog, s)))
1031 ln = STR_LEN(c); /* length to match in octets/bytes */
1032 lnc = (I32) ln; /* length to match in characters */
1034 STRLEN ulen1, ulen2;
1036 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1037 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1039 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1040 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1042 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1043 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1044 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1045 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1047 while (sm < ((U8 *) m + ln)) {
1062 c2 = PL_fold_locale[c1];
1064 e = HOP3c(strend, -((I32)lnc), s);
1067 e = s; /* Due to minlen logic of intuit() */
1069 /* The idea in the EXACTF* cases is to first find the
1070 * first character of the EXACTF* node and then, if
1071 * necessary, case-insensitively compare the full
1072 * text of the node. The c1 and c2 are the first
1073 * characters (though in Unicode it gets a bit
1074 * more complicated because there are more cases
1075 * than just upper and lower: one needs to use
1076 * the so-called folding case for case-insensitive
1077 * matching (called "loose matching" in Unicode).
1078 * ibcmp_utf8() will do just that. */
1082 U8 tmpbuf [UTF8_MAXBYTES+1];
1083 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1084 STRLEN len, foldlen;
1087 /* Upper and lower of 1st char are equal -
1088 * probably not a "letter". */
1090 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1092 0 : UTF8_ALLOW_ANY);
1095 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1096 m, (char **)0, ln, (bool)UTF))
1097 && (norun || regtry(prog, s)) )
1100 uvchr_to_utf8(tmpbuf, c);
1101 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1103 && (f == c1 || f == c2)
1104 && (ln == foldlen ||
1105 !ibcmp_utf8((char *) foldbuf,
1106 (char **)0, foldlen, do_utf8,
1108 (char **)0, ln, (bool)UTF))
1109 && (norun || regtry(prog, s)) )
1117 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1119 0 : UTF8_ALLOW_ANY);
1121 /* Handle some of the three Greek sigmas cases.
1122 * Note that not all the possible combinations
1123 * are handled here: some of them are handled
1124 * by the standard folding rules, and some of
1125 * them (the character class or ANYOF cases)
1126 * are handled during compiletime in
1127 * regexec.c:S_regclass(). */
1128 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1129 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1130 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1132 if ( (c == c1 || c == c2)
1134 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1135 m, (char **)0, ln, (bool)UTF))
1136 && (norun || regtry(prog, s)) )
1139 uvchr_to_utf8(tmpbuf, c);
1140 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1142 && (f == c1 || f == c2)
1143 && (ln == foldlen ||
1144 !ibcmp_utf8((char *) foldbuf,
1145 (char **)0, foldlen, do_utf8,
1147 (char **)0, ln, (bool)UTF))
1148 && (norun || regtry(prog, s)) )
1159 && (ln == 1 || !(OP(c) == EXACTF
1161 : ibcmp_locale(s, m, ln)))
1162 && (norun || regtry(prog, s)) )
1168 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1169 && (ln == 1 || !(OP(c) == EXACTF
1171 : ibcmp_locale(s, m, ln)))
1172 && (norun || regtry(prog, s)) )
1179 PL_reg_flags |= RF_tainted;
1186 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1188 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1190 tmp = ((OP(c) == BOUND ?
1191 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1192 LOAD_UTF8_CHARCLASS(alnum,"a");
1193 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1194 if (tmp == !(OP(c) == BOUND ?
1195 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1196 isALNUM_LC_utf8((U8*)s)))
1199 if ((norun || regtry(prog, s)))
1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1207 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1208 while (s < strend) {
1210 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1212 if ((norun || regtry(prog, s)))
1218 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1222 PL_reg_flags |= RF_tainted;
1229 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1231 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1233 tmp = ((OP(c) == NBOUND ?
1234 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1235 LOAD_UTF8_CHARCLASS(alnum,"a");
1236 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1237 if (tmp == !(OP(c) == NBOUND ?
1238 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1239 isALNUM_LC_utf8((U8*)s)))
1241 else if ((norun || regtry(prog, s)))
1247 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1248 tmp = ((OP(c) == NBOUND ?
1249 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1250 while (s < strend) {
1252 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1254 else if ((norun || regtry(prog, s)))
1259 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1264 LOAD_UTF8_CHARCLASS(alnum,"a");
1265 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1266 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1267 if (tmp && (norun || regtry(prog, s)))
1278 while (s < strend) {
1280 if (tmp && (norun || regtry(prog, s)))
1292 PL_reg_flags |= RF_tainted;
1294 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1295 if (isALNUM_LC_utf8((U8*)s)) {
1296 if (tmp && (norun || regtry(prog, s)))
1307 while (s < strend) {
1308 if (isALNUM_LC(*s)) {
1309 if (tmp && (norun || regtry(prog, s)))
1322 LOAD_UTF8_CHARCLASS(alnum,"a");
1323 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1324 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1325 if (tmp && (norun || regtry(prog, s)))
1336 while (s < strend) {
1338 if (tmp && (norun || regtry(prog, s)))
1350 PL_reg_flags |= RF_tainted;
1352 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1353 if (!isALNUM_LC_utf8((U8*)s)) {
1354 if (tmp && (norun || regtry(prog, s)))
1365 while (s < strend) {
1366 if (!isALNUM_LC(*s)) {
1367 if (tmp && (norun || regtry(prog, s)))
1380 LOAD_UTF8_CHARCLASS(space," ");
1381 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1382 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1383 if (tmp && (norun || regtry(prog, s)))
1394 while (s < strend) {
1396 if (tmp && (norun || regtry(prog, s)))
1408 PL_reg_flags |= RF_tainted;
1410 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1411 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1412 if (tmp && (norun || regtry(prog, s)))
1423 while (s < strend) {
1424 if (isSPACE_LC(*s)) {
1425 if (tmp && (norun || regtry(prog, s)))
1438 LOAD_UTF8_CHARCLASS(space," ");
1439 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1440 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1441 if (tmp && (norun || regtry(prog, s)))
1452 while (s < strend) {
1454 if (tmp && (norun || regtry(prog, s)))
1466 PL_reg_flags |= RF_tainted;
1468 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1469 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1470 if (tmp && (norun || regtry(prog, s)))
1481 while (s < strend) {
1482 if (!isSPACE_LC(*s)) {
1483 if (tmp && (norun || regtry(prog, s)))
1496 LOAD_UTF8_CHARCLASS(digit,"0");
1497 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1498 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1499 if (tmp && (norun || regtry(prog, s)))
1510 while (s < strend) {
1512 if (tmp && (norun || regtry(prog, s)))
1524 PL_reg_flags |= RF_tainted;
1526 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1527 if (isDIGIT_LC_utf8((U8*)s)) {
1528 if (tmp && (norun || regtry(prog, s)))
1539 while (s < strend) {
1540 if (isDIGIT_LC(*s)) {
1541 if (tmp && (norun || regtry(prog, s)))
1554 LOAD_UTF8_CHARCLASS(digit,"0");
1555 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1556 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1557 if (tmp && (norun || regtry(prog, s)))
1568 while (s < strend) {
1570 if (tmp && (norun || regtry(prog, s)))
1582 PL_reg_flags |= RF_tainted;
1584 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1585 if (!isDIGIT_LC_utf8((U8*)s)) {
1586 if (tmp && (norun || regtry(prog, s)))
1597 while (s < strend) {
1598 if (!isDIGIT_LC(*s)) {
1599 if (tmp && (norun || regtry(prog, s)))
1611 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1620 - regexec_flags - match a regexp against a string
1623 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1624 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1625 /* strend: pointer to null at end of string */
1626 /* strbeg: real beginning of string */
1627 /* minend: end of match must be >=minend after stringarg. */
1628 /* data: May be used for some additional optimizations. */
1629 /* nosave: For optimizations. */
1632 register regnode *c;
1633 register char *startpos = stringarg;
1634 I32 minlen; /* must match at least this many chars */
1635 I32 dontbother = 0; /* how many characters not to try at end */
1636 /* I32 start_shift = 0; */ /* Offset of the start to find
1637 constant substr. */ /* CC */
1638 I32 end_shift = 0; /* Same for the end. */ /* CC */
1639 I32 scream_pos = -1; /* Internal iterator of scream. */
1641 SV* oreplsv = GvSV(PL_replgv);
1642 bool do_utf8 = DO_UTF8(sv);
1643 I32 multiline = prog->reganch & PMf_MULTILINE;
1645 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1646 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1649 GET_RE_DEBUG_FLAGS_DECL;
1651 RX_MATCH_UTF8_set(prog,do_utf8);
1657 PL_regnarrate = DEBUG_r_TEST;
1660 /* Be paranoid... */
1661 if (prog == NULL || startpos == NULL) {
1662 Perl_croak(aTHX_ "NULL regexp parameter");
1666 minlen = prog->minlen;
1667 if (strend - startpos < minlen) {
1668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1669 "String too short [regexec_flags]...\n"));
1673 /* Check validity of program. */
1674 if (UCHARAT(prog->program) != REG_MAGIC) {
1675 Perl_croak(aTHX_ "corrupted regexp program");
1679 PL_reg_eval_set = 0;
1682 if (prog->reganch & ROPT_UTF8)
1683 PL_reg_flags |= RF_utf8;
1685 /* Mark beginning of line for ^ and lookbehind. */
1686 PL_regbol = startpos;
1690 /* Mark end of line for $ (and such) */
1693 /* see how far we have to get to not match where we matched before */
1694 PL_regtill = startpos+minend;
1696 /* We start without call_cc context. */
1699 /* If there is a "must appear" string, look for it. */
1702 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1705 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1706 PL_reg_ganch = startpos;
1707 else if (sv && SvTYPE(sv) >= SVt_PVMG
1709 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1710 && mg->mg_len >= 0) {
1711 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1712 if (prog->reganch & ROPT_ANCH_GPOS) {
1713 if (s > PL_reg_ganch)
1718 else /* pos() not defined */
1719 PL_reg_ganch = strbeg;
1722 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1723 re_scream_pos_data d;
1725 d.scream_olds = &scream_olds;
1726 d.scream_pos = &scream_pos;
1727 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1729 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1730 goto phooey; /* not present */
1736 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1737 UNI_DISPLAY_REGEX) :
1739 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1740 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1741 UNI_DISPLAY_REGEX) : startpos;
1742 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1745 PerlIO_printf(Perl_debug_log,
1746 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1747 PL_colors[4],PL_colors[5],PL_colors[0],
1750 len0 > 60 ? "..." : "",
1752 (int)(len1 > 60 ? 60 : len1),
1754 (len1 > 60 ? "..." : "")
1758 /* Simplest case: anchored match need be tried only once. */
1759 /* [unless only anchor is BOL and multiline is set] */
1760 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1761 if (s == startpos && regtry(prog, startpos))
1763 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1764 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1769 dontbother = minlen - 1;
1770 end = HOP3c(strend, -dontbother, strbeg) - 1;
1771 /* for multiline we only have to try after newlines */
1772 if (prog->check_substr || prog->check_utf8) {
1776 if (regtry(prog, s))
1781 if (prog->reganch & RE_USE_INTUIT) {
1782 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1793 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1794 if (regtry(prog, s))
1801 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1802 if (regtry(prog, PL_reg_ganch))
1807 /* Messy cases: unanchored match. */
1808 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1809 /* we have /x+whatever/ */
1810 /* it must be a one character string (XXXX Except UTF?) */
1815 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1816 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1817 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1820 while (s < strend) {
1822 DEBUG_EXECUTE_r( did_match = 1 );
1823 if (regtry(prog, s)) goto got_it;
1825 while (s < strend && *s == ch)
1832 while (s < strend) {
1834 DEBUG_EXECUTE_r( did_match = 1 );
1835 if (regtry(prog, s)) goto got_it;
1837 while (s < strend && *s == ch)
1843 DEBUG_EXECUTE_r(if (!did_match)
1844 PerlIO_printf(Perl_debug_log,
1845 "Did not find anchored character...\n")
1849 else if (prog->anchored_substr != Nullsv
1850 || prog->anchored_utf8 != Nullsv
1851 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1852 && prog->float_max_offset < strend - s)) {
1857 char *last1; /* Last position checked before */
1861 if (prog->anchored_substr || prog->anchored_utf8) {
1862 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1863 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1864 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1865 back_max = back_min = prog->anchored_offset;
1867 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1868 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1869 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1870 back_max = prog->float_max_offset;
1871 back_min = prog->float_min_offset;
1873 if (must == &PL_sv_undef)
1874 /* could not downgrade utf8 check substring, so must fail */
1877 last = HOP3c(strend, /* Cannot start after this */
1878 -(I32)(CHR_SVLEN(must)
1879 - (SvTAIL(must) != 0) + back_min), strbeg);
1882 last1 = HOPc(s, -1);
1884 last1 = s - 1; /* bogus */
1886 /* XXXX check_substr already used to find `s', can optimize if
1887 check_substr==must. */
1889 dontbother = end_shift;
1890 strend = HOPc(strend, -dontbother);
1891 while ( (s <= last) &&
1892 ((flags & REXEC_SCREAM)
1893 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1894 end_shift, &scream_pos, 0))
1895 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1896 (unsigned char*)strend, must,
1897 multiline ? FBMrf_MULTILINE : 0))) ) {
1898 /* we may be pointing at the wrong string */
1899 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1900 s = strbeg + (s - SvPVX(sv));
1901 DEBUG_EXECUTE_r( did_match = 1 );
1902 if (HOPc(s, -back_max) > last1) {
1903 last1 = HOPc(s, -back_min);
1904 s = HOPc(s, -back_max);
1907 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1909 last1 = HOPc(s, -back_min);
1913 while (s <= last1) {
1914 if (regtry(prog, s))
1920 while (s <= last1) {
1921 if (regtry(prog, s))
1927 DEBUG_EXECUTE_r(if (!did_match)
1928 PerlIO_printf(Perl_debug_log,
1929 "Did not find %s substr `%s%.*s%s'%s...\n",
1930 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1931 ? "anchored" : "floating"),
1933 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1935 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1939 else if ((c = prog->regstclass)) {
1941 I32 op = (U8)OP(prog->regstclass);
1942 /* don't bother with what can't match */
1943 if (PL_regkind[op] != EXACT && op != CANY)
1944 strend = HOPc(strend, -(minlen - 1));
1947 SV *prop = sv_newmortal();
1955 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1956 UNI_DISPLAY_REGEX) :
1958 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1960 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1961 len1 = UTF ? SvCUR(dsv1) : strend - s;
1962 PerlIO_printf(Perl_debug_log,
1963 "Matching stclass `%*.*s' against `%*.*s'\n",
1967 if (find_byclass(prog, c, s, strend, startpos, 0))
1969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1973 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1978 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1979 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1980 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1982 if (flags & REXEC_SCREAM) {
1983 last = screaminstr(sv, float_real, s - strbeg,
1984 end_shift, &scream_pos, 1); /* last one */
1986 last = scream_olds; /* Only one occurrence. */
1987 /* we may be pointing at the wrong string */
1988 else if (RX_MATCH_COPIED(prog))
1989 s = strbeg + (s - SvPVX(sv));
1993 char *little = SvPV(float_real, len);
1995 if (SvTAIL(float_real)) {
1996 if (memEQ(strend - len + 1, little, len - 1))
1997 last = strend - len + 1;
1998 else if (!multiline)
1999 last = memEQ(strend - len, little, len)
2000 ? strend - len : Nullch;
2006 last = rninstr(s, strend, little, little + len);
2008 last = strend; /* matching `$' */
2012 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2013 "%sCan't trim the tail, match fails (should not happen)%s\n",
2014 PL_colors[4],PL_colors[5]));
2015 goto phooey; /* Should not happen! */
2017 dontbother = strend - last + prog->float_min_offset;
2019 if (minlen && (dontbother < minlen))
2020 dontbother = minlen - 1;
2021 strend -= dontbother; /* this one's always in bytes! */
2022 /* We don't know much -- general case. */
2025 if (regtry(prog, s))
2034 if (regtry(prog, s))
2036 } while (s++ < strend);
2044 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2046 if (PL_reg_eval_set) {
2047 /* Preserve the current value of $^R */
2048 if (oreplsv != GvSV(PL_replgv))
2049 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2050 restored, the value remains
2052 restore_pos(aTHX_ 0);
2055 /* make sure $`, $&, $', and $digit will work later */
2056 if ( !(flags & REXEC_NOT_FIRST) ) {
2057 RX_MATCH_COPY_FREE(prog);
2058 if (flags & REXEC_COPY_STR) {
2059 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2060 #ifdef PERL_COPY_ON_WRITE
2062 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2064 PerlIO_printf(Perl_debug_log,
2065 "Copy on write: regexp capture, type %d\n",
2068 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2069 prog->subbeg = SvPVX(prog->saved_copy);
2070 assert (SvPOKp(prog->saved_copy));
2074 RX_MATCH_COPIED_on(prog);
2075 s = savepvn(strbeg, i);
2081 prog->subbeg = strbeg;
2082 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2089 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2090 PL_colors[4],PL_colors[5]));
2091 if (PL_reg_eval_set)
2092 restore_pos(aTHX_ 0);
2097 - regtry - try match at specific point
2099 STATIC I32 /* 0 failure, 1 success */
2100 S_regtry(pTHX_ regexp *prog, char *startpos)
2106 GET_RE_DEBUG_FLAGS_DECL;
2109 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2111 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2114 PL_reg_eval_set = RS_init;
2115 DEBUG_EXECUTE_r(DEBUG_s(
2116 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2117 (IV)(PL_stack_sp - PL_stack_base));
2119 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2120 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2121 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2123 /* Apparently this is not needed, judging by wantarray. */
2124 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2125 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2128 /* Make $_ available to executed code. */
2129 if (PL_reg_sv != DEFSV) {
2134 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2135 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2136 /* prepare for quick setting of pos */
2137 sv_magic(PL_reg_sv, (SV*)0,
2138 PERL_MAGIC_regex_global, Nullch, 0);
2139 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2143 PL_reg_oldpos = mg->mg_len;
2144 SAVEDESTRUCTOR_X(restore_pos, 0);
2146 if (!PL_reg_curpm) {
2147 Newz(22,PL_reg_curpm, 1, PMOP);
2150 SV* repointer = newSViv(0);
2151 /* so we know which PL_regex_padav element is PL_reg_curpm */
2152 SvFLAGS(repointer) |= SVf_BREAK;
2153 av_push(PL_regex_padav,repointer);
2154 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2155 PL_regex_pad = AvARRAY(PL_regex_padav);
2159 PM_SETRE(PL_reg_curpm, prog);
2160 PL_reg_oldcurpm = PL_curpm;
2161 PL_curpm = PL_reg_curpm;
2162 if (RX_MATCH_COPIED(prog)) {
2163 /* Here is a serious problem: we cannot rewrite subbeg,
2164 since it may be needed if this match fails. Thus
2165 $` inside (?{}) could fail... */
2166 PL_reg_oldsaved = prog->subbeg;
2167 PL_reg_oldsavedlen = prog->sublen;
2168 #ifdef PERL_COPY_ON_WRITE
2169 PL_nrs = prog->saved_copy;
2171 RX_MATCH_COPIED_off(prog);
2174 PL_reg_oldsaved = Nullch;
2175 prog->subbeg = PL_bostr;
2176 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2178 prog->startp[0] = startpos - PL_bostr;
2179 PL_reginput = startpos;
2180 PL_regstartp = prog->startp;
2181 PL_regendp = prog->endp;
2182 PL_reglastparen = &prog->lastparen;
2183 PL_reglastcloseparen = &prog->lastcloseparen;
2184 prog->lastparen = 0;
2185 prog->lastcloseparen = 0;
2187 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2188 if (PL_reg_start_tmpl <= prog->nparens) {
2189 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2190 if(PL_reg_start_tmp)
2191 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2193 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2196 /* XXXX What this code is doing here?!!! There should be no need
2197 to do this again and again, PL_reglastparen should take care of
2200 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2201 * Actually, the code in regcppop() (which Ilya may be meaning by
2202 * PL_reglastparen), is not needed at all by the test suite
2203 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2204 * enough, for building DynaLoader, or otherwise this
2205 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2206 * will happen. Meanwhile, this code *is* needed for the
2207 * above-mentioned test suite tests to succeed. The common theme
2208 * on those tests seems to be returning null fields from matches.
2213 if (prog->nparens) {
2214 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2221 if (regmatch(prog->program + 1)) {
2222 prog->endp[0] = PL_reginput - PL_bostr;
2225 REGCP_UNWIND(lastcp);
2229 #define RE_UNWIND_BRANCH 1
2230 #define RE_UNWIND_BRANCHJ 2
2234 typedef struct { /* XX: makes sense to enlarge it... */
2238 } re_unwind_generic_t;
2251 } re_unwind_branch_t;
2253 typedef union re_unwind_t {
2255 re_unwind_generic_t generic;
2256 re_unwind_branch_t branch;
2259 #define sayYES goto yes
2260 #define sayNO goto no
2261 #define sayNO_ANYOF goto no_anyof
2262 #define sayYES_FINAL goto yes_final
2263 #define sayYES_LOUD goto yes_loud
2264 #define sayNO_FINAL goto no_final
2265 #define sayNO_SILENT goto do_no
2266 #define saySAME(x) if (x) goto yes; else goto no
2268 /* this is used to determine how far from the left messages like
2269 'failed...' are printed. Currently 29 makes these messages line
2270 up with the opcode they refer to. Earlier perls used 25 which
2271 left these messages outdented making reviewing a debug output
2274 #define REPORT_CODE_OFF 29
2277 /* Make sure there is a test for this +1 options in re_tests */
2278 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2280 #define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2281 if ( trie->states[ state ].wordnum ) { \
2282 if ( !accepted ) { \
2285 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2286 sv_accept_buff=NEWSV( 1234, \
2287 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2288 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2289 SvPOK_on( sv_accept_buff ); \
2290 sv_2mortal( sv_accept_buff ); \
2291 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2293 if ( accepted >= bufflen ) { \
2295 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2296 bufflen * sizeof(reg_trie_accepted) ); \
2298 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2299 + sizeof( reg_trie_accepted ) ); \
2301 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2302 accept_buff[ accepted ].endpos = uc; \
2306 #define TRIE_HANDLE_CHAR STMT_START { \
2307 if ( uvc < 256 ) { \
2308 charid = trie->charmap[ uvc ]; \
2311 if( trie->widecharmap ) { \
2312 SV** svpp = (SV**)NULL; \
2313 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2314 sizeof( UV ), 0 ); \
2316 charid = (U16)SvIV( *svpp ); \
2321 ( base + charid - 1 - trie->uniquecharcount ) >=0 && \
2322 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2324 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2332 - regmatch - main matching routine
2334 * Conceptually the strategy is simple: check to see whether the current
2335 * node matches, call self recursively to see whether the rest matches,
2336 * and then act accordingly. In practice we make some effort to avoid
2337 * recursion, in particular by going through "ordinary" nodes (that don't
2338 * need to know whether the rest of the match failed) by a loop instead of
2341 /* [lwall] I've hoisted the register declarations to the outer block in order to
2342 * maybe save a little bit of pushing and popping on the stack. It also takes
2343 * advantage of machines that use a register save mask on subroutine entry.
2345 STATIC I32 /* 0 failure, 1 success */
2346 S_regmatch(pTHX_ regnode *prog)
2348 register regnode *scan; /* Current node. */
2349 regnode *next; /* Next node. */
2350 regnode *inner; /* Next node in internal branch. */
2351 register I32 nextchr; /* renamed nextchr - nextchar colides with
2352 function of same name */
2353 register I32 n; /* no or next */
2354 register I32 ln = 0; /* len or last */
2355 register char *s = Nullch; /* operand or save */
2356 register char *locinput = PL_reginput;
2357 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2358 int minmod = 0, sw = 0, logical = 0;
2361 /* used by the trie code */
2362 SV *sv_accept_buff; /* accepting states we have traversed */
2363 reg_trie_accepted *accept_buff; /* "" */
2364 reg_trie_data *trie; /* what trie are we using right now */
2365 U32 accepted = 0; /* how many accepting states we have seen*/
2368 I32 firstcp = PL_savestack_ix;
2370 register bool do_utf8 = PL_reg_match_utf8;
2372 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2373 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2374 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2386 /* Note that nextchr is a byte even in UTF */
2387 nextchr = UCHARAT(locinput);
2389 while (scan != NULL) {
2392 SV *prop = sv_newmortal();
2393 int docolor = *PL_colors[0];
2394 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2395 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2396 /* The part of the string before starttry has one color
2397 (pref0_len chars), between starttry and current
2398 position another one (pref_len - pref0_len chars),
2399 after the current position the third one.
2400 We assume that pref0_len <= pref_len, otherwise we
2401 decrease pref0_len. */
2402 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2403 ? (5 + taill) - l : locinput - PL_bostr;
2406 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2408 pref0_len = pref_len - (locinput - PL_reg_starttry);
2409 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2410 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2411 ? (5 + taill) - pref_len : PL_regeol - locinput);
2412 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2416 if (pref0_len > pref_len)
2417 pref0_len = pref_len;
2418 regprop(prop, scan);
2421 do_utf8 && OP(scan) != CANY ?
2422 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2423 pref0_len, 60, UNI_DISPLAY_REGEX) :
2424 locinput - pref_len;
2425 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2426 char *s1 = do_utf8 && OP(scan) != CANY ?
2427 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2428 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2429 locinput - pref_len + pref0_len;
2430 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2431 char *s2 = do_utf8 && OP(scan) != CANY ?
2432 pv_uni_display(dsv2, (U8*)locinput,
2433 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2435 int len2 = do_utf8 ? strlen(s2) : l;
2436 PerlIO_printf(Perl_debug_log,
2437 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2438 (IV)(locinput - PL_bostr),
2445 (docolor ? "" : "> <"),
2449 15 - l - pref_len + 1,
2451 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2456 next = scan + NEXT_OFF(scan);
2462 if (locinput == PL_bostr)
2464 /* regtill = regbol; */
2469 if (locinput == PL_bostr ||
2470 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2476 if (locinput == PL_bostr)
2480 if (locinput == PL_reg_ganch)
2486 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2491 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2493 if (PL_regeol - locinput > 1)
2497 if (PL_regeol != locinput)
2501 if (!nextchr && locinput >= PL_regeol)
2504 locinput += PL_utf8skip[nextchr];
2505 if (locinput > PL_regeol)
2507 nextchr = UCHARAT(locinput);
2510 nextchr = UCHARAT(++locinput);
2513 if (!nextchr && locinput >= PL_regeol)
2515 nextchr = UCHARAT(++locinput);
2518 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2521 locinput += PL_utf8skip[nextchr];
2522 if (locinput > PL_regeol)
2524 nextchr = UCHARAT(locinput);
2527 nextchr = UCHARAT(++locinput);
2533 traverse the TRIE keeping track of all accepting states
2534 we transition through until we get to a failing node.
2536 we use two slightly different pieces of code to handle
2537 the traversal depending on whether its case sensitive or
2538 not. we reuse the accept code however. (this should probably
2539 be turned into a macro.)
2546 U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2547 U8 *uc = ( U8* )locinput;
2554 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2555 U8 *uscan = (U8*)NULL;
2559 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2561 while ( state && uc <= (U8*)PL_regeol ) {
2563 TRIE_CHECK_STATE_IS_ACCEPTING;
2565 base = trie->states[ state ].trans.base;
2567 DEBUG_TRIE_EXECUTE_r(
2568 PerlIO_printf( Perl_debug_log,
2569 "%*s %sState: %4x, Base: %4x Accepted: %4x ",
2570 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2571 state, base, accepted );
2576 if ( do_utf8 || UTF ) {
2578 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2583 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2584 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2585 foldlen -= UNISKIP( uvc );
2586 uscan = foldbuf + UNISKIP( uvc );
2598 DEBUG_TRIE_EXECUTE_r(
2599 PerlIO_printf( Perl_debug_log,
2600 "Charid:%3x CV:%4x After State: %4x%s\n",
2601 charid, uvc, state, PL_colors[5] );
2610 /* unreached codepoint: we jump into the middle of the next case
2611 from previous if blocks */
2614 U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2615 U8 *uc = (U8*)locinput;
2624 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2626 while ( state && uc <= (U8*)PL_regeol ) {
2628 TRIE_CHECK_STATE_IS_ACCEPTING;
2630 base = trie->states[ state ].trans.base;
2632 DEBUG_TRIE_EXECUTE_r(
2633 PerlIO_printf( Perl_debug_log,
2634 "%*s %sState: %4x, Base: %4x Accepted: %4x ",
2635 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2636 state, base, accepted );
2641 if ( do_utf8 || UTF ) {
2642 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2653 DEBUG_TRIE_EXECUTE_r(
2654 PerlIO_printf( Perl_debug_log,
2655 "Charid:%3x CV:%4x After State: %4x%s\n",
2656 charid, uvc, state, PL_colors[5] );
2666 There was at least one accepting state that we
2667 transitioned through. Presumably the number of accepting
2668 states is going to be low, typically one or two. So we
2669 simply scan through to find the one with lowest wordnum.
2670 Once we find it, we swap the last state into its place
2671 and decrement the size. We then try to match the rest of
2672 the pattern at the point where the word ends, if we
2673 succeed then we end the loop, otherwise the loop
2674 eventually terminates once all of the accepting states
2681 if ( accepted == 1 ) {
2683 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2684 PerlIO_printf( Perl_debug_log,
2685 "%*s %sonly one match : #%d <%s>%s\n",
2686 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],
2687 accept_buff[ 0 ].wordnum,
2688 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",
2691 PL_reginput = accept_buff[ 0 ].endpos;
2692 /* in this case we free tmps/leave before we call regmatch
2693 as we wont be using accept_buff again. */
2696 gotit = regmatch( scan + NEXT_OFF( scan ) );
2699 PerlIO_printf( Perl_debug_log,"%*s %sgot %d possible matches%s\n",
2700 REPORT_CODE_OFF + PL_regindent * 2, "",PL_colors[4], accepted,
2703 while ( !gotit && accepted-- ) {
2706 for( cur = 1 ; cur <= accepted ; cur++ ) {
2707 DEBUG_TRIE_EXECUTE_r(
2708 PerlIO_printf( Perl_debug_log,
2709 "%*s %sgot %d (%d) as best, looking at %d (%d)%s\n",
2710 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2711 best, accept_buff[ best ].wordnum, cur,
2712 accept_buff[ cur ].wordnum, PL_colors[5] );
2715 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2719 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2720 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2721 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],
2722 accept_buff[best].wordnum,
2723 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",scan,
2726 if ( best<accepted ) {
2727 reg_trie_accepted tmp = accept_buff[ best ];
2728 accept_buff[ best ] = accept_buff[ accepted ];
2729 accept_buff[ accepted ] = tmp;
2732 PL_reginput = accept_buff[ best ].endpos;
2735 as far as I can tell we only need the SAVETMPS/FREETMPS
2736 for re's with EVAL in them but I'm leaving them in for
2737 all until I can be sure.
2740 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2753 /* unreached codepoint */
2757 if (do_utf8 != UTF) {
2758 /* The target and the pattern have differing utf8ness. */
2764 /* The target is utf8, the pattern is not utf8. */
2768 if (NATIVE_TO_UNI(*(U8*)s) !=
2769 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2771 0 : UTF8_ALLOW_ANY))
2778 /* The target is not utf8, the pattern is utf8. */
2782 if (NATIVE_TO_UNI(*((U8*)l)) !=
2783 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2785 0 : UTF8_ALLOW_ANY))
2792 nextchr = UCHARAT(locinput);
2795 /* The target and the pattern have the same utf8ness. */
2796 /* Inline the first character, for speed. */
2797 if (UCHARAT(s) != nextchr)
2799 if (PL_regeol - locinput < ln)
2801 if (ln > 1 && memNE(s, locinput, ln))
2804 nextchr = UCHARAT(locinput);
2807 PL_reg_flags |= RF_tainted;
2813 if (do_utf8 || UTF) {
2814 /* Either target or the pattern are utf8. */
2816 char *e = PL_regeol;
2818 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2819 l, &e, 0, do_utf8)) {
2820 /* One more case for the sharp s:
2821 * pack("U0U*", 0xDF) =~ /ss/i,
2822 * the 0xC3 0x9F are the UTF-8
2823 * byte sequence for the U+00DF. */
2825 toLOWER(s[0]) == 's' &&
2827 toLOWER(s[1]) == 's' &&
2834 nextchr = UCHARAT(locinput);
2838 /* Neither the target and the pattern are utf8. */
2840 /* Inline the first character, for speed. */
2841 if (UCHARAT(s) != nextchr &&
2842 UCHARAT(s) != ((OP(scan) == EXACTF)
2843 ? PL_fold : PL_fold_locale)[nextchr])
2845 if (PL_regeol - locinput < ln)
2847 if (ln > 1 && (OP(scan) == EXACTF
2848 ? ibcmp(s, locinput, ln)
2849 : ibcmp_locale(s, locinput, ln)))
2852 nextchr = UCHARAT(locinput);
2856 STRLEN inclasslen = PL_regeol - locinput;
2858 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2860 if (locinput >= PL_regeol)
2862 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2863 nextchr = UCHARAT(locinput);
2868 nextchr = UCHARAT(locinput);
2869 if (!REGINCLASS(scan, (U8*)locinput))
2871 if (!nextchr && locinput >= PL_regeol)
2873 nextchr = UCHARAT(++locinput);
2877 /* If we might have the case of the German sharp s
2878 * in a casefolding Unicode character class. */
2880 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2881 locinput += SHARP_S_SKIP;
2882 nextchr = UCHARAT(locinput);
2888 PL_reg_flags |= RF_tainted;
2894 LOAD_UTF8_CHARCLASS(alnum,"a");
2895 if (!(OP(scan) == ALNUM
2896 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2897 : isALNUM_LC_utf8((U8*)locinput)))
2901 locinput += PL_utf8skip[nextchr];
2902 nextchr = UCHARAT(locinput);
2905 if (!(OP(scan) == ALNUM
2906 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2908 nextchr = UCHARAT(++locinput);
2911 PL_reg_flags |= RF_tainted;
2914 if (!nextchr && locinput >= PL_regeol)
2917 LOAD_UTF8_CHARCLASS(alnum,"a");
2918 if (OP(scan) == NALNUM
2919 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2920 : isALNUM_LC_utf8((U8*)locinput))
2924 locinput += PL_utf8skip[nextchr];
2925 nextchr = UCHARAT(locinput);
2928 if (OP(scan) == NALNUM
2929 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2931 nextchr = UCHARAT(++locinput);
2935 PL_reg_flags |= RF_tainted;
2939 /* was last char in word? */
2941 if (locinput == PL_bostr)
2944 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2946 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
2948 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2949 ln = isALNUM_uni(ln);
2950 LOAD_UTF8_CHARCLASS(alnum,"a");
2951 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2954 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2955 n = isALNUM_LC_utf8((U8*)locinput);
2959 ln = (locinput != PL_bostr) ?
2960 UCHARAT(locinput - 1) : '\n';
2961 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2963 n = isALNUM(nextchr);
2966 ln = isALNUM_LC(ln);
2967 n = isALNUM_LC(nextchr);
2970 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2971 OP(scan) == BOUNDL))
2975 PL_reg_flags |= RF_tainted;
2981 if (UTF8_IS_CONTINUED(nextchr)) {
2982 LOAD_UTF8_CHARCLASS(space," ");
2983 if (!(OP(scan) == SPACE
2984 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2985 : isSPACE_LC_utf8((U8*)locinput)))
2989 locinput += PL_utf8skip[nextchr];
2990 nextchr = UCHARAT(locinput);
2993 if (!(OP(scan) == SPACE
2994 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2996 nextchr = UCHARAT(++locinput);
2999 if (!(OP(scan) == SPACE
3000 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3002 nextchr = UCHARAT(++locinput);
3006 PL_reg_flags |= RF_tainted;
3009 if (!nextchr && locinput >= PL_regeol)
3012 LOAD_UTF8_CHARCLASS(space," ");
3013 if (OP(scan) == NSPACE
3014 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3015 : isSPACE_LC_utf8((U8*)locinput))
3019 locinput += PL_utf8skip[nextchr];
3020 nextchr = UCHARAT(locinput);
3023 if (OP(scan) == NSPACE
3024 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3026 nextchr = UCHARAT(++locinput);
3029 PL_reg_flags |= RF_tainted;
3035 LOAD_UTF8_CHARCLASS(digit,"0");
3036 if (!(OP(scan) == DIGIT
3037 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3038 : isDIGIT_LC_utf8((U8*)locinput)))
3042 locinput += PL_utf8skip[nextchr];
3043 nextchr = UCHARAT(locinput);
3046 if (!(OP(scan) == DIGIT
3047 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3049 nextchr = UCHARAT(++locinput);
3052 PL_reg_flags |= RF_tainted;
3055 if (!nextchr && locinput >= PL_regeol)
3058 LOAD_UTF8_CHARCLASS(digit,"0");
3059 if (OP(scan) == NDIGIT
3060 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3061 : isDIGIT_LC_utf8((U8*)locinput))
3065 locinput += PL_utf8skip[nextchr];
3066 nextchr = UCHARAT(locinput);
3069 if (OP(scan) == NDIGIT
3070 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3072 nextchr = UCHARAT(++locinput);
3075 if (locinput >= PL_regeol)
3078 LOAD_UTF8_CHARCLASS(mark,"~");
3079 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3081 locinput += PL_utf8skip[nextchr];
3082 while (locinput < PL_regeol &&
3083 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3084 locinput += UTF8SKIP(locinput);
3085 if (locinput > PL_regeol)
3090 nextchr = UCHARAT(locinput);
3093 PL_reg_flags |= RF_tainted;
3097 n = ARG(scan); /* which paren pair */
3098 ln = PL_regstartp[n];
3099 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3100 if ((I32)*PL_reglastparen < n || ln == -1)
3101 sayNO; /* Do not match unless seen CLOSEn. */
3102 if (ln == PL_regendp[n])
3106 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3108 char *e = PL_bostr + PL_regendp[n];
3110 * Note that we can't do the "other character" lookup trick as
3111 * in the 8-bit case (no pun intended) because in Unicode we
3112 * have to map both upper and title case to lower case.
3114 if (OP(scan) == REFF) {
3115 STRLEN ulen1, ulen2;
3116 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3117 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3121 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3122 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3123 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3130 nextchr = UCHARAT(locinput);
3134 /* Inline the first character, for speed. */
3135 if (UCHARAT(s) != nextchr &&
3137 (UCHARAT(s) != ((OP(scan) == REFF
3138 ? PL_fold : PL_fold_locale)[nextchr]))))
3140 ln = PL_regendp[n] - ln;
3141 if (locinput + ln > PL_regeol)
3143 if (ln > 1 && (OP(scan) == REF
3144 ? memNE(s, locinput, ln)
3146 ? ibcmp(s, locinput, ln)
3147 : ibcmp_locale(s, locinput, ln))))
3150 nextchr = UCHARAT(locinput);
3161 OP_4tree *oop = PL_op;
3162 COP *ocurcop = PL_curcop;
3165 struct regexp *oreg = PL_reg_re;
3168 PL_op = (OP_4tree*)PL_regdata->data[n];
3169 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3170 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
3171 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3175 CALLRUNOPS(aTHX); /* Scalar context. */
3178 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3186 PAD_RESTORE_LOCAL(old_comppad);
3187 PL_curcop = ocurcop;
3189 if (logical == 2) { /* Postponed subexpression. */
3191 MAGIC *mg = Null(MAGIC*);
3193 CHECKPOINT cp, lastcp;
3197 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3198 mg = mg_find(sv, PERL_MAGIC_qr);
3199 else if (SvSMAGICAL(ret)) {
3200 if (SvGMAGICAL(ret))
3201 sv_unmagic(ret, PERL_MAGIC_qr);
3203 mg = mg_find(ret, PERL_MAGIC_qr);
3207 re = (regexp *)mg->mg_obj;
3208 (void)ReREFCNT_inc(re);
3212 char *t = SvPV(ret, len);
3214 char *oprecomp = PL_regprecomp;
3215 I32 osize = PL_regsize;
3216 I32 onpar = PL_regnpar;
3219 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3220 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
3222 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3224 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3226 PL_regprecomp = oprecomp;
3231 PerlIO_printf(Perl_debug_log,
3232 "Entering embedded `%s%.60s%s%s'\n",
3236 (strlen(re->precomp) > 60 ? "..." : ""))
3239 state.prev = PL_reg_call_cc;
3240 state.cc = PL_regcc;
3241 state.re = PL_reg_re;
3245 cp = regcppush(0); /* Save *all* the positions. */
3248 state.ss = PL_savestack_ix;
3249 *PL_reglastparen = 0;
3250 *PL_reglastcloseparen = 0;
3251 PL_reg_call_cc = &state;
3252 PL_reginput = locinput;
3253 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3254 ((re->reganch & ROPT_UTF8) != 0);
3255 if (toggleutf) PL_reg_flags ^= RF_utf8;
3257 /* XXXX This is too dramatic a measure... */
3260 if (regmatch(re->program + 1)) {
3261 /* Even though we succeeded, we need to restore
3262 global variables, since we may be wrapped inside
3263 SUSPEND, thus the match may be not finished yet. */
3265 /* XXXX Do this only if SUSPENDed? */
3266 PL_reg_call_cc = state.prev;
3267 PL_regcc = state.cc;
3268 PL_reg_re = state.re;
3269 cache_re(PL_reg_re);
3270 if (toggleutf) PL_reg_flags ^= RF_utf8;
3272 /* XXXX This is too dramatic a measure... */
3275 /* These are needed even if not SUSPEND. */
3281 REGCP_UNWIND(lastcp);
3283 PL_reg_call_cc = state.prev;
3284 PL_regcc = state.cc;
3285 PL_reg_re = state.re;
3286 cache_re(PL_reg_re);
3287 if (toggleutf) PL_reg_flags ^= RF_utf8;
3289 /* XXXX This is too dramatic a measure... */
3299 sv_setsv(save_scalar(PL_replgv), ret);
3305 n = ARG(scan); /* which paren pair */
3306 PL_reg_start_tmp[n] = locinput;
3311 n = ARG(scan); /* which paren pair */
3312 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3313 PL_regendp[n] = locinput - PL_bostr;
3314 if (n > (I32)*PL_reglastparen)
3315 *PL_reglastparen = n;
3316 *PL_reglastcloseparen = n;
3319 n = ARG(scan); /* which paren pair */
3320 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3323 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3325 next = NEXTOPER(NEXTOPER(scan));
3327 next = scan + ARG(scan);
3328 if (OP(next) == IFTHEN) /* Fake one. */
3329 next = NEXTOPER(NEXTOPER(next));
3333 logical = scan->flags;
3335 /*******************************************************************
3336 PL_regcc contains infoblock about the innermost (...)* loop, and
3337 a pointer to the next outer infoblock.
3339 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3341 1) After matching X, regnode for CURLYX is processed;
3343 2) This regnode creates infoblock on the stack, and calls
3344 regmatch() recursively with the starting point at WHILEM node;
3346 3) Each hit of WHILEM node tries to match A and Z (in the order
3347 depending on the current iteration, min/max of {min,max} and
3348 greediness). The information about where are nodes for "A"
3349 and "Z" is read from the infoblock, as is info on how many times "A"
3350 was already matched, and greediness.
3352 4) After A matches, the same WHILEM node is hit again.
3354 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3355 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3356 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3357 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3358 of the external loop.
3360 Currently present infoblocks form a tree with a stem formed by PL_curcc
3361 and whatever it mentions via ->next, and additional attached trees
3362 corresponding to temporarily unset infoblocks as in "5" above.
3364 In the following picture infoblocks for outer loop of
3365 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3366 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3367 infoblocks are drawn below the "reset" infoblock.
3369 In fact in the picture below we do not show failed matches for Z and T
3370 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3371 more obvious *why* one needs to *temporary* unset infoblocks.]
3373 Matched REx position InfoBlocks Comment
3377 Y A)*?Z)*?T x <- O <- I
3378 YA )*?Z)*?T x <- O <- I
3379 YA A)*?Z)*?T x <- O <- I
3380 YAA )*?Z)*?T x <- O <- I
3381 YAA Z)*?T x <- O # Temporary unset I
3384 YAAZ Y(A)*?Z)*?T x <- O
3387 YAAZY (A)*?Z)*?T x <- O
3390 YAAZY A)*?Z)*?T x <- O <- I
3393 YAAZYA )*?Z)*?T x <- O <- I
3396 YAAZYA Z)*?T x <- O # Temporary unset I
3402 YAAZYAZ T x # Temporary unset O
3409 *******************************************************************/
3412 CHECKPOINT cp = PL_savestack_ix;
3413 /* No need to save/restore up to this paren */
3414 I32 parenfloor = scan->flags;
3416 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3418 cc.oldcc = PL_regcc;
3420 /* XXXX Probably it is better to teach regpush to support
3421 parenfloor > PL_regsize... */
3422 if (parenfloor > (I32)*PL_reglastparen)
3423 parenfloor = *PL_reglastparen; /* Pessimization... */
3424 cc.parenfloor = parenfloor;
3426 cc.min = ARG1(scan);
3427 cc.max = ARG2(scan);
3428 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3432 PL_reginput = locinput;
3433 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3435 PL_regcc = cc.oldcc;
3441 * This is really hard to understand, because after we match
3442 * what we're trying to match, we must make sure the rest of
3443 * the REx is going to match for sure, and to do that we have
3444 * to go back UP the parse tree by recursing ever deeper. And
3445 * if it fails, we have to reset our parent's current state
3446 * that we can try again after backing off.
3449 CHECKPOINT cp, lastcp;
3450 CURCUR* cc = PL_regcc;
3451 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3453 n = cc->cur + 1; /* how many we know we matched */
3454 PL_reginput = locinput;
3457 PerlIO_printf(Perl_debug_log,
3458 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3459 REPORT_CODE_OFF+PL_regindent*2, "",
3460 (long)n, (long)cc->min,
3461 (long)cc->max, PTR2UV(cc))
3464 /* If degenerate scan matches "", assume scan done. */
3466 if (locinput == cc->lastloc && n >= cc->min) {
3467 PL_regcc = cc->oldcc;
3471 PerlIO_printf(Perl_debug_log,
3472 "%*s empty match detected, try continuation...\n",
3473 REPORT_CODE_OFF+PL_regindent*2, "")
3475 if (regmatch(cc->next))
3483 /* First just match a string of min scans. */
3487 cc->lastloc = locinput;
3488 if (regmatch(cc->scan))
3491 cc->lastloc = lastloc;
3496 /* Check whether we already were at this position.
3497 Postpone detection until we know the match is not
3498 *that* much linear. */
3499 if (!PL_reg_maxiter) {
3500 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3501 PL_reg_leftiter = PL_reg_maxiter;
3503 if (PL_reg_leftiter-- == 0) {
3504 I32 size = (PL_reg_maxiter + 7)/8;
3505 if (PL_reg_poscache) {
3506 if ((I32)PL_reg_poscache_size < size) {
3507 Renew(PL_reg_poscache, size, char);
3508 PL_reg_poscache_size = size;
3510 Zero(PL_reg_poscache, size, char);
3513 PL_reg_poscache_size = size;
3514 Newz(29, PL_reg_poscache, size, char);
3517 PerlIO_printf(Perl_debug_log,
3518 "%sDetected a super-linear match, switching on caching%s...\n",
3519 PL_colors[4], PL_colors[5])
3522 if (PL_reg_leftiter < 0) {
3523 I32 o = locinput - PL_bostr, b;
3525 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3528 if (PL_reg_poscache[o] & (1<<b)) {
3530 PerlIO_printf(Perl_debug_log,
3531 "%*s already tried at this position...\n",
3532 REPORT_CODE_OFF+PL_regindent*2, "")
3534 if (PL_reg_flags & RF_false)
3539 PL_reg_poscache[o] |= (1<<b);
3543 /* Prefer next over scan for minimal matching. */
3546 PL_regcc = cc->oldcc;
3549 cp = regcppush(cc->parenfloor);
3551 if (regmatch(cc->next)) {
3553 sayYES; /* All done. */
3555 REGCP_UNWIND(lastcp);
3561 if (n >= cc->max) { /* Maximum greed exceeded? */
3562 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3563 && !(PL_reg_flags & RF_warned)) {
3564 PL_reg_flags |= RF_warned;
3565 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3566 "Complex regular subexpression recursion",
3573 PerlIO_printf(Perl_debug_log,
3574 "%*s trying longer...\n",
3575 REPORT_CODE_OFF+PL_regindent*2, "")
3577 /* Try scanning more and see if it helps. */
3578 PL_reginput = locinput;
3580 cc->lastloc = locinput;
3581 cp = regcppush(cc->parenfloor);
3583 if (regmatch(cc->scan)) {
3587 REGCP_UNWIND(lastcp);
3590 cc->lastloc = lastloc;
3594 /* Prefer scan over next for maximal matching. */
3596 if (n < cc->max) { /* More greed allowed? */
3597 cp = regcppush(cc->parenfloor);
3599 cc->lastloc = locinput;
3601 if (regmatch(cc->scan)) {
3605 REGCP_UNWIND(lastcp);
3606 regcppop(); /* Restore some previous $<digit>s? */
3607 PL_reginput = locinput;
3609 PerlIO_printf(Perl_debug_log,
3610 "%*s failed, try continuation...\n",
3611 REPORT_CODE_OFF+PL_regindent*2, "")
3614 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3615 && !(PL_reg_flags & RF_warned)) {
3616 PL_reg_flags |= RF_warned;
3617 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3618 "Complex regular subexpression recursion",
3622 /* Failed deeper matches of scan, so see if this one works. */
3623 PL_regcc = cc->oldcc;
3626 if (regmatch(cc->next))
3632 cc->lastloc = lastloc;
3637 next = scan + ARG(scan);
3640 inner = NEXTOPER(NEXTOPER(scan));
3643 inner = NEXTOPER(scan);
3647 if (OP(next) != c1) /* No choice. */
3648 next = inner; /* Avoid recursion. */
3650 I32 lastparen = *PL_reglastparen;
3652 re_unwind_branch_t *uw;
3654 /* Put unwinding data on stack */
3655 unwind1 = SSNEWt(1,re_unwind_branch_t);
3656 uw = SSPTRt(unwind1,re_unwind_branch_t);
3659 uw->type = ((c1 == BRANCH)
3661 : RE_UNWIND_BRANCHJ);
3662 uw->lastparen = lastparen;
3664 uw->locinput = locinput;
3665 uw->nextchr = nextchr;
3667 uw->regindent = ++PL_regindent;
3670 REGCP_SET(uw->lastcp);
3672 /* Now go into the first branch */
3685 /* We suppose that the next guy does not need
3686 backtracking: in particular, it is of constant non-zero length,
3687 and has no parenths to influence future backrefs. */
3688 ln = ARG1(scan); /* min to match */
3689 n = ARG2(scan); /* max to match */
3690 paren = scan->flags;
3692 if (paren > PL_regsize)
3694 if (paren > (I32)*PL_reglastparen)
3695 *PL_reglastparen = paren;
3697 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3699 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3700 PL_reginput = locinput;
3703 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3705 locinput = PL_reginput;
3706 if (HAS_TEXT(next) || JUMPABLE(next)) {
3707 regnode *text_node = next;
3709 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3711 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3713 if (PL_regkind[(U8)OP(text_node)] == REF) {
3717 else { c1 = (U8)*STRING(text_node); }
3718 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3720 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3721 c2 = PL_fold_locale[c1];
3730 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3731 /* If it could work, try it. */
3733 UCHARAT(PL_reginput) == c1 ||
3734 UCHARAT(PL_reginput) == c2)
3738 PL_regstartp[paren] =
3739 HOPc(PL_reginput, -l) - PL_bostr;
3740 PL_regendp[paren] = PL_reginput - PL_bostr;
3743 PL_regendp[paren] = -1;
3747 REGCP_UNWIND(lastcp);
3749 /* Couldn't or didn't -- move forward. */
3750 PL_reginput = locinput;
3751 if (regrepeat_hard(scan, 1, &l)) {
3753 locinput = PL_reginput;
3760 n = regrepeat_hard(scan, n, &l);
3761 locinput = PL_reginput;
3763 PerlIO_printf(Perl_debug_log,
3764 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3765 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3769 if (HAS_TEXT(next) || JUMPABLE(next)) {
3770 regnode *text_node = next;
3772 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3774 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3776 if (PL_regkind[(U8)OP(text_node)] == REF) {
3780 else { c1 = (U8)*STRING(text_node); }
3782 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3784 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3785 c2 = PL_fold_locale[c1];
3796 /* If it could work, try it. */
3798 UCHARAT(PL_reginput) == c1 ||
3799 UCHARAT(PL_reginput) == c2)
3802 PerlIO_printf(Perl_debug_log,
3803 "%*s trying tail with n=%"IVdf"...\n",
3804 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3808 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3809 PL_regendp[paren] = PL_reginput - PL_bostr;
3812 PL_regendp[paren] = -1;
3816 REGCP_UNWIND(lastcp);
3818 /* Couldn't or didn't -- back up. */
3820 locinput = HOPc(locinput, -l);
3821 PL_reginput = locinput;
3828 paren = scan->flags; /* Which paren to set */
3829 if (paren > PL_regsize)
3831 if (paren > (I32)*PL_reglastparen)
3832 *PL_reglastparen = paren;
3833 ln = ARG1(scan); /* min to match */
3834 n = ARG2(scan); /* max to match */
3835 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3839 ln = ARG1(scan); /* min to match */
3840 n = ARG2(scan); /* max to match */
3841 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3846 scan = NEXTOPER(scan);
3852 scan = NEXTOPER(scan);
3856 * Lookahead to avoid useless match attempts
3857 * when we know what character comes next.
3861 * Used to only do .*x and .*?x, but now it allows
3862 * for )'s, ('s and (?{ ... })'s to be in the way
3863 * of the quantifier and the EXACT-like node. -- japhy
3866 if (HAS_TEXT(next) || JUMPABLE(next)) {
3868 regnode *text_node = next;
3870 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3872 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3874 if (PL_regkind[(U8)OP(text_node)] == REF) {
3876 goto assume_ok_easy;
3878 else { s = (U8*)STRING(text_node); }
3882 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3884 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3885 c2 = PL_fold_locale[c1];
3888 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3889 STRLEN ulen1, ulen2;
3890 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3891 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3893 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3894 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3896 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
3898 0 : UTF8_ALLOW_ANY);
3899 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
3901 0 : UTF8_ALLOW_ANY);
3904 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
3906 0 : UTF8_ALLOW_ANY);
3914 PL_reginput = locinput;
3918 if (ln && regrepeat(scan, ln) < ln)
3920 locinput = PL_reginput;
3923 char *e; /* Should not check after this */
3924 char *old = locinput;
3927 if (n == REG_INFTY) {
3930 while (UTF8_IS_CONTINUATION(*(U8*)e))
3936 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3940 e = locinput + n - ln;
3945 /* Find place 'next' could work */
3948 while (locinput <= e &&
3949 UCHARAT(locinput) != c1)
3952 while (locinput <= e
3953 && UCHARAT(locinput) != c1
3954 && UCHARAT(locinput) != c2)
3957 count = locinput - old;
3962 /* count initialised to
3963 * utf8_distance(old, locinput) */
3964 while (locinput <= e &&
3965 utf8n_to_uvchr((U8*)locinput,
3966 UTF8_MAXBYTES, &len,
3968 0 : UTF8_ALLOW_ANY) != (UV)c1) {
3973 /* count initialised to
3974 * utf8_distance(old, locinput) */
3975 while (locinput <= e) {
3976 UV c = utf8n_to_uvchr((U8*)locinput,
3977 UTF8_MAXBYTES, &len,
3979 0 : UTF8_ALLOW_ANY);
3980 if (c == (UV)c1 || c == (UV)c2)
3989 /* PL_reginput == old now */
3990 if (locinput != old) {
3991 ln = 1; /* Did some */
3992 if (regrepeat(scan, count) < count)
3995 /* PL_reginput == locinput now */
3996 TRYPAREN(paren, ln, locinput);
3997 PL_reginput = locinput; /* Could be reset... */
3998 REGCP_UNWIND(lastcp);
3999 /* Couldn't or didn't -- move forward. */
4002 locinput += UTF8SKIP(locinput);
4009 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
4013 c = utf8n_to_uvchr((U8*)PL_reginput,
4016 0 : UTF8_ALLOW_ANY);
4018 c = UCHARAT(PL_reginput);
4019 /* If it could work, try it. */
4020 if (c == (UV)c1 || c == (UV)c2)
4022 TRYPAREN(paren, ln, PL_reginput);
4023 REGCP_UNWIND(lastcp);
4026 /* If it could work, try it. */
4027 else if (c1 == -1000)
4029 TRYPAREN(paren, ln, PL_reginput);
4030 REGCP_UNWIND(lastcp);
4032 /* Couldn't or didn't -- move forward. */
4033 PL_reginput = locinput;
4034 if (regrepeat(scan, 1)) {
4036 locinput = PL_reginput;
4044 n = regrepeat(scan, n);
4045 locinput = PL_reginput;
4046 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
4047 (OP(next) != MEOL ||
4048 OP(next) == SEOL || OP(next) == EOS))
4050 ln = n; /* why back off? */
4051 /* ...because $ and \Z can match before *and* after
4052 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4053 We should back off by one in this case. */
4054 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4063 c = utf8n_to_uvchr((U8*)PL_reginput,
4066 0 : UTF8_ALLOW_ANY);
4068 c = UCHARAT(PL_reginput);
4070 /* If it could work, try it. */
4071 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
4073 TRYPAREN(paren, n, PL_reginput);
4074 REGCP_UNWIND(lastcp);
4076 /* Couldn't or didn't -- back up. */
4078 PL_reginput = locinput = HOPc(locinput, -1);
4086 c = utf8n_to_uvchr((U8*)PL_reginput,
4089 0 : UTF8_ALLOW_ANY);
4091 c = UCHARAT(PL_reginput);
4093 /* If it could work, try it. */
4094 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
4096 TRYPAREN(paren, n, PL_reginput);
4097 REGCP_UNWIND(lastcp);
4099 /* Couldn't or didn't -- back up. */
4101 PL_reginput = locinput = HOPc(locinput, -1);
4108 if (PL_reg_call_cc) {
4109 re_cc_state *cur_call_cc = PL_reg_call_cc;
4110 CURCUR *cctmp = PL_regcc;
4111 regexp *re = PL_reg_re;
4112 CHECKPOINT cp, lastcp;
4114 cp = regcppush(0); /* Save *all* the positions. */
4116 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
4118 PL_reginput = locinput; /* Make position available to
4120 cache_re(PL_reg_call_cc->re);
4121 PL_regcc = PL_reg_call_cc->cc;
4122 PL_reg_call_cc = PL_reg_call_cc->prev;
4123 if (regmatch(cur_call_cc->node)) {
4124 PL_reg_call_cc = cur_call_cc;
4128 REGCP_UNWIND(lastcp);
4130 PL_reg_call_cc = cur_call_cc;
4136 PerlIO_printf(Perl_debug_log,
4137 "%*s continuation failed...\n",
4138 REPORT_CODE_OFF+PL_regindent*2, "")
4142 if (locinput < PL_regtill) {
4143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4144 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4146 (long)(locinput - PL_reg_starttry),
4147 (long)(PL_regtill - PL_reg_starttry),
4149 sayNO_FINAL; /* Cannot match: too short. */
4151 PL_reginput = locinput; /* put where regtry can find it */
4152 sayYES_FINAL; /* Success! */
4154 PL_reginput = locinput; /* put where regtry can find it */
4155 sayYES_LOUD; /* Success! */
4158 PL_reginput = locinput;
4163 s = HOPBACKc(locinput, scan->flags);
4169 PL_reginput = locinput;
4170 PL_reg_flags ^= RF_false;
4175 s = HOPBACKc(locinput, scan->flags);
4181 PL_reginput = locinput;
4184 inner = NEXTOPER(NEXTOPER(scan));
4185 if (regmatch(inner) != n) {
4187 PL_reg_flags ^= RF_false;
4198 PL_reg_flags ^= RF_false;
4204 if (OP(scan) == SUSPEND) {
4205 locinput = PL_reginput;
4206 nextchr = UCHARAT(locinput);
4211 next = scan + ARG(scan);
4216 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4217 PTR2UV(scan), OP(scan));
4218 Perl_croak(aTHX_ "regexp memory corruption");
4225 * We get here only if there's trouble -- normally "case END" is
4226 * the terminating point.
4228 Perl_croak(aTHX_ "corrupted regexp pointers");
4234 PerlIO_printf(Perl_debug_log,
4235 "%*s %scould match...%s\n",
4236 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
4240 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4241 PL_colors[4],PL_colors[5]));
4247 #if 0 /* Breaks $^R */
4255 PerlIO_printf(Perl_debug_log,
4256 "%*s %sfailed...%s\n",
4257 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
4263 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
4266 case RE_UNWIND_BRANCH:
4267 case RE_UNWIND_BRANCHJ:
4269 re_unwind_branch_t *uwb = &(uw->branch);
4270 I32 lastparen = uwb->lastparen;
4272 REGCP_UNWIND(uwb->lastcp);
4273 for (n = *PL_reglastparen; n > lastparen; n--)
4275 *PL_reglastparen = n;
4276 scan = next = uwb->next;
4278 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
4279 ? BRANCH : BRANCHJ) ) { /* Failure */
4286 /* Have more choice yet. Reuse the same uwb. */
4288 if ((n = (uwb->type == RE_UNWIND_BRANCH
4289 ? NEXT_OFF(next) : ARG(next))))
4292 next = NULL; /* XXXX Needn't unwinding in this case... */
4294 next = NEXTOPER(scan);
4295 if (uwb->type == RE_UNWIND_BRANCHJ)
4296 next = NEXTOPER(next);
4297 locinput = uwb->locinput;
4298 nextchr = uwb->nextchr;
4300 PL_regindent = uwb->regindent;
4307 Perl_croak(aTHX_ "regexp unwind memory corruption");
4318 - regrepeat - repeatedly match something simple, report how many
4321 * [This routine now assumes that it will only match on things of length 1.
4322 * That was true before, but now we assume scan - reginput is the count,
4323 * rather than incrementing count on every character. [Er, except utf8.]]
4326 S_regrepeat(pTHX_ regnode *p, I32 max)
4328 register char *scan;
4330 register char *loceol = PL_regeol;
4331 register I32 hardcount = 0;
4332 register bool do_utf8 = PL_reg_match_utf8;
4335 if (max == REG_INFTY)
4337 else if (max < loceol - scan)
4338 loceol = scan + max;
4343 while (scan < loceol && hardcount < max && *scan != '\n') {
4344 scan += UTF8SKIP(scan);
4348 while (scan < loceol && *scan != '\n')
4355 while (scan < loceol && hardcount < max) {
4356 scan += UTF8SKIP(scan);
4366 case EXACT: /* length of string is 1 */
4368 while (scan < loceol && UCHARAT(scan) == c)
4371 case EXACTF: /* length of string is 1 */
4373 while (scan < loceol &&
4374 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4377 case EXACTFL: /* length of string is 1 */
4378 PL_reg_flags |= RF_tainted;
4380 while (scan < loceol &&
4381 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4387 while (hardcount < max && scan < loceol &&
4388 reginclass(p, (U8*)scan, 0, do_utf8)) {
4389 scan += UTF8SKIP(scan);
4393 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4400 LOAD_UTF8_CHARCLASS(alnum,"a");
4401 while (hardcount < max && scan < loceol &&
4402 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4403 scan += UTF8SKIP(scan);
4407 while (scan < loceol && isALNUM(*scan))
4412 PL_reg_flags |= RF_tainted;
4415 while (hardcount < max && scan < loceol &&
4416 isALNUM_LC_utf8((U8*)scan)) {
4417 scan += UTF8SKIP(scan);
4421 while (scan < loceol && isALNUM_LC(*scan))
4428 LOAD_UTF8_CHARCLASS(alnum,"a");
4429 while (hardcount < max && scan < loceol &&
4430 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4431 scan += UTF8SKIP(scan);
4435 while (scan < loceol && !isALNUM(*scan))
4440 PL_reg_flags |= RF_tainted;
4443 while (hardcount < max && scan < loceol &&
4444 !isALNUM_LC_utf8((U8*)scan)) {
4445 scan += UTF8SKIP(scan);
4449 while (scan < loceol && !isALNUM_LC(*scan))
4456 LOAD_UTF8_CHARCLASS(space," ");
4457 while (hardcount < max && scan < loceol &&
4459 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4460 scan += UTF8SKIP(scan);
4464 while (scan < loceol && isSPACE(*scan))
4469 PL_reg_flags |= RF_tainted;
4472 while (hardcount < max && scan < loceol &&
4473 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4474 scan += UTF8SKIP(scan);
4478 while (scan < loceol && isSPACE_LC(*scan))
4485 LOAD_UTF8_CHARCLASS(space," ");
4486 while (hardcount < max && scan < loceol &&
4488 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4489 scan += UTF8SKIP(scan);
4493 while (scan < loceol && !isSPACE(*scan))
4498 PL_reg_flags |= RF_tainted;
4501 while (hardcount < max && scan < loceol &&
4502 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4503 scan += UTF8SKIP(scan);
4507 while (scan < loceol && !isSPACE_LC(*scan))
4514 LOAD_UTF8_CHARCLASS(digit,"0");
4515 while (hardcount < max && scan < loceol &&
4516 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4517 scan += UTF8SKIP(scan);
4521 while (scan < loceol && isDIGIT(*scan))
4528 LOAD_UTF8_CHARCLASS(digit,"0");
4529 while (hardcount < max && scan < loceol &&
4530 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4531 scan += UTF8SKIP(scan);
4535 while (scan < loceol && !isDIGIT(*scan))
4539 default: /* Called on something of 0 width. */
4540 break; /* So match right here or not at all. */
4546 c = scan - PL_reginput;
4551 SV *prop = sv_newmortal();
4555 PerlIO_printf(Perl_debug_log,
4556 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4557 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
4565 - regrepeat_hard - repeatedly match something, report total lenth and length
4567 * The repeater is supposed to have constant non-zero length.
4571 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4573 register char *scan = Nullch;
4574 register char *start;
4575 register char *loceol = PL_regeol;
4577 I32 count = 0, res = 1;
4582 start = PL_reginput;
4583 if (PL_reg_match_utf8) {
4584 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4587 while (start < PL_reginput) {
4589 start += UTF8SKIP(start);
4600 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4602 *lp = l = PL_reginput - start;
4603 if (max != REG_INFTY && l*max < loceol - scan)
4604 loceol = scan + l*max;
4617 - regclass_swash - prepare the utf8 swash
4621 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4627 if (PL_regdata && PL_regdata->count) {
4630 if (PL_regdata->what[n] == 's') {
4631 SV *rv = (SV*)PL_regdata->data[n];
4632 AV *av = (AV*)SvRV((SV*)rv);
4633 SV **ary = AvARRAY(av);
4636 /* See the end of regcomp.c:S_reglass() for
4637 * documentation of these array elements. */
4640 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4641 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4645 else if (si && doinit) {
4646 sw = swash_init("utf8", "", si, 1, 0);
4647 (void)av_store(av, 1, sw);
4663 - reginclass - determine if a character falls into a character class
4665 The n is the ANYOF regnode, the p is the target string, lenp
4666 is pointer to the maximum length of how far to go in the p
4667 (if the lenp is zero, UTF8SKIP(p) is used),
4668 do_utf8 tells whether the target string is in UTF-8.
4673 S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
4675 char flags = ANYOF_FLAGS(n);
4681 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4682 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4683 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
4685 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4686 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4689 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4690 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4693 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4697 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4700 if (swash_fetch(sw, p, do_utf8))
4702 else if (flags & ANYOF_FOLD) {
4703 if (!match && lenp && av) {
4706 for (i = 0; i <= av_len(av); i++) {
4707 SV* sv = *av_fetch(av, i, FALSE);
4709 char *s = SvPV(sv, len);
4711 if (len <= plen && memEQ(s, (char*)p, len)) {
4719 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4722 to_utf8_fold(p, tmpbuf, &tmplen);
4723 if (swash_fetch(sw, tmpbuf, do_utf8))
4729 if (match && lenp && *lenp == 0)
4730 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4732 if (!match && c < 256) {
4733 if (ANYOF_BITMAP_TEST(n, c))
4735 else if (flags & ANYOF_FOLD) {
4738 if (flags & ANYOF_LOCALE) {
4739 PL_reg_flags |= RF_tainted;
4740 f = PL_fold_locale[c];
4744 if (f != c && ANYOF_BITMAP_TEST(n, f))
4748 if (!match && (flags & ANYOF_CLASS)) {
4749 PL_reg_flags |= RF_tainted;
4751 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4752 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4753 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4754 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4755 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4756 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4757 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4758 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4759 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4760 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4761 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4762 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4763 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4764 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4765 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4766 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4767 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4768 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4769 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4770 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4771 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4772 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4773 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4774 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4775 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4776 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4777 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4778 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4779 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4780 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4781 ) /* How's that for a conditional? */
4788 return (flags & ANYOF_INVERT) ? !match : match;
4792 S_reghop(pTHX_ U8 *s, I32 off)
4794 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4798 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4801 while (off-- && s < lim) {
4802 /* XXX could check well-formedness here */
4810 if (UTF8_IS_CONTINUED(*s)) {
4811 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4814 /* XXX could check well-formedness here */
4822 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4824 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4828 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4831 while (off-- && s < lim) {
4832 /* XXX could check well-formedness here */
4842 if (UTF8_IS_CONTINUED(*s)) {
4843 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4846 /* XXX could check well-formedness here */
4858 restore_pos(pTHX_ void *arg)
4860 if (PL_reg_eval_set) {
4861 if (PL_reg_oldsaved) {
4862 PL_reg_re->subbeg = PL_reg_oldsaved;
4863 PL_reg_re->sublen = PL_reg_oldsavedlen;
4864 #ifdef PERL_COPY_ON_WRITE
4865 PL_reg_re->saved_copy = PL_nrs;
4867 RX_MATCH_COPIED_on(PL_reg_re);
4869 PL_reg_magic->mg_len = PL_reg_oldpos;
4870 PL_reg_eval_set = 0;
4871 PL_curpm = PL_reg_oldcurpm;
4876 S_to_utf8_substr(pTHX_ register regexp *prog)
4879 if (prog->float_substr && !prog->float_utf8) {
4880 prog->float_utf8 = sv = newSVsv(prog->float_substr);
4881 sv_utf8_upgrade(sv);
4882 if (SvTAIL(prog->float_substr))
4884 if (prog->float_substr == prog->check_substr)
4885 prog->check_utf8 = sv;
4887 if (prog->anchored_substr && !prog->anchored_utf8) {
4888 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
4889 sv_utf8_upgrade(sv);
4890 if (SvTAIL(prog->anchored_substr))
4892 if (prog->anchored_substr == prog->check_substr)
4893 prog->check_utf8 = sv;
4898 S_to_byte_substr(pTHX_ register regexp *prog)
4901 if (prog->float_utf8 && !prog->float_substr) {
4902 prog->float_substr = sv = newSVsv(prog->float_utf8);
4903 if (sv_utf8_downgrade(sv, TRUE)) {
4904 if (SvTAIL(prog->float_utf8))
4908 prog->float_substr = sv = &PL_sv_undef;
4910 if (prog->float_utf8 == prog->check_utf8)
4911 prog->check_substr = sv;
4913 if (prog->anchored_utf8 && !prog->anchored_substr) {
4914 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
4915 if (sv_utf8_downgrade(sv, TRUE)) {
4916 if (SvTAIL(prog->anchored_utf8))
4920 prog->anchored_substr = sv = &PL_sv_undef;
4922 if (prog->anchored_utf8 == prog->check_utf8)
4923 prog->check_substr = sv;