5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
38 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
70 #define PERL_IN_REGEXEC_C
73 #ifdef PERL_IN_XSUB_RE
79 #define RF_tainted 1 /* tainted information used? */
80 #define RF_warned 2 /* warned about big count? */
81 #define RF_evaled 4 /* Did an EVAL with setting? */
82 #define RF_utf8 8 /* String contains multibyte chars? */
84 #define UTF ((PL_reg_flags & RF_utf8) != 0)
86 #define RS_init 1 /* eval environment created */
87 #define RS_set 2 /* replsv value is set */
93 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
99 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
100 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
102 #define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
106 #define HOPBACKc(pos, off) \
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
113 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
114 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
116 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
123 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
125 /* for use after a quantifier and before an EXACT-like node -- japhy */
126 #define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
133 #define HAS_TEXT(rn) ( \
134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
141 #define FIND_NEXT_IMPT(rn) STMT_START { \
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
145 rn = NEXTOPER(NEXTOPER(rn)); \
146 else if (type == PLUS) \
148 else if (type == IFMATCH) \
149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
150 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 6
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
171 for (p = PL_regsize; p > parenfloor; p--) {
172 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
175 SSPUSHPTR(PL_reg_start_tmp[p]);
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
187 SSPUSHINT(*PL_reglastcloseparen);
188 SSPUSHPTR(PL_reginput);
189 #define REGCP_FRAME_ELEMS 2
190 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
198 /* These are needed since we do not localize EVAL nodes: */
199 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
203 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
204 PerlIO_printf(Perl_debug_log, \
205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
209 S_regcppop(pTHX_ const regexp *rex)
215 GET_RE_DEBUG_FLAGS_DECL;
217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
221 input = (char *) SSPOPPTR;
222 *PL_reglastcloseparen = SSPOPINT;
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
226 /* Now restore the parentheses context. */
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
230 U32 paren = (U32)SSPOPINT;
231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
232 PL_regstartp[paren] = SSPOPINT;
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
237 PerlIO_printf(Perl_debug_log,
238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
239 (UV)paren, (IV)PL_regstartp[paren],
240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
241 (IV)PL_regendp[paren],
242 (paren > *PL_reglastparen ? "(no)" : ""));
246 if (*PL_reglastparen + 1 <= rex->nparens) {
247 PerlIO_printf(Perl_debug_log,
248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
265 PL_regstartp[i] = -1;
272 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
274 #define TRYPAREN(paren, n, input, where) { \
277 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
278 PL_regendp[paren] = input - PL_bostr; \
281 PL_regendp[paren] = -1; \
283 REGMATCH(next, where); \
287 PL_regendp[paren] = -1; \
292 * pregexec and friends
295 #ifndef PERL_IN_XSUB_RE
297 - pregexec - match a regexp against a string
300 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
301 char *strbeg, I32 minend, SV *screamer, U32 nosave)
302 /* strend: pointer to null at end of string */
303 /* strbeg: real beginning of string */
304 /* minend: end of match must be >=minend after stringarg. */
305 /* nosave: For optimizations. */
308 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
309 nosave ? 0 : REXEC_COPY_STR);
314 * Need to implement the following flags for reg_anch:
316 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
318 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
319 * INTUIT_AUTORITATIVE_ML
320 * INTUIT_ONCE_NOML - Intuit can match in one location only.
323 * Another flag for this function: SECOND_TIME (so that float substrs
324 * with giant delta may be not rechecked).
327 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
329 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
330 Otherwise, only SvCUR(sv) is used to get strbeg. */
332 /* XXXX We assume that strpos is strbeg unless sv. */
334 /* XXXX Some places assume that there is a fixed substring.
335 An update may be needed if optimizer marks as "INTUITable"
336 RExen without fixed substrings. Similarly, it is assumed that
337 lengths of all the strings are no more than minlen, thus they
338 cannot come from lookahead.
339 (Or minlen should take into account lookahead.) */
341 /* A failure to find a constant substring means that there is no need to make
342 an expensive call to REx engine, thus we celebrate a failure. Similarly,
343 finding a substring too deep into the string means that less calls to
344 regtry() should be needed.
346 REx compiler's optimizer found 4 possible hints:
347 a) Anchored substring;
349 c) Whether we are anchored (beginning-of-line or \G);
350 d) First node (of those at offset 0) which may distingush positions;
351 We use a)b)d) and multiline-part of c), and try to find a position in the
352 string which does not contradict any of them.
355 /* Most of decisions we do here should have been done at compile time.
356 The nodes of the REx which we used for the search should have been
357 deleted from the finite automaton. */
360 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
361 char *strend, U32 flags, re_scream_pos_data *data)
364 register I32 start_shift = 0;
365 /* Should be nonnegative! */
366 register I32 end_shift = 0;
371 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
373 register char *other_last = NULL; /* other substr checked before this */
374 char *check_at = NULL; /* check substr found at this pos */
375 const I32 multiline = prog->reganch & PMf_MULTILINE;
377 const char * const i_strpos = strpos;
378 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
381 GET_RE_DEBUG_FLAGS_DECL;
383 RX_MATCH_UTF8_set(prog,do_utf8);
385 if (prog->reganch & ROPT_UTF8) {
386 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
387 "UTF-8 regex...\n"));
388 PL_reg_flags |= RF_utf8;
392 const char *s = PL_reg_match_utf8 ?
393 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
395 const int len = PL_reg_match_utf8 ?
396 (int)strlen(s) : strend - strpos;
399 if (PL_reg_match_utf8)
400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
401 "UTF-8 target...\n"));
402 PerlIO_printf(Perl_debug_log,
403 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
404 PL_colors[4], PL_colors[5], PL_colors[0],
407 (strlen(prog->precomp) > 60 ? "..." : ""),
409 (int)(len > 60 ? 60 : len),
411 (len > 60 ? "..." : "")
415 /* CHR_DIST() would be more correct here but it makes things slow. */
416 if (prog->minlen > strend - strpos) {
417 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
418 "String too short... [re_intuit_start]\n"));
421 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
424 if (!prog->check_utf8 && prog->check_substr)
425 to_utf8_substr(prog);
426 check = prog->check_utf8;
428 if (!prog->check_substr && prog->check_utf8)
429 to_byte_substr(prog);
430 check = prog->check_substr;
432 if (check == &PL_sv_undef) {
433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
434 "Non-utf string cannot match utf check string\n"));
437 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
438 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
439 || ( (prog->reganch & ROPT_ANCH_BOL)
440 && !multiline ) ); /* Check after \n? */
443 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
444 | ROPT_IMPLICIT)) /* not a real BOL */
445 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
447 && (strpos != strbeg)) {
448 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
451 if (prog->check_offset_min == prog->check_offset_max &&
452 !(prog->reganch & ROPT_CANY_SEEN)) {
453 /* Substring at constant offset from beg-of-str... */
456 s = HOP3c(strpos, prog->check_offset_min, strend);
458 slen = SvCUR(check); /* >= 1 */
460 if ( strend - s > slen || strend - s < slen - 1
461 || (strend - s == slen && strend[-1] != '\n')) {
462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
465 /* Now should match s[0..slen-2] */
467 if (slen && (*SvPVX_const(check) != *s
469 && memNE(SvPVX_const(check), s, slen)))) {
471 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
475 else if (*SvPVX_const(check) != *s
476 || ((slen = SvCUR(check)) > 1
477 && memNE(SvPVX_const(check), s, slen)))
480 goto success_at_start;
483 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
485 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
486 end_shift = prog->minlen - start_shift -
487 CHR_SVLEN(check) + (SvTAIL(check) != 0);
489 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
490 - (SvTAIL(check) != 0);
491 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
493 if (end_shift < eshift)
497 else { /* Can match at random position */
500 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
501 /* Should be nonnegative! */
502 end_shift = prog->minlen - start_shift -
503 CHR_SVLEN(check) + (SvTAIL(check) != 0);
506 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
508 Perl_croak(aTHX_ "panic: end_shift");
512 /* Find a possible match in the region s..strend by looking for
513 the "check" substring in the region corrected by start/end_shift. */
514 if (flags & REXEC_SCREAM) {
515 I32 p = -1; /* Internal iterator of scream. */
516 I32 * const pp = data ? data->scream_pos : &p;
518 if (PL_screamfirst[BmRARE(check)] >= 0
519 || ( BmRARE(check) == '\n'
520 && (BmPREVIOUS(check) == SvCUR(check) - 1)
522 s = screaminstr(sv, check,
523 start_shift + (s - strbeg), end_shift, pp, 0);
526 /* we may be pointing at the wrong string */
527 if (s && RX_MATCH_COPIED(prog))
528 s = strbeg + (s - SvPVX_const(sv));
530 *data->scream_olds = s;
532 else if (prog->reganch & ROPT_CANY_SEEN)
533 s = fbm_instr((U8*)(s + start_shift),
534 (U8*)(strend - end_shift),
535 check, multiline ? FBMrf_MULTILINE : 0);
537 s = fbm_instr(HOP3(s, start_shift, strend),
538 HOP3(strend, -end_shift, strbeg),
539 check, multiline ? FBMrf_MULTILINE : 0);
541 /* Update the count-of-usability, remove useless subpatterns,
544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
545 (s ? "Found" : "Did not find"),
546 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
548 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
550 PL_colors[1], (SvTAIL(check) ? "$" : ""),
551 (s ? " at offset " : "...\n") ) );
558 /* Finish the diagnostic message */
559 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
561 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
562 Start with the other substr.
563 XXXX no SCREAM optimization yet - and a very coarse implementation
564 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
565 *always* match. Probably should be marked during compile...
566 Probably it is right to do no SCREAM here...
569 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
570 /* Take into account the "other" substring. */
571 /* XXXX May be hopelessly wrong for UTF... */
574 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
577 char * const last = HOP3c(s, -start_shift, strbeg);
579 char * const saved_s = s;
582 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
585 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
590 t = HOP3c(t, prog->anchored_offset, strend);
591 if (t < other_last) /* These positions already checked */
593 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
596 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
597 /* On end-of-str: see comment below. */
598 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
599 if (must == &PL_sv_undef) {
601 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
606 HOP3(HOP3(last1, prog->anchored_offset, strend)
607 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
609 multiline ? FBMrf_MULTILINE : 0
611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
612 "%s anchored substr \"%s%.*s%s\"%s",
613 (s ? "Found" : "Contradicts"),
616 - (SvTAIL(must)!=0)),
618 PL_colors[1], (SvTAIL(must) ? "$" : "")));
620 if (last1 >= last2) {
621 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
622 ", giving up...\n"));
625 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
626 ", trying floating at offset %ld...\n",
627 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
628 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
629 s = HOP3c(last, 1, strend);
633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
634 (long)(s - i_strpos)));
635 t = HOP3c(s, -prog->anchored_offset, strbeg);
636 other_last = HOP3c(s, 1, strend);
644 else { /* Take into account the floating substring. */
646 char * const saved_s = s;
649 t = HOP3c(s, -start_shift, strbeg);
651 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
652 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
653 last = HOP3c(t, prog->float_max_offset, strend);
654 s = HOP3c(t, prog->float_min_offset, strend);
657 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
658 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
659 /* fbm_instr() takes into account exact value of end-of-str
660 if the check is SvTAIL(ed). Since false positives are OK,
661 and end-of-str is not later than strend we are OK. */
662 if (must == &PL_sv_undef) {
664 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
667 s = fbm_instr((unsigned char*)s,
668 (unsigned char*)last + SvCUR(must)
670 must, multiline ? FBMrf_MULTILINE : 0);
671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
672 (s ? "Found" : "Contradicts"),
674 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
676 PL_colors[1], (SvTAIL(must) ? "$" : "")));
679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
680 ", giving up...\n"));
683 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
684 ", trying anchored starting at offset %ld...\n",
685 (long)(saved_s + 1 - i_strpos)));
687 s = HOP3c(t, 1, strend);
691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
692 (long)(s - i_strpos)));
693 other_last = s; /* Fix this later. --Hugo */
702 t = s - prog->check_offset_max;
703 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
705 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
707 /* Fixed substring is found far enough so that the match
708 cannot start at strpos. */
710 if (ml_anch && t[-1] != '\n') {
711 /* Eventually fbm_*() should handle this, but often
712 anchored_offset is not 0, so this check will not be wasted. */
713 /* XXXX In the code below we prefer to look for "^" even in
714 presence of anchored substrings. And we search even
715 beyond the found float position. These pessimizations
716 are historical artefacts only. */
718 while (t < strend - prog->minlen) {
720 if (t < check_at - prog->check_offset_min) {
721 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
722 /* Since we moved from the found position,
723 we definitely contradict the found anchored
724 substr. Due to the above check we do not
725 contradict "check" substr.
726 Thus we can arrive here only if check substr
727 is float. Redo checking for "other"=="fixed".
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
731 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
732 goto do_other_anchored;
734 /* We don't contradict the found floating substring. */
735 /* XXXX Why not check for STCLASS? */
737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
738 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
741 /* Position contradicts check-string */
742 /* XXXX probably better to look for check-string
743 than for "\n", so one should lower the limit for t? */
744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
745 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
746 other_last = strpos = s = t + 1;
751 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
752 PL_colors[0], PL_colors[1]));
756 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
757 PL_colors[0], PL_colors[1]));
761 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
764 /* The found string does not prohibit matching at strpos,
765 - no optimization of calling REx engine can be performed,
766 unless it was an MBOL and we are not after MBOL,
767 or a future STCLASS check will fail this. */
769 /* Even in this situation we may use MBOL flag if strpos is offset
770 wrt the start of the string. */
771 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
772 && (strpos != strbeg) && strpos[-1] != '\n'
773 /* May be due to an implicit anchor of m{.*foo} */
774 && !(prog->reganch & ROPT_IMPLICIT))
779 DEBUG_EXECUTE_r( if (ml_anch)
780 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
781 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
784 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
786 prog->check_utf8 /* Could be deleted already */
787 && --BmUSEFUL(prog->check_utf8) < 0
788 && (prog->check_utf8 == prog->float_utf8)
790 prog->check_substr /* Could be deleted already */
791 && --BmUSEFUL(prog->check_substr) < 0
792 && (prog->check_substr == prog->float_substr)
795 /* If flags & SOMETHING - do not do it many times on the same match */
796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
797 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
798 if (do_utf8 ? prog->check_substr : prog->check_utf8)
799 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
800 prog->check_substr = prog->check_utf8 = NULL; /* disable */
801 prog->float_substr = prog->float_utf8 = NULL; /* clear */
802 check = NULL; /* abort */
804 /* XXXX This is a remnant of the old implementation. It
805 looks wasteful, since now INTUIT can use many
807 prog->reganch &= ~RE_USE_INTUIT;
814 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
815 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
816 /* minlen == 0 is possible if regstclass is \b or \B,
817 and the fixed substr is ''$.
818 Since minlen is already taken into account, s+1 is before strend;
819 accidentally, minlen >= 1 guaranties no false positives at s + 1
820 even for \b or \B. But (minlen? 1 : 0) below assumes that
821 regstclass does not come from lookahead... */
822 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
823 This leaves EXACTF only, which is dealt with in find_byclass(). */
824 const U8* const str = (U8*)STRING(prog->regstclass);
825 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
826 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
828 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
829 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
830 : (prog->float_substr || prog->float_utf8
831 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
834 /*if (OP(prog->regstclass) == TRIE)
837 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
840 const char *what = NULL;
842 if (endpos == strend) {
843 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
844 "Could not match STCLASS...\n") );
847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
848 "This position contradicts STCLASS...\n") );
849 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
851 /* Contradict one of substrings */
852 if (prog->anchored_substr || prog->anchored_utf8) {
853 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
854 DEBUG_EXECUTE_r( what = "anchored" );
856 s = HOP3c(t, 1, strend);
857 if (s + start_shift + end_shift > strend) {
858 /* XXXX Should be taken into account earlier? */
859 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
860 "Could not match STCLASS...\n") );
865 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
866 "Looking for %s substr starting at offset %ld...\n",
867 what, (long)(s + start_shift - i_strpos)) );
870 /* Have both, check_string is floating */
871 if (t + start_shift >= check_at) /* Contradicts floating=check */
872 goto retry_floating_check;
873 /* Recheck anchored substring, but not floating... */
877 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
878 "Looking for anchored substr starting at offset %ld...\n",
879 (long)(other_last - i_strpos)) );
880 goto do_other_anchored;
882 /* Another way we could have checked stclass at the
883 current position only: */
888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
889 "Looking for /%s^%s/m starting at offset %ld...\n",
890 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
893 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
895 /* Check is floating subtring. */
896 retry_floating_check:
897 t = check_at - start_shift;
898 DEBUG_EXECUTE_r( what = "floating" );
899 goto hop_and_restart;
902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
903 "By STCLASS: moving %ld --> %ld\n",
904 (long)(t - i_strpos), (long)(s - i_strpos))
908 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
909 "Does not contradict STCLASS...\n");
914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
915 PL_colors[4], (check ? "Guessed" : "Giving up"),
916 PL_colors[5], (long)(s - i_strpos)) );
919 fail_finish: /* Substring not found */
920 if (prog->check_substr || prog->check_utf8) /* could be removed already */
921 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
924 PL_colors[4], PL_colors[5]));
928 /* We know what class REx starts with. Try to find this position... */
929 /* if reginfo is NULL, its a dryrun */
930 /* annoyingly all the vars in this routine have different names from their counterparts
931 in regmatch. /grrr */
934 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
935 const char *strend, const regmatch_info *reginfo)
938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
942 register STRLEN uskip;
946 register I32 tmp = 1; /* Scratch variable? */
947 register const bool do_utf8 = PL_reg_match_utf8;
949 /* We know what class it must start with. */
953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
958 if (tmp && (!reginfo || regtry(reginfo, s)))
972 if (REGINCLASS(prog, c, (U8*)s) ||
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
977 if (tmp && (!reginfo || regtry(reginfo, s)))
990 if (tmp && (!reginfo || regtry(reginfo, s)))
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1002 STRLEN ulen1, ulen2;
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1016 while (sm < ((U8 *) m + ln)) {
1031 c2 = PL_fold_locale[c1];
1033 e = HOP3c(strend, -((I32)lnc), s);
1035 if (!reginfo && e < s)
1036 e = s; /* Due to minlen logic of intuit() */
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
1051 U8 tmpbuf [UTF8_MAXBYTES+1];
1052 STRLEN len, foldlen;
1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1062 ibcmp_utf8(s, NULL, 0, do_utf8,
1063 m, NULL, ln, (bool)UTF))
1064 && (!reginfo || regtry(reginfo, s)) )
1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1068 uvchr_to_utf8(tmpbuf, c);
1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
1073 !ibcmp_utf8((char *) foldbuf,
1074 NULL, foldlen, do_utf8,
1076 NULL, ln, (bool)UTF))
1077 && (!reginfo || regtry(reginfo, s)) )
1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1088 /* Handle some of the three Greek sigmas cases.
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1099 if ( (c == c1 || c == c2)
1101 ibcmp_utf8(s, NULL, 0, do_utf8,
1102 m, NULL, ln, (bool)UTF))
1103 && (!reginfo || regtry(reginfo, s)) )
1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1107 uvchr_to_utf8(tmpbuf, c);
1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
1112 !ibcmp_utf8((char *) foldbuf,
1113 NULL, foldlen, do_utf8,
1115 NULL, ln, (bool)UTF))
1116 && (!reginfo || regtry(reginfo, s)) )
1127 && (ln == 1 || !(OP(c) == EXACTF
1129 : ibcmp_locale(s, m, ln)))
1130 && (!reginfo || regtry(reginfo, s)) )
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1139 : ibcmp_locale(s, m, ln)))
1140 && (!reginfo || regtry(reginfo, s)) )
1147 PL_reg_flags |= RF_tainted;
1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1157 tmp = ((OP(c) == BOUND ?
1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1159 LOAD_UTF8_CHARCLASS_ALNUM();
1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1161 if (tmp == !(OP(c) == BOUND ?
1162 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1163 isALNUM_LC_utf8((U8*)s)))
1166 if ((!reginfo || regtry(reginfo, s)))
1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1179 if ((!reginfo || regtry(reginfo, s)))
1185 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1189 PL_reg_flags |= RF_tainted;
1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1199 tmp = ((OP(c) == NBOUND ?
1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1201 LOAD_UTF8_CHARCLASS_ALNUM();
1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1203 if (tmp == !(OP(c) == NBOUND ?
1204 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1205 isALNUM_LC_utf8((U8*)s)))
1207 else if ((!reginfo || regtry(reginfo, s)))
1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1220 else if ((!reginfo || regtry(reginfo, s)))
1225 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1230 LOAD_UTF8_CHARCLASS_ALNUM();
1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1233 if (tmp && (!reginfo || regtry(reginfo, s)))
1244 while (s < strend) {
1246 if (tmp && (!reginfo || regtry(reginfo, s)))
1258 PL_reg_flags |= RF_tainted;
1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1261 if (isALNUM_LC_utf8((U8*)s)) {
1262 if (tmp && (!reginfo || regtry(reginfo, s)))
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
1275 if (tmp && (!reginfo || regtry(reginfo, s)))
1288 LOAD_UTF8_CHARCLASS_ALNUM();
1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1291 if (tmp && (!reginfo || regtry(reginfo, s)))
1302 while (s < strend) {
1304 if (tmp && (!reginfo || regtry(reginfo, s)))
1316 PL_reg_flags |= RF_tainted;
1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1319 if (!isALNUM_LC_utf8((U8*)s)) {
1320 if (tmp && (!reginfo || regtry(reginfo, s)))
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
1333 if (tmp && (!reginfo || regtry(reginfo, s)))
1346 LOAD_UTF8_CHARCLASS_SPACE();
1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1349 if (tmp && (!reginfo || regtry(reginfo, s)))
1360 while (s < strend) {
1362 if (tmp && (!reginfo || regtry(reginfo, s)))
1374 PL_reg_flags |= RF_tainted;
1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1378 if (tmp && (!reginfo || regtry(reginfo, s)))
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
1391 if (tmp && (!reginfo || regtry(reginfo, s)))
1404 LOAD_UTF8_CHARCLASS_SPACE();
1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1407 if (tmp && (!reginfo || regtry(reginfo, s)))
1418 while (s < strend) {
1420 if (tmp && (!reginfo || regtry(reginfo, s)))
1432 PL_reg_flags |= RF_tainted;
1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1436 if (tmp && (!reginfo || regtry(reginfo, s)))
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
1449 if (tmp && (!reginfo || regtry(reginfo, s)))
1462 LOAD_UTF8_CHARCLASS_DIGIT();
1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1465 if (tmp && (!reginfo || regtry(reginfo, s)))
1476 while (s < strend) {
1478 if (tmp && (!reginfo || regtry(reginfo, s)))
1490 PL_reg_flags |= RF_tainted;
1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1493 if (isDIGIT_LC_utf8((U8*)s)) {
1494 if (tmp && (!reginfo || regtry(reginfo, s)))
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
1507 if (tmp && (!reginfo || regtry(reginfo, s)))
1520 LOAD_UTF8_CHARCLASS_DIGIT();
1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1523 if (tmp && (!reginfo || regtry(reginfo, s)))
1534 while (s < strend) {
1536 if (tmp && (!reginfo || regtry(reginfo, s)))
1548 PL_reg_flags |= RF_tainted;
1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1551 if (!isDIGIT_LC_utf8((U8*)s)) {
1552 if (tmp && (!reginfo || regtry(reginfo, s)))
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
1565 if (tmp && (!reginfo || regtry(reginfo, s)))
1577 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1579 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1580 trie_type = do_utf8 ?
1581 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1583 /* what trie are we using right now */
1585 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1586 reg_trie_data *trie=aho->trie;
1588 const char *last_start = strend - trie->minlen;
1589 const char *real_start = s;
1590 STRLEN maxlen = trie->maxlen;
1592 U8 **points; /* map of where we were in the input string
1593 when reading a given string. For ASCII this
1594 is unnecessary overhead as the relationship
1595 is always 1:1, but for unicode, especially
1596 case folded unicode this is not true. */
1597 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1599 GET_RE_DEBUG_FLAGS_DECL;
1601 /* We can't just allocate points here. We need to wrap it in
1602 * an SV so it gets freed properly if there is a croak while
1603 * running the match */
1606 sv_points=newSV(maxlen * sizeof(U8 *));
1607 SvCUR_set(sv_points,
1608 maxlen * sizeof(U8 *));
1609 SvPOK_on(sv_points);
1610 sv_2mortal(sv_points);
1611 points=(U8**)SvPV_nolen(sv_points );
1613 if (trie->bitmap && trie_type != trie_utf8_fold) {
1614 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
1619 while (s <= last_start) {
1620 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1628 U8 *uscan = (U8*)NULL;
1629 U8 *leftmost = NULL;
1633 while ( state && uc <= (U8*)strend ) {
1635 if (aho->states[ state ].wordnum) {
1636 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1637 if (!leftmost || lpos < leftmost)
1641 points[pointpos++ % maxlen]= uc;
1642 switch (trie_type) {
1643 case trie_utf8_fold:
1645 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1650 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1651 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1652 foldlen -= UNISKIP( uvc );
1653 uscan = foldbuf + UNISKIP( uvc );
1657 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1666 charid = trie->charmap[ uvc ];
1670 if (trie->widecharmap) {
1671 SV** const svpp = hv_fetch(trie->widecharmap,
1672 (char*)&uvc, sizeof(UV), 0);
1674 charid = (U16)SvIV(*svpp);
1677 DEBUG_TRIE_EXECUTE_r(
1678 PerlIO_printf(Perl_debug_log,
1679 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1680 (int)((const char*)uc - real_start), charid, uvc)
1685 U32 word = aho->states[ state ].wordnum;
1686 base = aho->states[ state ].trans.base;
1688 DEBUG_TRIE_EXECUTE_r(
1689 PerlIO_printf( Perl_debug_log,
1690 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1691 failed ? "Fail transition to " : "",
1692 state, base, uvc, word)
1697 (base + charid > trie->uniquecharcount )
1698 && (base + charid - 1 - trie->uniquecharcount
1700 && trie->trans[base + charid - 1 -
1701 trie->uniquecharcount].check == state
1702 && (tmp=trie->trans[base + charid - 1 -
1703 trie->uniquecharcount ].next))
1713 state = aho->fail[state];
1717 /* we must be accepting here */
1725 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1726 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
1732 if ( aho->states[ state ].wordnum ) {
1733 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1734 if (!leftmost || lpos < leftmost)
1737 DEBUG_TRIE_EXECUTE_r(
1738 PerlIO_printf( Perl_debug_log,
1739 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1744 s = (char*)leftmost;
1745 if (!reginfo || regtry(reginfo, s)) {
1760 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1769 - regexec_flags - match a regexp against a string
1772 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1773 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1774 /* strend: pointer to null at end of string */
1775 /* strbeg: real beginning of string */
1776 /* minend: end of match must be >=minend after stringarg. */
1777 /* data: May be used for some additional optimizations. */
1778 /* nosave: For optimizations. */
1782 register regnode *c;
1783 register char *startpos = stringarg;
1784 I32 minlen; /* must match at least this many chars */
1785 I32 dontbother = 0; /* how many characters not to try at end */
1786 I32 end_shift = 0; /* Same for the end. */ /* CC */
1787 I32 scream_pos = -1; /* Internal iterator of scream. */
1788 char *scream_olds = NULL;
1789 SV* const oreplsv = GvSV(PL_replgv);
1790 const bool do_utf8 = DO_UTF8(sv);
1796 regmatch_info reginfo; /* create some info to pass to regtry etc */
1798 GET_RE_DEBUG_FLAGS_DECL;
1800 PERL_UNUSED_ARG(data);
1802 /* Be paranoid... */
1803 if (prog == NULL || startpos == NULL) {
1804 Perl_croak(aTHX_ "NULL regexp parameter");
1808 multiline = prog->reganch & PMf_MULTILINE;
1809 reginfo.prog = prog;
1812 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1813 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1816 RX_MATCH_UTF8_set(prog, do_utf8);
1818 minlen = prog->minlen;
1819 if (strend - startpos < minlen) {
1820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1821 "String too short [regexec_flags]...\n"));
1825 /* Check validity of program. */
1826 if (UCHARAT(prog->program) != REG_MAGIC) {
1827 Perl_croak(aTHX_ "corrupted regexp program");
1831 PL_reg_eval_set = 0;
1834 if (prog->reganch & ROPT_UTF8)
1835 PL_reg_flags |= RF_utf8;
1837 /* Mark beginning of line for ^ and lookbehind. */
1838 reginfo.bol = startpos; /* XXX not used ??? */
1842 /* Mark end of line for $ (and such) */
1845 /* see how far we have to get to not match where we matched before */
1846 reginfo.till = startpos+minend;
1848 /* If there is a "must appear" string, look for it. */
1851 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1854 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1855 reginfo.ganch = startpos;
1856 else if (sv && SvTYPE(sv) >= SVt_PVMG
1858 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1859 && mg->mg_len >= 0) {
1860 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1861 if (prog->reganch & ROPT_ANCH_GPOS) {
1862 if (s > reginfo.ganch)
1867 else /* pos() not defined */
1868 reginfo.ganch = strbeg;
1871 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1872 re_scream_pos_data d;
1874 d.scream_olds = &scream_olds;
1875 d.scream_pos = &scream_pos;
1876 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1879 goto phooey; /* not present */
1884 const char * const s0 = UTF
1885 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1888 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1889 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1890 UNI_DISPLAY_REGEX) : startpos;
1891 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
1894 PerlIO_printf(Perl_debug_log,
1895 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1896 PL_colors[4], PL_colors[5], PL_colors[0],
1899 len0 > 60 ? "..." : "",
1901 (int)(len1 > 60 ? 60 : len1),
1903 (len1 > 60 ? "..." : "")
1907 /* Simplest case: anchored match need be tried only once. */
1908 /* [unless only anchor is BOL and multiline is set] */
1909 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1910 if (s == startpos && regtry(®info, startpos))
1912 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1913 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1918 dontbother = minlen - 1;
1919 end = HOP3c(strend, -dontbother, strbeg) - 1;
1920 /* for multiline we only have to try after newlines */
1921 if (prog->check_substr || prog->check_utf8) {
1925 if (regtry(®info, s))
1930 if (prog->reganch & RE_USE_INTUIT) {
1931 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1942 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1943 if (regtry(®info, s))
1950 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1951 if (regtry(®info, reginfo.ganch))
1956 /* Messy cases: unanchored match. */
1957 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1958 /* we have /x+whatever/ */
1959 /* it must be a one character string (XXXX Except UTF?) */
1964 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1965 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1966 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1969 while (s < strend) {
1971 DEBUG_EXECUTE_r( did_match = 1 );
1972 if (regtry(®info, s)) goto got_it;
1974 while (s < strend && *s == ch)
1981 while (s < strend) {
1983 DEBUG_EXECUTE_r( did_match = 1 );
1984 if (regtry(®info, s)) goto got_it;
1986 while (s < strend && *s == ch)
1992 DEBUG_EXECUTE_r(if (!did_match)
1993 PerlIO_printf(Perl_debug_log,
1994 "Did not find anchored character...\n")
1997 else if (prog->anchored_substr != NULL
1998 || prog->anchored_utf8 != NULL
1999 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2000 && prog->float_max_offset < strend - s)) {
2005 char *last1; /* Last position checked before */
2009 if (prog->anchored_substr || prog->anchored_utf8) {
2010 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2011 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2012 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2013 back_max = back_min = prog->anchored_offset;
2015 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2016 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2017 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2018 back_max = prog->float_max_offset;
2019 back_min = prog->float_min_offset;
2021 if (must == &PL_sv_undef)
2022 /* could not downgrade utf8 check substring, so must fail */
2025 last = HOP3c(strend, /* Cannot start after this */
2026 -(I32)(CHR_SVLEN(must)
2027 - (SvTAIL(must) != 0) + back_min), strbeg);
2030 last1 = HOPc(s, -1);
2032 last1 = s - 1; /* bogus */
2034 /* XXXX check_substr already used to find "s", can optimize if
2035 check_substr==must. */
2037 dontbother = end_shift;
2038 strend = HOPc(strend, -dontbother);
2039 while ( (s <= last) &&
2040 ((flags & REXEC_SCREAM)
2041 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
2042 end_shift, &scream_pos, 0))
2043 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
2044 (unsigned char*)strend, must,
2045 multiline ? FBMrf_MULTILINE : 0))) ) {
2046 /* we may be pointing at the wrong string */
2047 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
2048 s = strbeg + (s - SvPVX_const(sv));
2049 DEBUG_EXECUTE_r( did_match = 1 );
2050 if (HOPc(s, -back_max) > last1) {
2051 last1 = HOPc(s, -back_min);
2052 s = HOPc(s, -back_max);
2055 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2057 last1 = HOPc(s, -back_min);
2061 while (s <= last1) {
2062 if (regtry(®info, s))
2068 while (s <= last1) {
2069 if (regtry(®info, s))
2075 DEBUG_EXECUTE_r(if (!did_match)
2076 PerlIO_printf(Perl_debug_log,
2077 "Did not find %s substr \"%s%.*s%s\"%s...\n",
2078 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2079 ? "anchored" : "floating"),
2081 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
2083 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2087 else if ((c = prog->regstclass)) {
2089 const OPCODE op = OP(prog->regstclass);
2090 /* don't bother with what can't match */
2091 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
2092 strend = HOPc(strend, -(minlen - 1));
2095 SV * const prop = sv_newmortal();
2101 regprop(prog, prop, c);
2103 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
2104 UNI_DISPLAY_REGEX) :
2106 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2108 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
2109 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
2110 PerlIO_printf(Perl_debug_log,
2111 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
2113 len1, len1, s1, (int)(strend - s));
2115 if (find_byclass(prog, c, s, strend, ®info))
2117 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2121 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2126 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2127 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2128 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2130 if (flags & REXEC_SCREAM) {
2131 last = screaminstr(sv, float_real, s - strbeg,
2132 end_shift, &scream_pos, 1); /* last one */
2134 last = scream_olds; /* Only one occurrence. */
2135 /* we may be pointing at the wrong string */
2136 else if (RX_MATCH_COPIED(prog))
2137 s = strbeg + (s - SvPVX_const(sv));
2141 const char * const little = SvPV_const(float_real, len);
2143 if (SvTAIL(float_real)) {
2144 if (memEQ(strend - len + 1, little, len - 1))
2145 last = strend - len + 1;
2146 else if (!multiline)
2147 last = memEQ(strend - len, little, len)
2148 ? strend - len : NULL;
2154 last = rninstr(s, strend, little, little + len);
2156 last = strend; /* matching "$" */
2160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2161 "%sCan't trim the tail, match fails (should not happen)%s\n",
2162 PL_colors[4], PL_colors[5]));
2163 goto phooey; /* Should not happen! */
2165 dontbother = strend - last + prog->float_min_offset;
2167 if (minlen && (dontbother < minlen))
2168 dontbother = minlen - 1;
2169 strend -= dontbother; /* this one's always in bytes! */
2170 /* We don't know much -- general case. */
2173 if (regtry(®info, s))
2182 if (regtry(®info, s))
2184 } while (s++ < strend);
2192 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2194 if (PL_reg_eval_set) {
2195 /* Preserve the current value of $^R */
2196 if (oreplsv != GvSV(PL_replgv))
2197 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2198 restored, the value remains
2200 restore_pos(aTHX_ prog);
2203 /* make sure $`, $&, $', and $digit will work later */
2204 if ( !(flags & REXEC_NOT_FIRST) ) {
2205 RX_MATCH_COPY_FREE(prog);
2206 if (flags & REXEC_COPY_STR) {
2207 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2208 #ifdef PERL_OLD_COPY_ON_WRITE
2210 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2212 PerlIO_printf(Perl_debug_log,
2213 "Copy on write: regexp capture, type %d\n",
2216 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2217 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2218 assert (SvPOKp(prog->saved_copy));
2222 RX_MATCH_COPIED_on(prog);
2223 s = savepvn(strbeg, i);
2229 prog->subbeg = strbeg;
2230 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2237 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2238 PL_colors[4], PL_colors[5]));
2239 if (PL_reg_eval_set)
2240 restore_pos(aTHX_ prog);
2245 - regtry - try match at specific point
2247 STATIC I32 /* 0 failure, 1 success */
2248 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2254 regexp *prog = reginfo->prog;
2255 GET_RE_DEBUG_FLAGS_DECL;
2258 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2260 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2263 PL_reg_eval_set = RS_init;
2264 DEBUG_EXECUTE_r(DEBUG_s(
2265 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2266 (IV)(PL_stack_sp - PL_stack_base));
2268 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2269 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2270 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2272 /* Apparently this is not needed, judging by wantarray. */
2273 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2274 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2277 /* Make $_ available to executed code. */
2278 if (reginfo->sv != DEFSV) {
2280 DEFSV = reginfo->sv;
2283 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2284 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2285 /* prepare for quick setting of pos */
2286 #ifdef PERL_OLD_COPY_ON_WRITE
2288 sv_force_normal_flags(sv, 0);
2290 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2291 &PL_vtbl_mglob, NULL, 0);
2295 PL_reg_oldpos = mg->mg_len;
2296 SAVEDESTRUCTOR_X(restore_pos, prog);
2298 if (!PL_reg_curpm) {
2299 Newxz(PL_reg_curpm, 1, PMOP);
2302 SV* const repointer = newSViv(0);
2303 /* so we know which PL_regex_padav element is PL_reg_curpm */
2304 SvFLAGS(repointer) |= SVf_BREAK;
2305 av_push(PL_regex_padav,repointer);
2306 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2307 PL_regex_pad = AvARRAY(PL_regex_padav);
2311 PM_SETRE(PL_reg_curpm, prog);
2312 PL_reg_oldcurpm = PL_curpm;
2313 PL_curpm = PL_reg_curpm;
2314 if (RX_MATCH_COPIED(prog)) {
2315 /* Here is a serious problem: we cannot rewrite subbeg,
2316 since it may be needed if this match fails. Thus
2317 $` inside (?{}) could fail... */
2318 PL_reg_oldsaved = prog->subbeg;
2319 PL_reg_oldsavedlen = prog->sublen;
2320 #ifdef PERL_OLD_COPY_ON_WRITE
2321 PL_nrs = prog->saved_copy;
2323 RX_MATCH_COPIED_off(prog);
2326 PL_reg_oldsaved = NULL;
2327 prog->subbeg = PL_bostr;
2328 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2330 prog->startp[0] = startpos - PL_bostr;
2331 PL_reginput = startpos;
2332 PL_regstartp = prog->startp;
2333 PL_regendp = prog->endp;
2334 PL_reglastparen = &prog->lastparen;
2335 PL_reglastcloseparen = &prog->lastcloseparen;
2336 prog->lastparen = 0;
2337 prog->lastcloseparen = 0;
2339 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2340 if (PL_reg_start_tmpl <= prog->nparens) {
2341 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2342 if(PL_reg_start_tmp)
2343 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2345 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2348 /* XXXX What this code is doing here?!!! There should be no need
2349 to do this again and again, PL_reglastparen should take care of
2352 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2353 * Actually, the code in regcppop() (which Ilya may be meaning by
2354 * PL_reglastparen), is not needed at all by the test suite
2355 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2356 * enough, for building DynaLoader, or otherwise this
2357 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2358 * will happen. Meanwhile, this code *is* needed for the
2359 * above-mentioned test suite tests to succeed. The common theme
2360 * on those tests seems to be returning null fields from matches.
2365 if (prog->nparens) {
2367 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2374 if (regmatch(reginfo, prog->program + 1)) {
2375 prog->endp[0] = PL_reginput - PL_bostr;
2378 REGCP_UNWIND(lastcp);
2383 #define sayYES goto yes
2384 #define sayNO goto no
2385 #define sayNO_ANYOF goto no_anyof
2386 #define sayYES_FINAL goto yes_final
2387 #define sayNO_FINAL goto no_final
2388 #define sayNO_SILENT goto do_no
2389 #define saySAME(x) if (x) goto yes; else goto no
2391 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2392 #define POSCACHE_SEEN 1 /* we know what we're caching */
2393 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2395 #define CACHEsayYES STMT_START { \
2396 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2397 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2398 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2399 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2401 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2402 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2405 /* cache records failure, but this is success */ \
2407 PerlIO_printf(Perl_debug_log, \
2408 "%*s (remove success from failure cache)\n", \
2409 REPORT_CODE_OFF+PL_regindent*2, "") \
2411 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2417 #define CACHEsayNO STMT_START { \
2418 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2419 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2420 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2421 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2423 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2424 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2427 /* cache records success, but this is failure */ \
2429 PerlIO_printf(Perl_debug_log, \
2430 "%*s (remove failure from success cache)\n", \
2431 REPORT_CODE_OFF+PL_regindent*2, "") \
2433 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2439 /* this is used to determine how far from the left messages like
2440 'failed...' are printed. Currently 29 makes these messages line
2441 up with the opcode they refer to. Earlier perls used 25 which
2442 left these messages outdented making reviewing a debug output
2445 #define REPORT_CODE_OFF 29
2448 /* Make sure there is a test for this +1 options in re_tests */
2449 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2451 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2452 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2454 #define SLAB_FIRST(s) (&(s)->states[0])
2455 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2457 /* grab a new slab and return the first slot in it */
2459 STATIC regmatch_state *
2462 #if PERL_VERSION < 9
2465 regmatch_slab *s = PL_regmatch_slab->next;
2467 Newx(s, 1, regmatch_slab);
2468 s->prev = PL_regmatch_slab;
2470 PL_regmatch_slab->next = s;
2472 PL_regmatch_slab = s;
2473 return SLAB_FIRST(s);
2476 /* simulate a recursive call to regmatch */
2478 #define REGMATCH(ns, where) \
2481 st->resume_state = resume_##where; \
2482 goto start_recurse; \
2483 resume_point_##where:
2485 /* push a new state then goto it */
2487 #define PUSH_STATE_GOTO(state, node) \
2489 st->resume_state = state; \
2492 /* push a new state with success backtracking, then goto it */
2494 #define PUSH_YES_STATE_GOTO(state, node) \
2496 st->resume_state = state; \
2497 goto push_yes_state;
2502 - regmatch - main matching routine
2504 * Conceptually the strategy is simple: check to see whether the current
2505 * node matches, call self recursively to see whether the rest matches,
2506 * and then act accordingly. In practice we make some effort to avoid
2507 * recursion, in particular by going through "ordinary" nodes (that don't
2508 * need to know whether the rest of the match failed) by a loop instead of
2511 /* [lwall] I've hoisted the register declarations to the outer block in order to
2512 * maybe save a little bit of pushing and popping on the stack. It also takes
2513 * advantage of machines that use a register save mask on subroutine entry.
2515 * This function used to be heavily recursive, but since this had the
2516 * effect of blowing the CPU stack on complex regexes, it has been
2517 * restructured to be iterative, and to save state onto the heap rather
2518 * than the stack. Essentially whereever regmatch() used to be called, it
2519 * pushes the current state, notes where to return, then jumps back into
2522 * Originally the structure of this function used to look something like
2527 while (scan != NULL) {
2528 a++; // do stuff with a and b
2534 if (regmatch(...)) // recurse
2544 * Now it looks something like this:
2552 regmatch_state *st = new();
2554 st->a++; // do stuff with a and b
2556 while (scan != NULL) {
2564 st->resume_state = resume_FOO;
2565 goto start_recurse; // recurse
2574 st = new(); push a new state
2575 st->a = 1; st->b = 2;
2582 switch (resume_state) {
2584 goto resume_point_FOO;
2591 * WARNING: this means that any line in this function that contains a
2592 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2593 * regmatch() using gotos instead. Thus the values of any local variables
2594 * not saved in the regmatch_state structure will have been lost when
2595 * execution resumes on the next line .
2597 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2598 * PL_regmatch_state always points to the currently active state, and
2599 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2600 * The first time regmatch is called, the first slab is allocated, and is
2601 * never freed until interpreter desctruction. When the slab is full,
2602 * a new one is allocated chained to the end. At exit from regmatch, slabs
2603 * allocated since entry are freed.
2606 /* *** every FOO_fail should = FOO+1 */
2607 #define TRIE_next (REGNODE_MAX+1)
2608 #define TRIE_next_fail (REGNODE_MAX+2)
2609 #define EVAL_A (REGNODE_MAX+3)
2610 #define EVAL_A_fail (REGNODE_MAX+4)
2611 #define resume_CURLYX (REGNODE_MAX+5)
2612 #define resume_WHILEM1 (REGNODE_MAX+6)
2613 #define resume_WHILEM2 (REGNODE_MAX+7)
2614 #define resume_WHILEM3 (REGNODE_MAX+8)
2615 #define resume_WHILEM4 (REGNODE_MAX+9)
2616 #define resume_WHILEM5 (REGNODE_MAX+10)
2617 #define resume_WHILEM6 (REGNODE_MAX+11)
2618 #define BRANCH_next (REGNODE_MAX+12)
2619 #define BRANCH_next_fail (REGNODE_MAX+13)
2620 #define CURLYM_A (REGNODE_MAX+14)
2621 #define CURLYM_A_fail (REGNODE_MAX+15)
2622 #define CURLYM_B (REGNODE_MAX+16)
2623 #define CURLYM_B_fail (REGNODE_MAX+17)
2624 #define IFMATCH_A (REGNODE_MAX+18)
2625 #define IFMATCH_A_fail (REGNODE_MAX+19)
2626 #define resume_PLUS1 (REGNODE_MAX+20)
2627 #define resume_PLUS2 (REGNODE_MAX+21)
2628 #define resume_PLUS3 (REGNODE_MAX+22)
2629 #define resume_PLUS4 (REGNODE_MAX+23)
2633 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2637 S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2639 const int docolor = *PL_colors[0];
2640 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2641 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2642 /* The part of the string before starttry has one color
2643 (pref0_len chars), between starttry and current
2644 position another one (pref_len - pref0_len chars),
2645 after the current position the third one.
2646 We assume that pref0_len <= pref_len, otherwise we
2647 decrease pref0_len. */
2648 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2649 ? (5 + taill) - l : locinput - PL_bostr;
2652 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2654 pref0_len = pref_len - (locinput - PL_reg_starttry);
2655 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2656 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2657 ? (5 + taill) - pref_len : PL_regeol - locinput);
2658 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2662 if (pref0_len > pref_len)
2663 pref0_len = pref_len;
2665 const char * const s0 =
2666 do_utf8 && OP(scan) != CANY ?
2667 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2668 pref0_len, 60, UNI_DISPLAY_REGEX) :
2669 locinput - pref_len;
2670 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2671 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2672 pv_uni_display(PERL_DEBUG_PAD(1),
2673 (U8*)(locinput - pref_len + pref0_len),
2674 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2675 locinput - pref_len + pref0_len;
2676 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2677 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2678 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2679 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2681 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2682 PerlIO_printf(Perl_debug_log,
2683 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2684 (IV)(locinput - PL_bostr),
2691 (docolor ? "" : "> <"),
2695 15 - l - pref_len + 1,
2701 STATIC I32 /* 0 failure, 1 success */
2702 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2704 #if PERL_VERSION < 9
2708 register const bool do_utf8 = PL_reg_match_utf8;
2709 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2711 regexp *rex = reginfo->prog;
2713 regmatch_slab *orig_slab;
2714 regmatch_state *orig_state;
2716 /* the current state. This is a cached copy of PL_regmatch_state */
2717 register regmatch_state *st;
2719 /* cache heavy used fields of st in registers */
2720 register regnode *scan;
2721 register regnode *next;
2722 register I32 n = 0; /* initialize to shut up compiler warning */
2723 register char *locinput = PL_reginput;
2725 /* these variables are NOT saved during a recusive RFEGMATCH: */
2726 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2727 bool result; /* return value of S_regmatch */
2728 int depth = 0; /* depth of recursion */
2729 regmatch_state *yes_state = NULL; /* state to pop to on success of
2734 GET_RE_DEBUG_FLAGS_DECL;
2738 /* on first ever call to regmatch, allocate first slab */
2739 if (!PL_regmatch_slab) {
2740 Newx(PL_regmatch_slab, 1, regmatch_slab);
2741 PL_regmatch_slab->prev = NULL;
2742 PL_regmatch_slab->next = NULL;
2743 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2746 /* remember current high-water mark for exit */
2747 /* XXX this should be done with SAVE* instead */
2748 orig_slab = PL_regmatch_slab;
2749 orig_state = PL_regmatch_state;
2751 /* grab next free state slot */
2752 st = ++PL_regmatch_state;
2753 if (st > SLAB_LAST(PL_regmatch_slab))
2754 st = PL_regmatch_state = S_push_slab(aTHX);
2760 /* Note that nextchr is a byte even in UTF */
2761 nextchr = UCHARAT(locinput);
2763 while (scan != NULL) {
2766 SV * const prop = sv_newmortal();
2767 dump_exec_pos( locinput, scan, do_utf8 );
2768 regprop(rex, prop, scan);
2770 PerlIO_printf(Perl_debug_log,
2771 "%3"IVdf":%*s%s(%"IVdf")\n",
2772 (IV)(scan - rex->program), PL_regindent*2, "",
2774 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2777 next = scan + NEXT_OFF(scan);
2780 state_num = OP(scan);
2783 switch (state_num) {
2785 if (locinput == PL_bostr)
2787 /* reginfo->till = reginfo->bol; */
2792 if (locinput == PL_bostr ||
2793 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2799 if (locinput == PL_bostr)
2803 if (locinput == reginfo->ganch)
2809 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2814 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2816 if (PL_regeol - locinput > 1)
2820 if (PL_regeol != locinput)
2824 if (!nextchr && locinput >= PL_regeol)
2827 locinput += PL_utf8skip[nextchr];
2828 if (locinput > PL_regeol)
2830 nextchr = UCHARAT(locinput);
2833 nextchr = UCHARAT(++locinput);
2836 if (!nextchr && locinput >= PL_regeol)
2838 nextchr = UCHARAT(++locinput);
2841 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2844 locinput += PL_utf8skip[nextchr];
2845 if (locinput > PL_regeol)
2847 nextchr = UCHARAT(locinput);
2850 nextchr = UCHARAT(++locinput);
2854 #define ST st->u.trie
2858 /* what type of TRIE am I? (utf8 makes this contextual) */
2859 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2860 trie_type = do_utf8 ?
2861 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2864 /* what trie are we using right now */
2865 reg_trie_data * const trie
2866 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2867 U32 state = trie->startstate;
2869 U8 *uc = ( U8* )locinput;
2875 U8 *uscan = (U8*)NULL;
2877 SV *sv_accept_buff = NULL;
2878 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2880 ST.accepted = 0; /* how many accepting states we have seen */
2886 if (trie->bitmap && trie_type != trie_utf8_fold &&
2887 !TRIE_BITMAP_TEST(trie,*locinput)
2889 if (trie->states[ state ].wordnum) {
2891 PerlIO_printf(Perl_debug_log,
2892 "%*s %smatched empty string...%s\n",
2893 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2898 PerlIO_printf(Perl_debug_log,
2899 "%*s %sfailed to match start class...%s\n",
2900 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2907 traverse the TRIE keeping track of all accepting states
2908 we transition through until we get to a failing node.
2911 while ( state && uc <= (U8*)PL_regeol ) {
2913 if (trie->states[ state ].wordnum) {
2914 if (!ST.accepted ) {
2917 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2918 sv_accept_buff=newSV(bufflen *
2919 sizeof(reg_trie_accepted) - 1);
2920 SvCUR_set(sv_accept_buff,
2921 sizeof(reg_trie_accepted));
2922 SvPOK_on(sv_accept_buff);
2923 sv_2mortal(sv_accept_buff);
2926 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2929 if (ST.accepted >= bufflen) {
2931 ST.accept_buff =(reg_trie_accepted*)
2932 SvGROW(sv_accept_buff,
2933 bufflen * sizeof(reg_trie_accepted));
2935 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2936 + sizeof(reg_trie_accepted));
2938 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2939 ST.accept_buff[ST.accepted].endpos = uc;
2943 base = trie->states[ state ].trans.base;
2945 DEBUG_TRIE_EXECUTE_r({
2946 dump_exec_pos( (char *)uc, scan, do_utf8 );
2947 PerlIO_printf( Perl_debug_log,
2948 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2949 2+PL_regindent * 2, "", PL_colors[4],
2950 (UV)state, (UV)base, (UV)ST.accepted );
2954 switch (trie_type) {
2955 case trie_utf8_fold:
2957 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2962 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2963 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2964 foldlen -= UNISKIP( uvc );
2965 uscan = foldbuf + UNISKIP( uvc );
2969 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2978 charid = trie->charmap[ uvc ];
2982 if (trie->widecharmap) {
2983 SV** const svpp = hv_fetch(trie->widecharmap,
2984 (char*)&uvc, sizeof(UV), 0);
2986 charid = (U16)SvIV(*svpp);
2991 (base + charid > trie->uniquecharcount )
2992 && (base + charid - 1 - trie->uniquecharcount
2994 && trie->trans[base + charid - 1 -
2995 trie->uniquecharcount].check == state)
2997 state = trie->trans[base + charid - 1 -
2998 trie->uniquecharcount ].next;
3009 DEBUG_TRIE_EXECUTE_r(
3010 PerlIO_printf( Perl_debug_log,
3011 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3012 charid, uvc, (UV)state, PL_colors[5] );
3019 PerlIO_printf( Perl_debug_log,
3020 "%*s %sgot %"IVdf" possible matches%s\n",
3021 REPORT_CODE_OFF + PL_regindent * 2, "",
3022 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3028 case TRIE_next_fail: /* we failed - try next alterative */
3030 if ( ST.accepted == 1 ) {
3031 /* only one choice left - just continue */
3033 reg_trie_data * const trie
3034 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3035 SV ** const tmp = RX_DEBUG(reginfo->prog)
3036 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
3038 PerlIO_printf( Perl_debug_log,
3039 "%*s %sonly one match left: #%d <%s>%s\n",
3040 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3041 ST.accept_buff[ 0 ].wordnum,
3042 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3045 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3046 /* in this case we free tmps/leave before we call regmatch
3047 as we wont be using accept_buff again. */
3050 locinput = PL_reginput;
3051 nextchr = UCHARAT(locinput);
3053 continue; /* execute rest of RE */
3056 if (!ST.accepted-- ) {
3063 There are at least two accepting states left. Presumably
3064 the number of accepting states is going to be low,
3065 typically two. So we simply scan through to find the one
3066 with lowest wordnum. Once we find it, we swap the last
3067 state into its place and decrement the size. We then try to
3068 match the rest of the pattern at the point where the word
3069 ends. If we succeed, control just continues along the
3070 regex; if we fail we return here to try the next accepting
3077 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3078 DEBUG_TRIE_EXECUTE_r(
3079 PerlIO_printf( Perl_debug_log,
3080 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3081 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
3082 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3083 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3086 if (ST.accept_buff[cur].wordnum <
3087 ST.accept_buff[best].wordnum)
3092 reg_trie_data * const trie
3093 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3094 SV ** const tmp = RX_DEBUG(reginfo->prog)
3095 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
3097 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3098 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3099 ST.accept_buff[best].wordnum,
3100 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3104 if ( best<ST.accepted ) {
3105 reg_trie_accepted tmp = ST.accept_buff[ best ];
3106 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3107 ST.accept_buff[ ST.accepted ] = tmp;
3110 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3112 PUSH_STATE_GOTO(TRIE_next, ST.B);
3118 char *s = STRING(scan);
3119 st->ln = STR_LEN(scan);
3120 if (do_utf8 != UTF) {
3121 /* The target and the pattern have differing utf8ness. */
3123 const char * const e = s + st->ln;
3126 /* The target is utf8, the pattern is not utf8. */
3131 if (NATIVE_TO_UNI(*(U8*)s) !=
3132 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3140 /* The target is not utf8, the pattern is utf8. */
3145 if (NATIVE_TO_UNI(*((U8*)l)) !=
3146 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3154 nextchr = UCHARAT(locinput);
3157 /* The target and the pattern have the same utf8ness. */
3158 /* Inline the first character, for speed. */
3159 if (UCHARAT(s) != nextchr)
3161 if (PL_regeol - locinput < st->ln)
3163 if (st->ln > 1 && memNE(s, locinput, st->ln))
3166 nextchr = UCHARAT(locinput);
3170 PL_reg_flags |= RF_tainted;
3173 char * const s = STRING(scan);
3174 st->ln = STR_LEN(scan);
3176 if (do_utf8 || UTF) {
3177 /* Either target or the pattern are utf8. */
3178 const char * const l = locinput;
3179 char *e = PL_regeol;
3181 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
3182 l, &e, 0, do_utf8)) {
3183 /* One more case for the sharp s:
3184 * pack("U0U*", 0xDF) =~ /ss/i,
3185 * the 0xC3 0x9F are the UTF-8
3186 * byte sequence for the U+00DF. */
3188 toLOWER(s[0]) == 's' &&
3190 toLOWER(s[1]) == 's' &&
3197 nextchr = UCHARAT(locinput);
3201 /* Neither the target and the pattern are utf8. */
3203 /* Inline the first character, for speed. */
3204 if (UCHARAT(s) != nextchr &&
3205 UCHARAT(s) != ((OP(scan) == EXACTF)
3206 ? PL_fold : PL_fold_locale)[nextchr])
3208 if (PL_regeol - locinput < st->ln)
3210 if (st->ln > 1 && (OP(scan) == EXACTF
3211 ? ibcmp(s, locinput, st->ln)
3212 : ibcmp_locale(s, locinput, st->ln)))
3215 nextchr = UCHARAT(locinput);
3220 STRLEN inclasslen = PL_regeol - locinput;
3222 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3224 if (locinput >= PL_regeol)
3226 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3227 nextchr = UCHARAT(locinput);
3232 nextchr = UCHARAT(locinput);
3233 if (!REGINCLASS(rex, scan, (U8*)locinput))
3235 if (!nextchr && locinput >= PL_regeol)
3237 nextchr = UCHARAT(++locinput);
3241 /* If we might have the case of the German sharp s
3242 * in a casefolding Unicode character class. */
3244 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3245 locinput += SHARP_S_SKIP;
3246 nextchr = UCHARAT(locinput);
3252 PL_reg_flags |= RF_tainted;
3258 LOAD_UTF8_CHARCLASS_ALNUM();
3259 if (!(OP(scan) == ALNUM
3260 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3261 : isALNUM_LC_utf8((U8*)locinput)))
3265 locinput += PL_utf8skip[nextchr];
3266 nextchr = UCHARAT(locinput);
3269 if (!(OP(scan) == ALNUM
3270 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3272 nextchr = UCHARAT(++locinput);
3275 PL_reg_flags |= RF_tainted;
3278 if (!nextchr && locinput >= PL_regeol)
3281 LOAD_UTF8_CHARCLASS_ALNUM();
3282 if (OP(scan) == NALNUM
3283 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3284 : isALNUM_LC_utf8((U8*)locinput))
3288 locinput += PL_utf8skip[nextchr];
3289 nextchr = UCHARAT(locinput);
3292 if (OP(scan) == NALNUM
3293 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3295 nextchr = UCHARAT(++locinput);
3299 PL_reg_flags |= RF_tainted;
3303 /* was last char in word? */
3305 if (locinput == PL_bostr)
3308 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3310 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3312 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3313 st->ln = isALNUM_uni(st->ln);
3314 LOAD_UTF8_CHARCLASS_ALNUM();
3315 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3318 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3319 n = isALNUM_LC_utf8((U8*)locinput);
3323 st->ln = (locinput != PL_bostr) ?
3324 UCHARAT(locinput - 1) : '\n';
3325 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3326 st->ln = isALNUM(st->ln);
3327 n = isALNUM(nextchr);
3330 st->ln = isALNUM_LC(st->ln);
3331 n = isALNUM_LC(nextchr);
3334 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3335 OP(scan) == BOUNDL))
3339 PL_reg_flags |= RF_tainted;
3345 if (UTF8_IS_CONTINUED(nextchr)) {
3346 LOAD_UTF8_CHARCLASS_SPACE();
3347 if (!(OP(scan) == SPACE
3348 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3349 : isSPACE_LC_utf8((U8*)locinput)))
3353 locinput += PL_utf8skip[nextchr];
3354 nextchr = UCHARAT(locinput);
3357 if (!(OP(scan) == SPACE
3358 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3360 nextchr = UCHARAT(++locinput);
3363 if (!(OP(scan) == SPACE
3364 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3366 nextchr = UCHARAT(++locinput);
3370 PL_reg_flags |= RF_tainted;
3373 if (!nextchr && locinput >= PL_regeol)
3376 LOAD_UTF8_CHARCLASS_SPACE();
3377 if (OP(scan) == NSPACE
3378 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3379 : isSPACE_LC_utf8((U8*)locinput))
3383 locinput += PL_utf8skip[nextchr];
3384 nextchr = UCHARAT(locinput);
3387 if (OP(scan) == NSPACE
3388 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3390 nextchr = UCHARAT(++locinput);
3393 PL_reg_flags |= RF_tainted;
3399 LOAD_UTF8_CHARCLASS_DIGIT();
3400 if (!(OP(scan) == DIGIT
3401 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3402 : isDIGIT_LC_utf8((U8*)locinput)))
3406 locinput += PL_utf8skip[nextchr];
3407 nextchr = UCHARAT(locinput);
3410 if (!(OP(scan) == DIGIT
3411 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3413 nextchr = UCHARAT(++locinput);
3416 PL_reg_flags |= RF_tainted;
3419 if (!nextchr && locinput >= PL_regeol)
3422 LOAD_UTF8_CHARCLASS_DIGIT();
3423 if (OP(scan) == NDIGIT
3424 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3425 : isDIGIT_LC_utf8((U8*)locinput))
3429 locinput += PL_utf8skip[nextchr];
3430 nextchr = UCHARAT(locinput);
3433 if (OP(scan) == NDIGIT
3434 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3436 nextchr = UCHARAT(++locinput);
3439 if (locinput >= PL_regeol)
3442 LOAD_UTF8_CHARCLASS_MARK();
3443 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3445 locinput += PL_utf8skip[nextchr];
3446 while (locinput < PL_regeol &&
3447 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3448 locinput += UTF8SKIP(locinput);
3449 if (locinput > PL_regeol)
3454 nextchr = UCHARAT(locinput);
3457 PL_reg_flags |= RF_tainted;
3462 n = ARG(scan); /* which paren pair */
3463 st->ln = PL_regstartp[n];
3464 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3465 if ((I32)*PL_reglastparen < n || st->ln == -1)
3466 sayNO; /* Do not match unless seen CLOSEn. */
3467 if (st->ln == PL_regendp[n])
3470 s = PL_bostr + st->ln;
3471 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3473 const char *e = PL_bostr + PL_regendp[n];
3475 * Note that we can't do the "other character" lookup trick as
3476 * in the 8-bit case (no pun intended) because in Unicode we
3477 * have to map both upper and title case to lower case.
3479 if (OP(scan) == REFF) {
3481 STRLEN ulen1, ulen2;
3482 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3483 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3487 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3488 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3489 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3496 nextchr = UCHARAT(locinput);
3500 /* Inline the first character, for speed. */
3501 if (UCHARAT(s) != nextchr &&
3503 (UCHARAT(s) != ((OP(scan) == REFF
3504 ? PL_fold : PL_fold_locale)[nextchr]))))
3506 st->ln = PL_regendp[n] - st->ln;
3507 if (locinput + st->ln > PL_regeol)
3509 if (st->ln > 1 && (OP(scan) == REF
3510 ? memNE(s, locinput, st->ln)
3512 ? ibcmp(s, locinput, st->ln)
3513 : ibcmp_locale(s, locinput, st->ln))))
3516 nextchr = UCHARAT(locinput);
3527 #define ST st->u.eval
3529 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3533 /* execute the code in the {...} */
3535 SV ** const before = SP;
3536 OP_4tree * const oop = PL_op;
3537 COP * const ocurcop = PL_curcop;
3541 PL_op = (OP_4tree*)rex->data->data[n];
3542 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3543 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3544 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3546 CALLRUNOPS(aTHX); /* Scalar context. */
3549 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3556 PAD_RESTORE_LOCAL(old_comppad);
3557 PL_curcop = ocurcop;
3560 sv_setsv(save_scalar(PL_replgv), ret);
3564 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3567 /* extract RE object from returned value; compiling if
3572 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3573 mg = mg_find(sv, PERL_MAGIC_qr);
3574 else if (SvSMAGICAL(ret)) {
3575 if (SvGMAGICAL(ret))
3576 sv_unmagic(ret, PERL_MAGIC_qr);
3578 mg = mg_find(ret, PERL_MAGIC_qr);
3582 re = (regexp *)mg->mg_obj;
3583 (void)ReREFCNT_inc(re);
3587 const char * const t = SvPV_const(ret, len);
3589 const I32 osize = PL_regsize;
3592 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3593 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3595 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3597 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3603 /* run the pattern returned from (??{...}) */
3606 PerlIO_printf(Perl_debug_log,
3607 "Entering embedded \"%s%.60s%s%s\"\n",
3611 (strlen(re->precomp) > 60 ? "..." : ""))
3614 ST.cp = regcppush(0); /* Save *all* the positions. */
3615 REGCP_SET(ST.lastcp);
3616 *PL_reglastparen = 0;
3617 *PL_reglastcloseparen = 0;
3618 PL_reginput = locinput;
3620 /* XXXX This is too dramatic a measure... */
3624 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3625 ((re->reganch & ROPT_UTF8) != 0);
3626 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3631 /* now continue from first node in postoned RE */
3632 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3635 /* /(?(?{...})X|Y)/ */
3636 st->sw = SvTRUE(ret);
3641 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3643 PL_reg_flags ^= RF_utf8;
3646 /* XXXX This is too dramatic a measure... */
3648 /* Restore parens of the caller without popping the
3651 const I32 tmp = PL_savestack_ix;
3652 PL_savestack_ix = ST.lastcp;
3654 PL_savestack_ix = tmp;
3656 PL_reginput = locinput;
3657 /* continue at the node following the (??{...}) */
3661 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3662 /* Restore state to the outer re then re-throw the failure */
3664 PL_reg_flags ^= RF_utf8;
3668 /* XXXX This is too dramatic a measure... */
3671 PL_reginput = locinput;
3672 REGCP_UNWIND(ST.lastcp);
3679 n = ARG(scan); /* which paren pair */
3680 PL_reg_start_tmp[n] = locinput;
3685 n = ARG(scan); /* which paren pair */
3686 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3687 PL_regendp[n] = locinput - PL_bostr;
3688 if (n > (I32)*PL_reglastparen)
3689 *PL_reglastparen = n;
3690 *PL_reglastcloseparen = n;
3693 n = ARG(scan); /* which paren pair */
3694 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3697 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3699 next = NEXTOPER(NEXTOPER(scan));
3701 next = scan + ARG(scan);
3702 if (OP(next) == IFTHEN) /* Fake one. */
3703 next = NEXTOPER(NEXTOPER(next));
3707 st->logical = scan->flags;
3709 /*******************************************************************
3710 cc points to the regmatch_state associated with the most recent CURLYX.
3711 This struct contains info about the innermost (...)* loop (an
3712 "infoblock"), and a pointer to the next outer cc.
3714 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3716 1) After matching Y, regnode for CURLYX is processed;
3718 2) This regnode populates cc, and calls regmatch() recursively
3719 with the starting point at WHILEM node;
3721 3) Each hit of WHILEM node tries to match A and Z (in the order
3722 depending on the current iteration, min/max of {min,max} and
3723 greediness). The information about where are nodes for "A"
3724 and "Z" is read from cc, as is info on how many times "A"
3725 was already matched, and greediness.
3727 4) After A matches, the same WHILEM node is hit again.
3729 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3730 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3731 resets cc, since this Y(A)*Z can be a part of some other loop:
3732 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3733 of the external loop.
3735 Currently present infoblocks form a tree with a stem formed by st->cc
3736 and whatever it mentions via ->next, and additional attached trees
3737 corresponding to temporarily unset infoblocks as in "5" above.
3739 In the following picture, infoblocks for outer loop of
3740 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3741 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3742 infoblocks are drawn below the "reset" infoblock.
3744 In fact in the picture below we do not show failed matches for Z and T
3745 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3746 more obvious *why* one needs to *temporary* unset infoblocks.]
3748 Matched REx position InfoBlocks Comment
3752 Y A)*?Z)*?T x <- O <- I
3753 YA )*?Z)*?T x <- O <- I
3754 YA A)*?Z)*?T x <- O <- I
3755 YAA )*?Z)*?T x <- O <- I
3756 YAA Z)*?T x <- O # Temporary unset I
3759 YAAZ Y(A)*?Z)*?T x <- O
3762 YAAZY (A)*?Z)*?T x <- O
3765 YAAZY A)*?Z)*?T x <- O <- I
3768 YAAZYA )*?Z)*?T x <- O <- I
3771 YAAZYA Z)*?T x <- O # Temporary unset I
3777 YAAZYAZ T x # Temporary unset O
3784 *******************************************************************/
3787 /* No need to save/restore up to this paren */
3788 I32 parenfloor = scan->flags;
3792 CURLYX and WHILEM are always paired: they're the moral
3793 equivalent of pp_enteriter anbd pp_iter.
3795 The only time next could be null is if the node tree is
3796 corrupt. This was mentioned on p5p a few days ago.
3798 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3799 So we'll assert that this is true:
3802 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3804 /* XXXX Probably it is better to teach regpush to support
3805 parenfloor > PL_regsize... */
3806 if (parenfloor > (I32)*PL_reglastparen)
3807 parenfloor = *PL_reglastparen; /* Pessimization... */
3809 st->u.curlyx.cp = PL_savestack_ix;
3810 st->u.curlyx.outercc = st->cc;
3812 /* these fields contain the state of the current curly.
3813 * they are accessed by subsequent WHILEMs;
3814 * cur and lastloc are also updated by WHILEM */
3815 st->u.curlyx.parenfloor = parenfloor;
3816 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3817 st->u.curlyx.min = ARG1(scan);
3818 st->u.curlyx.max = ARG2(scan);
3819 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3820 st->u.curlyx.lastloc = 0;
3821 /* st->next and st->minmod are also read by WHILEM */
3823 PL_reginput = locinput;
3824 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3825 /*** all unsaved local vars undefined at this point */
3826 regcpblow(st->u.curlyx.cp);
3827 st->cc = st->u.curlyx.outercc;
3833 * This is really hard to understand, because after we match
3834 * what we're trying to match, we must make sure the rest of
3835 * the REx is going to match for sure, and to do that we have
3836 * to go back UP the parse tree by recursing ever deeper. And
3837 * if it fails, we have to reset our parent's current state
3838 * that we can try again after backing off.
3843 st->cc gets initialised by CURLYX ready for use by WHILEM.
3844 So again, unless somethings been corrupted, st->cc cannot
3845 be null at that point in WHILEM.
3847 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3848 So we'll assert that this is true:
3851 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3852 st->u.whilem.cache_offset = 0;
3853 st->u.whilem.cache_bit = 0;
3855 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3856 PL_reginput = locinput;
3859 PerlIO_printf(Perl_debug_log,
3860 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3861 REPORT_CODE_OFF+PL_regindent*2, "",
3862 (long)n, (long)st->cc->u.curlyx.min,
3863 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3866 /* If degenerate scan matches "", assume scan done. */
3868 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3869 st->u.whilem.savecc = st->cc;
3870 st->cc = st->cc->u.curlyx.outercc;
3872 st->ln = st->cc->u.curlyx.cur;
3874 PerlIO_printf(Perl_debug_log,
3875 "%*s empty match detected, try continuation...\n",
3876 REPORT_CODE_OFF+PL_regindent*2, "")
3878 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3879 /*** all unsaved local vars undefined at this point */
3880 st->cc = st->u.whilem.savecc;
3883 if (st->cc->u.curlyx.outercc)
3884 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3888 /* First just match a string of min scans. */
3890 if (n < st->cc->u.curlyx.min) {
3891 st->cc->u.curlyx.cur = n;
3892 st->cc->u.curlyx.lastloc = locinput;
3893 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3894 /*** all unsaved local vars undefined at this point */
3897 st->cc->u.curlyx.cur = n - 1;
3898 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3903 /* Check whether we already were at this position.
3904 Postpone detection until we know the match is not
3905 *that* much linear. */
3906 if (!PL_reg_maxiter) {
3907 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3908 /* possible overflow for long strings and many CURLYX's */
3909 if (PL_reg_maxiter < 0)
3910 PL_reg_maxiter = I32_MAX;
3911 PL_reg_leftiter = PL_reg_maxiter;
3913 if (PL_reg_leftiter-- == 0) {
3914 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3915 if (PL_reg_poscache) {
3916 if ((I32)PL_reg_poscache_size < size) {
3917 Renew(PL_reg_poscache, size, char);
3918 PL_reg_poscache_size = size;
3920 Zero(PL_reg_poscache, size, char);
3923 PL_reg_poscache_size = size;
3924 Newxz(PL_reg_poscache, size, char);
3927 PerlIO_printf(Perl_debug_log,
3928 "%sDetected a super-linear match, switching on caching%s...\n",
3929 PL_colors[4], PL_colors[5])
3932 if (PL_reg_leftiter < 0) {
3933 st->u.whilem.cache_offset = locinput - PL_bostr;
3935 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3936 + st->u.whilem.cache_offset * (scan->flags>>4);
3937 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3938 st->u.whilem.cache_offset /= 8;
3939 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3941 PerlIO_printf(Perl_debug_log,
3942 "%*s already tried at this position...\n",
3943 REPORT_CODE_OFF+PL_regindent*2, "")
3945 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3946 /* cache records success */
3949 /* cache records failure */
3955 /* Prefer next over scan for minimal matching. */
3957 if (st->cc->minmod) {
3958 st->u.whilem.savecc = st->cc;
3959 st->cc = st->cc->u.curlyx.outercc;
3961 st->ln = st->cc->u.curlyx.cur;
3962 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3963 REGCP_SET(st->u.whilem.lastcp);
3964 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3965 /*** all unsaved local vars undefined at this point */
3966 st->cc = st->u.whilem.savecc;
3968 regcpblow(st->u.whilem.cp);
3969 CACHEsayYES; /* All done. */
3971 REGCP_UNWIND(st->u.whilem.lastcp);
3973 if (st->cc->u.curlyx.outercc)
3974 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3976 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3977 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3978 && !(PL_reg_flags & RF_warned)) {
3979 PL_reg_flags |= RF_warned;
3980 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3981 "Complex regular subexpression recursion",
3988 PerlIO_printf(Perl_debug_log,
3989 "%*s trying longer...\n",
3990 REPORT_CODE_OFF+PL_regindent*2, "")
3992 /* Try scanning more and see if it helps. */
3993 PL_reginput = locinput;
3994 st->cc->u.curlyx.cur = n;
3995 st->cc->u.curlyx.lastloc = locinput;
3996 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3997 REGCP_SET(st->u.whilem.lastcp);
3998 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3999 /*** all unsaved local vars undefined at this point */
4001 regcpblow(st->u.whilem.cp);
4004 REGCP_UNWIND(st->u.whilem.lastcp);
4006 st->cc->u.curlyx.cur = n - 1;
4007 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4011 /* Prefer scan over next for maximal matching. */
4013 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
4014 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
4015 st->cc->u.curlyx.cur = n;
4016 st->cc->u.curlyx.lastloc = locinput;
4017 REGCP_SET(st->u.whilem.lastcp);
4018 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
4019 /*** all unsaved local vars undefined at this point */
4021 regcpblow(st->u.whilem.cp);
4024 REGCP_UNWIND(st->u.whilem.lastcp);
4025 regcppop(rex); /* Restore some previous $<digit>s? */
4026 PL_reginput = locinput;
4028 PerlIO_printf(Perl_debug_log,
4029 "%*s failed, try continuation...\n",
4030 REPORT_CODE_OFF+PL_regindent*2, "")
4033 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
4034 && !(PL_reg_flags & RF_warned)) {
4035 PL_reg_flags |= RF_warned;
4036 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4037 "Complex regular subexpression recursion",
4041 /* Failed deeper matches of scan, so see if this one works. */
4042 st->u.whilem.savecc = st->cc;
4043 st->cc = st->cc->u.curlyx.outercc;
4045 st->ln = st->cc->u.curlyx.cur;
4046 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
4047 /*** all unsaved local vars undefined at this point */
4048 st->cc = st->u.whilem.savecc;
4051 if (st->cc->u.curlyx.outercc)
4052 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4053 st->cc->u.curlyx.cur = n - 1;
4054 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4060 #define ST st->u.branch
4062 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4063 next = scan + ARG(scan);
4066 scan = NEXTOPER(scan);
4069 case BRANCH: /* /(...|A|...)/ */
4070 scan = NEXTOPER(scan); /* scan now points to inner node */
4071 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4072 /* last branch; skip state push and jump direct to node */
4074 ST.lastparen = *PL_reglastparen;
4075 ST.next_branch = next;
4077 PL_reginput = locinput;
4079 /* Now go into the branch */
4080 PUSH_STATE_GOTO(BRANCH_next, scan);
4083 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4084 REGCP_UNWIND(ST.cp);
4085 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4087 *PL_reglastparen = n;
4088 scan = ST.next_branch;
4089 /* no more branches? */
4090 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4092 continue; /* execute next BRANCH[J] op */
4100 #define ST st->u.curlym
4102 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4104 /* This is an optimisation of CURLYX that enables us to push
4105 * only a single backtracking state, no matter now many matches
4106 * there are in {m,n}. It relies on the pattern being constant
4107 * length, with no parens to influence future backrefs
4111 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4113 /* if paren positive, emulate an OPEN/CLOSE around A */
4115 I32 paren = ST.me->flags;
4116 if (paren > PL_regsize)
4118 if (paren > (I32)*PL_reglastparen)
4119 *PL_reglastparen = paren;
4120 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4126 ST.minmod = st->minmod;
4128 ST.c1 = CHRTEST_UNINIT;
4131 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4134 curlym_do_A: /* execute the A in /A{m,n}B/ */
4135 PL_reginput = locinput;
4136 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4139 case CURLYM_A: /* we've just matched an A */
4140 locinput = st->locinput;
4141 nextchr = UCHARAT(locinput);
4144 /* after first match, determine A's length: u.curlym.alen */
4145 if (ST.count == 1) {
4146 if (PL_reg_match_utf8) {
4148 while (s < PL_reginput) {
4154 ST.alen = PL_reginput - locinput;
4157 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4160 PerlIO_printf(Perl_debug_log,
4161 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4162 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
4163 (IV) ST.count, (IV)ST.alen)
4166 locinput = PL_reginput;
4167 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4168 goto curlym_do_A; /* try to match another A */
4169 goto curlym_do_B; /* try to match B */
4171 case CURLYM_A_fail: /* just failed to match an A */
4172 REGCP_UNWIND(ST.cp);
4173 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4176 curlym_do_B: /* execute the B in /A{m,n}B/ */
4177 PL_reginput = locinput;
4178 if (ST.c1 == CHRTEST_UNINIT) {
4179 /* calculate c1 and c2 for possible match of 1st char
4180 * following curly */
4181 ST.c1 = ST.c2 = CHRTEST_VOID;
4182 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4183 regnode *text_node = ST.B;
4184 if (! HAS_TEXT(text_node))
4185 FIND_NEXT_IMPT(text_node);
4186 if (HAS_TEXT(text_node)
4187 && PL_regkind[OP(text_node)] != REF)
4189 ST.c1 = (U8)*STRING(text_node);
4191 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4193 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4194 ? PL_fold_locale[ST.c1]
4201 PerlIO_printf(Perl_debug_log,
4202 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4203 (int)(REPORT_CODE_OFF+PL_regindent*2),
4206 if (ST.c1 != CHRTEST_VOID
4207 && UCHARAT(PL_reginput) != ST.c1
4208 && UCHARAT(PL_reginput) != ST.c2)
4210 /* simulate B failing */
4211 state_num = CURLYM_B_fail;
4212 goto reenter_switch;
4216 /* mark current A as captured */
4217 I32 paren = ST.me->flags;
4220 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4221 PL_regendp[paren] = PL_reginput - PL_bostr;
4224 PL_regendp[paren] = -1;
4226 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4229 case CURLYM_B_fail: /* just failed to match a B */
4230 REGCP_UNWIND(ST.cp);
4232 if (ST.count == ARG2(ST.me) /* max */)
4234 goto curlym_do_A; /* try to match a further A */
4236 /* backtrack one A */
4237 if (ST.count == ARG1(ST.me) /* min */)
4240 locinput = HOPc(locinput, -ST.alen);
4241 goto curlym_do_B; /* try to match B */
4245 st->u.plus.paren = scan->flags; /* Which paren to set */
4246 if (st->u.plus.paren > PL_regsize)
4247 PL_regsize = st->u.plus.paren;
4248 if (st->u.plus.paren > (I32)*PL_reglastparen)
4249 *PL_reglastparen = st->u.plus.paren;
4250 st->ln = ARG1(scan); /* min to match */
4251 n = ARG2(scan); /* max to match */
4252 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4255 st->u.plus.paren = 0;
4256 st->ln = ARG1(scan); /* min to match */
4257 n = ARG2(scan); /* max to match */
4258 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4263 scan = NEXTOPER(scan);
4264 st->u.plus.paren = 0;
4269 scan = NEXTOPER(scan);
4270 st->u.plus.paren = 0;
4273 * Lookahead to avoid useless match attempts
4274 * when we know what character comes next.
4278 * Used to only do .*x and .*?x, but now it allows
4279 * for )'s, ('s and (?{ ... })'s to be in the way
4280 * of the quantifier and the EXACT-like node. -- japhy
4283 if (HAS_TEXT(next) || JUMPABLE(next)) {
4285 regnode *text_node = next;
4287 if (! HAS_TEXT(text_node))
4288 FIND_NEXT_IMPT(text_node);
4290 if (! HAS_TEXT(text_node))
4291 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
4293 if (PL_regkind[OP(text_node)] == REF) {
4294 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
4295 goto assume_ok_easy;
4298 s = (U8*)STRING(text_node);
4301 st->u.plus.c2 = st->u.plus.c1 = *s;
4302 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4303 st->u.plus.c2 = PL_fold[st->u.plus.c1];
4304 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4305 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
4308 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4309 STRLEN ulen1, ulen2;
4310 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4311 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4313 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4314 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4316 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4318 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4322 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4329 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
4331 PL_reginput = locinput;
4334 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4336 locinput = PL_reginput;
4337 REGCP_SET(st->u.plus.lastcp);
4338 if (st->u.plus.c1 != CHRTEST_VOID) {
4339 st->u.plus.old = locinput;
4340 st->u.plus.count = 0;
4342 if (n == REG_INFTY) {
4343 st->u.plus.e = PL_regeol - 1;
4345 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4350 for (st->u.plus.e = locinput;
4351 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4352 st->u.plus.e += UTF8SKIP(st->u.plus.e);
4355 st->u.plus.e = locinput + n - st->ln;
4356 if (st->u.plus.e >= PL_regeol)
4357 st->u.plus.e = PL_regeol - 1;
4360 /* Find place 'next' could work */
4362 if (st->u.plus.c1 == st->u.plus.c2) {
4363 while (locinput <= st->u.plus.e &&
4364 UCHARAT(locinput) != st->u.plus.c1)
4367 while (locinput <= st->u.plus.e
4368 && UCHARAT(locinput) != st->u.plus.c1
4369 && UCHARAT(locinput) != st->u.plus.c2)
4372 st->u.plus.count = locinput - st->u.plus.old;
4375 if (st->u.plus.c1 == st->u.plus.c2) {
4377 /* count initialised to
4378 * utf8_distance(old, locinput) */
4379 while (locinput <= st->u.plus.e &&
4380 utf8n_to_uvchr((U8*)locinput,
4381 UTF8_MAXBYTES, &len,
4382 uniflags) != (UV)st->u.plus.c1) {
4387 /* count initialised to
4388 * utf8_distance(old, locinput) */
4389 while (locinput <= st->u.plus.e) {
4391 const UV c = utf8n_to_uvchr((U8*)locinput,
4392 UTF8_MAXBYTES, &len,
4394 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
4401 if (locinput > st->u.plus.e)
4403 /* PL_reginput == old now */
4404 if (locinput != st->u.plus.old) {
4405 st->ln = 1; /* Did some */
4406 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
4409 /* PL_reginput == locinput now */
4410 PL_reginput = locinput; /* Could be reset... */
4411 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
4412 /*** all unsaved local vars undefined at this point */
4414 REGCP_UNWIND(st->u.plus.lastcp);
4415 /* Couldn't or didn't -- move forward. */
4416 st->u.plus.old = locinput;
4418 locinput += UTF8SKIP(locinput);
4421 st->u.plus.count = 1;
4425 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
4427 if (st->u.plus.c1 != CHRTEST_VOID) {
4429 c = utf8n_to_uvchr((U8*)PL_reginput,
4433 c = UCHARAT(PL_reginput);
4434 /* If it could work, try it. */
4435 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
4436 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
4437 /*** all unsaved local vars undefined at this point */
4438 REGCP_UNWIND(st->u.plus.lastcp);
4441 /* If it could work, try it. */
4442 else if (st->u.plus.c1 == CHRTEST_VOID) {
4443 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
4444 /*** all unsaved local vars undefined at this point */
4445 REGCP_UNWIND(st->u.plus.lastcp);
4447 /* Couldn't or didn't -- move forward. */
4448 PL_reginput = locinput;
4449 if (regrepeat(rex, scan, 1)) {
4451 locinput = PL_reginput;
4458 n = regrepeat(rex, scan, n);
4459 locinput = PL_reginput;
4460 if ((st->ln < n) && (PL_regkind[OP(next)] == EOL) &&
4461 (OP(next) != MEOL || OP(next) == SEOL || OP(next) == EOS))
4463 st->ln = n; /* why back off? */
4464 /* ...because $ and \Z can match before *and* after
4465 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4466 We should back off by one in this case. */
4467 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4470 REGCP_SET(st->u.plus.lastcp);
4472 while (n >= st->ln) {
4474 if (st->u.plus.c1 != CHRTEST_VOID) {
4476 c = utf8n_to_uvchr((U8*)PL_reginput,
4480 c = UCHARAT(PL_reginput);
4482 /* If it could work, try it. */
4483 if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
4484 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
4485 /*** all unsaved local vars undefined at this point */
4486 REGCP_UNWIND(st->u.plus.lastcp);
4488 /* Couldn't or didn't -- back up. */
4490 PL_reginput = locinput = HOPc(locinput, -1);
4497 if (locinput < reginfo->till) {
4498 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4499 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4501 (long)(locinput - PL_reg_starttry),
4502 (long)(reginfo->till - PL_reg_starttry),
4504 sayNO_FINAL; /* Cannot match: too short. */
4506 PL_reginput = locinput; /* put where regtry can find it */
4507 sayYES_FINAL; /* Success! */
4509 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4511 PerlIO_printf(Perl_debug_log,
4512 "%*s %ssubpattern success...%s\n",
4513 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
4514 PL_reginput = locinput; /* put where regtry can find it */
4515 sayYES_FINAL; /* Success! */
4518 #define ST st->u.ifmatch
4520 case SUSPEND: /* (?>A) */
4522 PL_reginput = locinput;
4525 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4527 goto ifmatch_trivial_fail_test;
4529 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4531 ifmatch_trivial_fail_test:
4533 char * const s = HOPBACKc(locinput, scan->flags);
4538 st->sw = 1 - (bool)ST.wanted;
4542 next = scan + ARG(scan);
4550 PL_reginput = locinput;
4554 /* execute body of (?...A) */
4555 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4558 case IFMATCH_A_fail: /* body of (?...A) failed */
4559 ST.wanted = !ST.wanted;
4562 case IFMATCH_A: /* body of (?...A) succeeded */
4565 st->sw = (bool)ST.wanted;
4567 else if (!ST.wanted)
4570 if (OP(ST.me) == SUSPEND)
4571 locinput = PL_reginput;
4573 locinput = PL_reginput = st->locinput;
4574 nextchr = UCHARAT(locinput);
4576 scan = ST.me + ARG(ST.me);
4579 continue; /* execute B */
4584 next = scan + ARG(scan);
4589 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4590 PTR2UV(scan), OP(scan));
4591 Perl_croak(aTHX_ "regexp memory corruption");
4599 /* push a state that backtracks on success */
4600 st->u.yes.prev_yes_state = yes_state;
4604 /* push a new regex state, then continue at scan */
4606 regmatch_state *newst;
4609 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4610 "PUSH STATE(%d)\n", depth));
4611 st->locinput = locinput;
4613 if (newst > SLAB_LAST(PL_regmatch_slab))
4614 newst = S_push_slab(aTHX);
4615 PL_regmatch_state = newst;
4617 /* XXX probably don't need to initialise these */
4622 locinput = PL_reginput;
4623 nextchr = UCHARAT(locinput);
4629 /* simulate recursively calling regmatch(), but without actually
4630 * recursing - ie save the current state on the heap rather than on
4631 * the stack, then re-enter the loop. This avoids complex regexes
4632 * blowing the processor stack */
4636 /* push new state */
4637 regmatch_state *oldst = st;
4640 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
4642 /* grab the next free state slot */
4644 if (st > SLAB_LAST(PL_regmatch_slab))
4645 st = S_push_slab(aTHX);
4646 PL_regmatch_state = st;
4650 oldst->locinput = locinput;
4653 locinput = PL_reginput;
4654 nextchr = UCHARAT(locinput);
4667 * We get here only if there's trouble -- normally "case END" is
4668 * the terminating point.
4670 Perl_croak(aTHX_ "corrupted regexp pointers");
4677 /* we have successfully completed a subexpression, but we must now
4678 * pop to the state marked by yes_state and continue from there */
4680 assert(st != yes_state);
4681 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4682 || yes_state > SLAB_LAST(PL_regmatch_slab))
4684 /* not in this slab, pop slab */
4685 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4686 PL_regmatch_slab = PL_regmatch_slab->prev;
4687 st = SLAB_LAST(PL_regmatch_slab);
4689 depth -= (st - yes_state);
4690 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4691 depth+1, depth+(st - yes_state)));
4693 yes_state = st->u.yes.prev_yes_state;
4694 PL_regmatch_state = st;
4696 switch (st->resume_state) {
4700 state_num = st->resume_state;
4701 goto reenter_switch;
4707 Perl_croak(aTHX_ "unexpected yes resume state");
4711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4712 PL_colors[4], PL_colors[5]));
4719 /* XXX this is duplicate(ish) code to that in the do_no section.
4720 * will disappear when REGFMATCH goes */
4722 /* restore previous state and re-enter */
4723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4726 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4727 PL_regmatch_slab = PL_regmatch_slab->prev;
4728 st = SLAB_LAST(PL_regmatch_slab);
4730 PL_regmatch_state = st;
4734 locinput= st->locinput;
4735 nextchr = UCHARAT(locinput);
4737 switch (st->resume_state) {
4739 goto resume_point_CURLYX;
4740 case resume_WHILEM1:
4741 goto resume_point_WHILEM1;
4742 case resume_WHILEM2:
4743 goto resume_point_WHILEM2;
4744 case resume_WHILEM3:
4745 goto resume_point_WHILEM3;
4746 case resume_WHILEM4:
4747 goto resume_point_WHILEM4;
4748 case resume_WHILEM5:
4749 goto resume_point_WHILEM5;
4750 case resume_WHILEM6:
4751 goto resume_point_WHILEM6;
4753 goto resume_point_PLUS1;
4755 goto resume_point_PLUS2;
4757 goto resume_point_PLUS3;
4759 goto resume_point_PLUS4;
4770 Perl_croak(aTHX_ "regexp resume memory corruption");
4777 PerlIO_printf(Perl_debug_log,
4778 "%*s %sfailed...%s\n",
4779 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
4790 /* there's a previous state to backtrack to */
4791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4794 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4795 PL_regmatch_slab = PL_regmatch_slab->prev;
4796 st = SLAB_LAST(PL_regmatch_slab);
4798 PL_regmatch_state = st;
4802 locinput= st->locinput;
4803 nextchr = UCHARAT(locinput);
4805 switch (st->resume_state) {
4807 goto resume_point_CURLYX;
4808 case resume_WHILEM1:
4809 goto resume_point_WHILEM1;
4810 case resume_WHILEM2:
4811 goto resume_point_WHILEM2;
4812 case resume_WHILEM3:
4813 goto resume_point_WHILEM3;
4814 case resume_WHILEM4:
4815 goto resume_point_WHILEM4;
4816 case resume_WHILEM5:
4817 goto resume_point_WHILEM5;
4818 case resume_WHILEM6:
4819 goto resume_point_WHILEM6;
4827 if (yes_state == st)
4828 yes_state = st->u.yes.prev_yes_state;
4829 state_num = st->resume_state + 1; /* failure = success + 1 */
4830 goto reenter_switch;
4833 goto resume_point_PLUS1;
4835 goto resume_point_PLUS2;
4837 goto resume_point_PLUS3;
4839 goto resume_point_PLUS4;
4841 Perl_croak(aTHX_ "regexp resume memory corruption");
4847 /* restore original high-water mark */
4848 PL_regmatch_slab = orig_slab;
4849 PL_regmatch_state = orig_state;
4851 /* free all slabs above current one */
4852 if (orig_slab->next) {
4853 regmatch_slab *sl = orig_slab->next;
4854 orig_slab->next = NULL;
4856 regmatch_slab * const osl = sl;
4867 - regrepeat - repeatedly match something simple, report how many
4870 * [This routine now assumes that it will only match on things of length 1.
4871 * That was true before, but now we assume scan - reginput is the count,
4872 * rather than incrementing count on every character. [Er, except utf8.]]
4875 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4878 register char *scan;
4880 register char *loceol = PL_regeol;
4881 register I32 hardcount = 0;
4882 register bool do_utf8 = PL_reg_match_utf8;
4885 if (max == REG_INFTY)
4887 else if (max < loceol - scan)
4888 loceol = scan + max;
4893 while (scan < loceol && hardcount < max && *scan != '\n') {
4894 scan += UTF8SKIP(scan);
4898 while (scan < loceol && *scan != '\n')
4905 while (scan < loceol && hardcount < max) {
4906 scan += UTF8SKIP(scan);
4916 case EXACT: /* length of string is 1 */
4918 while (scan < loceol && UCHARAT(scan) == c)
4921 case EXACTF: /* length of string is 1 */
4923 while (scan < loceol &&
4924 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4927 case EXACTFL: /* length of string is 1 */
4928 PL_reg_flags |= RF_tainted;
4930 while (scan < loceol &&
4931 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4937 while (hardcount < max && scan < loceol &&
4938 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4939 scan += UTF8SKIP(scan);
4943 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4950 LOAD_UTF8_CHARCLASS_ALNUM();
4951 while (hardcount < max && scan < loceol &&
4952 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4953 scan += UTF8SKIP(scan);
4957 while (scan < loceol && isALNUM(*scan))
4962 PL_reg_flags |= RF_tainted;
4965 while (hardcount < max && scan < loceol &&
4966 isALNUM_LC_utf8((U8*)scan)) {
4967 scan += UTF8SKIP(scan);
4971 while (scan < loceol && isALNUM_LC(*scan))
4978 LOAD_UTF8_CHARCLASS_ALNUM();
4979 while (hardcount < max && scan < loceol &&
4980 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4981 scan += UTF8SKIP(scan);
4985 while (scan < loceol && !isALNUM(*scan))
4990 PL_reg_flags |= RF_tainted;
4993 while (hardcount < max && scan < loceol &&
4994 !isALNUM_LC_utf8((U8*)scan)) {
4995 scan += UTF8SKIP(scan);
4999 while (scan < loceol && !isALNUM_LC(*scan))
5006 LOAD_UTF8_CHARCLASS_SPACE();
5007 while (hardcount < max && scan < loceol &&
5009 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5010 scan += UTF8SKIP(scan);
5014 while (scan < loceol && isSPACE(*scan))
5019 PL_reg_flags |= RF_tainted;
5022 while (hardcount < max && scan < loceol &&
5023 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5024 scan += UTF8SKIP(scan);
5028 while (scan < loceol && isSPACE_LC(*scan))
5035 LOAD_UTF8_CHARCLASS_SPACE();
5036 while (hardcount < max && scan < loceol &&
5038 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5039 scan += UTF8SKIP(scan);
5043 while (scan < loceol && !isSPACE(*scan))
5048 PL_reg_flags |= RF_tainted;
5051 while (hardcount < max && scan < loceol &&
5052 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5053 scan += UTF8SKIP(scan);
5057 while (scan < loceol && !isSPACE_LC(*scan))
5064 LOAD_UTF8_CHARCLASS_DIGIT();
5065 while (hardcount < max && scan < loceol &&
5066 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5067 scan += UTF8SKIP(scan);
5071 while (scan < loceol && isDIGIT(*scan))
5078 LOAD_UTF8_CHARCLASS_DIGIT();
5079 while (hardcount < max && scan < loceol &&
5080 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5081 scan += UTF8SKIP(scan);
5085 while (scan < loceol && !isDIGIT(*scan))
5089 default: /* Called on something of 0 width. */
5090 break; /* So match right here or not at all. */
5096 c = scan - PL_reginput;
5100 GET_RE_DEBUG_FLAGS_DECL;
5102 SV * const prop = sv_newmortal();
5103 regprop(prog, prop, p);
5104 PerlIO_printf(Perl_debug_log,
5105 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5106 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
5114 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5116 - regclass_swash - prepare the utf8 swash
5120 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5126 const struct reg_data * const data = prog ? prog->data : NULL;
5128 if (data && data->count) {
5129 const U32 n = ARG(node);
5131 if (data->what[n] == 's') {
5132 SV * const rv = (SV*)data->data[n];
5133 AV * const av = (AV*)SvRV((SV*)rv);
5134 SV **const ary = AvARRAY(av);
5137 /* See the end of regcomp.c:S_regclass() for
5138 * documentation of these array elements. */
5141 a = SvROK(ary[1]) ? &ary[1] : 0;
5142 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5146 else if (si && doinit) {
5147 sw = swash_init("utf8", "", si, 1, 0);
5148 (void)av_store(av, 1, sw);
5165 - reginclass - determine if a character falls into a character class
5167 The n is the ANYOF regnode, the p is the target string, lenp
5168 is pointer to the maximum length of how far to go in the p
5169 (if the lenp is zero, UTF8SKIP(p) is used),
5170 do_utf8 tells whether the target string is in UTF-8.
5175 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5178 const char flags = ANYOF_FLAGS(n);
5184 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5185 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5186 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5187 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5188 if (len == (STRLEN)-1)
5189 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5192 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5193 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5196 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5197 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5200 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5204 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5207 if (swash_fetch(sw, p, do_utf8))
5209 else if (flags & ANYOF_FOLD) {
5210 if (!match && lenp && av) {
5212 for (i = 0; i <= av_len(av); i++) {
5213 SV* const sv = *av_fetch(av, i, FALSE);
5215 const char * const s = SvPV_const(sv, len);
5217 if (len <= plen && memEQ(s, (char*)p, len)) {
5225 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5228 to_utf8_fold(p, tmpbuf, &tmplen);
5229 if (swash_fetch(sw, tmpbuf, do_utf8))
5235 if (match && lenp && *lenp == 0)
5236 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5238 if (!match && c < 256) {
5239 if (ANYOF_BITMAP_TEST(n, c))
5241 else if (flags & ANYOF_FOLD) {
5244 if (flags & ANYOF_LOCALE) {
5245 PL_reg_flags |= RF_tainted;
5246 f = PL_fold_locale[c];
5250 if (f != c && ANYOF_BITMAP_TEST(n, f))
5254 if (!match && (flags & ANYOF_CLASS)) {
5255 PL_reg_flags |= RF_tainted;
5257 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5258 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5259 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5260 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5261 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5262 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5263 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5264 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5265 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5266 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5267 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5268 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5269 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5270 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5271 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5272 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5273 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5274 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5275 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5276 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5277 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5278 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5279 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5280 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5281 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5282 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5283 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5284 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5285 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5286 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5287 ) /* How's that for a conditional? */
5294 return (flags & ANYOF_INVERT) ? !match : match;
5298 S_reghop3(U8 *s, I32 off, const U8* lim)
5302 while (off-- && s < lim) {
5303 /* XXX could check well-formedness here */
5311 if (UTF8_IS_CONTINUED(*s)) {
5312 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5315 /* XXX could check well-formedness here */
5323 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5327 while (off-- && s < lim) {
5328 /* XXX could check well-formedness here */
5338 if (UTF8_IS_CONTINUED(*s)) {
5339 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5342 /* XXX could check well-formedness here */
5354 restore_pos(pTHX_ void *arg)
5357 regexp * const rex = (regexp *)arg;
5358 if (PL_reg_eval_set) {
5359 if (PL_reg_oldsaved) {
5360 rex->subbeg = PL_reg_oldsaved;
5361 rex->sublen = PL_reg_oldsavedlen;
5362 #ifdef PERL_OLD_COPY_ON_WRITE
5363 rex->saved_copy = PL_nrs;
5365 RX_MATCH_COPIED_on(rex);
5367 PL_reg_magic->mg_len = PL_reg_oldpos;
5368 PL_reg_eval_set = 0;
5369 PL_curpm = PL_reg_oldcurpm;
5374 S_to_utf8_substr(pTHX_ register regexp *prog)
5376 if (prog->float_substr && !prog->float_utf8) {
5377 SV* const sv = newSVsv(prog->float_substr);
5378 prog->float_utf8 = sv;
5379 sv_utf8_upgrade(sv);
5380 if (SvTAIL(prog->float_substr))
5382 if (prog->float_substr == prog->check_substr)
5383 prog->check_utf8 = sv;
5385 if (prog->anchored_substr && !prog->anchored_utf8) {
5386 SV* const sv = newSVsv(prog->anchored_substr);
5387 prog->anchored_utf8 = sv;
5388 sv_utf8_upgrade(sv);
5389 if (SvTAIL(prog->anchored_substr))
5391 if (prog->anchored_substr == prog->check_substr)
5392 prog->check_utf8 = sv;
5397 S_to_byte_substr(pTHX_ register regexp *prog)
5400 if (prog->float_utf8 && !prog->float_substr) {
5401 SV* sv = newSVsv(prog->float_utf8);
5402 prog->float_substr = sv;
5403 if (sv_utf8_downgrade(sv, TRUE)) {
5404 if (SvTAIL(prog->float_utf8))
5408 prog->float_substr = sv = &PL_sv_undef;
5410 if (prog->float_utf8 == prog->check_utf8)
5411 prog->check_substr = sv;
5413 if (prog->anchored_utf8 && !prog->anchored_substr) {
5414 SV* sv = newSVsv(prog->anchored_utf8);
5415 prog->anchored_substr = sv;
5416 if (sv_utf8_downgrade(sv, TRUE)) {
5417 if (SvTAIL(prog->anchored_utf8))
5421 prog->anchored_substr = sv = &PL_sv_undef;
5423 if (prog->anchored_utf8 == prog->check_utf8)
5424 prog->check_substr = sv;
5430 * c-indentation-style: bsd
5432 * indent-tabs-mode: t
5435 * ex: set ts=8 sts=4 sw=4 noet: