5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 7
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regoffs[p].end);
193 SSPUSHINT(PL_regoffs[p].start);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regoffs[p].start,
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
200 (IV)PL_regoffs[p].end
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regoffs);
205 SSPUSHINT(PL_regsize);
206 SSPUSHINT(*PL_reglastparen);
207 SSPUSHINT(*PL_reglastcloseparen);
208 SSPUSHPTR(PL_reginput);
209 #define REGCP_FRAME_ELEMS 2
210 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
211 * are needed for the regexp context stack bookkeeping. */
212 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
213 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
218 /* These are needed since we do not localize EVAL nodes: */
219 #define REGCP_SET(cp) \
221 PerlIO_printf(Perl_debug_log, \
222 " Setting an EVAL scope, savestack=%"IVdf"\n", \
223 (IV)PL_savestack_ix)); \
226 #define REGCP_UNWIND(cp) \
228 if (cp != PL_savestack_ix) \
229 PerlIO_printf(Perl_debug_log, \
230 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
231 (IV)(cp), (IV)PL_savestack_ix)); \
235 S_regcppop(pTHX_ const regexp *rex)
241 GET_RE_DEBUG_FLAGS_DECL;
243 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
245 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
246 i = SSPOPINT; /* Parentheses elements to pop. */
247 input = (char *) SSPOPPTR;
248 *PL_reglastcloseparen = SSPOPINT;
249 *PL_reglastparen = SSPOPINT;
250 PL_regsize = SSPOPINT;
251 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
254 /* Now restore the parentheses context. */
255 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
256 i > 0; i -= REGCP_PAREN_ELEMS) {
258 U32 paren = (U32)SSPOPINT;
259 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
260 PL_regoffs[paren].start = SSPOPINT;
262 if (paren <= *PL_reglastparen)
263 PL_regoffs[paren].end = tmps;
265 PerlIO_printf(Perl_debug_log,
266 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
267 (UV)paren, (IV)PL_regoffs[paren].start,
268 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
269 (IV)PL_regoffs[paren].end,
270 (paren > *PL_reglastparen ? "(no)" : ""));
274 if (*PL_reglastparen + 1 <= rex->nparens) {
275 PerlIO_printf(Perl_debug_log,
276 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
277 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
281 /* It would seem that the similar code in regtry()
282 * already takes care of this, and in fact it is in
283 * a better location to since this code can #if 0-ed out
284 * but the code in regtry() is needed or otherwise tests
285 * requiring null fields (pat.t#187 and split.t#{13,14}
286 * (as of patchlevel 7877) will fail. Then again,
287 * this code seems to be necessary or otherwise
288 * building DynaLoader will fail:
289 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
291 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
293 PL_regoffs[i].start = -1;
294 PL_regoffs[i].end = -1;
300 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
303 * pregexec and friends
306 #ifndef PERL_IN_XSUB_RE
308 - pregexec - match a regexp against a string
311 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
312 char *strbeg, I32 minend, SV *screamer, U32 nosave)
313 /* strend: pointer to null at end of string */
314 /* strbeg: real beginning of string */
315 /* minend: end of match must be >=minend after stringarg. */
316 /* nosave: For optimizations. */
319 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
320 nosave ? 0 : REXEC_COPY_STR);
325 * Need to implement the following flags for reg_anch:
327 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
329 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
330 * INTUIT_AUTORITATIVE_ML
331 * INTUIT_ONCE_NOML - Intuit can match in one location only.
334 * Another flag for this function: SECOND_TIME (so that float substrs
335 * with giant delta may be not rechecked).
338 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
340 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
341 Otherwise, only SvCUR(sv) is used to get strbeg. */
343 /* XXXX We assume that strpos is strbeg unless sv. */
345 /* XXXX Some places assume that there is a fixed substring.
346 An update may be needed if optimizer marks as "INTUITable"
347 RExen without fixed substrings. Similarly, it is assumed that
348 lengths of all the strings are no more than minlen, thus they
349 cannot come from lookahead.
350 (Or minlen should take into account lookahead.)
351 NOTE: Some of this comment is not correct. minlen does now take account
352 of lookahead/behind. Further research is required. -- demerphq
356 /* A failure to find a constant substring means that there is no need to make
357 an expensive call to REx engine, thus we celebrate a failure. Similarly,
358 finding a substring too deep into the string means that less calls to
359 regtry() should be needed.
361 REx compiler's optimizer found 4 possible hints:
362 a) Anchored substring;
364 c) Whether we are anchored (beginning-of-line or \G);
365 d) First node (of those at offset 0) which may distingush positions;
366 We use a)b)d) and multiline-part of c), and try to find a position in the
367 string which does not contradict any of them.
370 /* Most of decisions we do here should have been done at compile time.
371 The nodes of the REx which we used for the search should have been
372 deleted from the finite automaton. */
375 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
376 char *strend, U32 flags, re_scream_pos_data *data)
379 register I32 start_shift = 0;
380 /* Should be nonnegative! */
381 register I32 end_shift = 0;
386 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
388 register char *other_last = NULL; /* other substr checked before this */
389 char *check_at = NULL; /* check substr found at this pos */
390 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
391 RXi_GET_DECL(prog,progi);
393 const char * const i_strpos = strpos;
396 GET_RE_DEBUG_FLAGS_DECL;
398 RX_MATCH_UTF8_set(prog,do_utf8);
400 if (prog->extflags & RXf_UTF8) {
401 PL_reg_flags |= RF_utf8;
404 debug_start_match(prog, do_utf8, strpos, strend,
405 sv ? "Guessing start of match in sv for"
406 : "Guessing start of match in string for");
409 /* CHR_DIST() would be more correct here but it makes things slow. */
410 if (prog->minlen > strend - strpos) {
411 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
412 "String too short... [re_intuit_start]\n"));
416 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
419 if (!prog->check_utf8 && prog->check_substr)
420 to_utf8_substr(prog);
421 check = prog->check_utf8;
423 if (!prog->check_substr && prog->check_utf8)
424 to_byte_substr(prog);
425 check = prog->check_substr;
427 if (check == &PL_sv_undef) {
428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
429 "Non-utf8 string cannot match utf8 check string\n"));
432 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
433 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
434 || ( (prog->extflags & RXf_ANCH_BOL)
435 && !multiline ) ); /* Check after \n? */
438 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
439 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
440 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
442 && (strpos != strbeg)) {
443 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
446 if (prog->check_offset_min == prog->check_offset_max &&
447 !(prog->extflags & RXf_CANY_SEEN)) {
448 /* Substring at constant offset from beg-of-str... */
451 s = HOP3c(strpos, prog->check_offset_min, strend);
454 slen = SvCUR(check); /* >= 1 */
456 if ( strend - s > slen || strend - s < slen - 1
457 || (strend - s == slen && strend[-1] != '\n')) {
458 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
461 /* Now should match s[0..slen-2] */
463 if (slen && (*SvPVX_const(check) != *s
465 && memNE(SvPVX_const(check), s, slen)))) {
467 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
471 else if (*SvPVX_const(check) != *s
472 || ((slen = SvCUR(check)) > 1
473 && memNE(SvPVX_const(check), s, slen)))
476 goto success_at_start;
479 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
481 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
482 end_shift = prog->check_end_shift;
485 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
486 - (SvTAIL(check) != 0);
487 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
489 if (end_shift < eshift)
493 else { /* Can match at random position */
496 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
497 end_shift = prog->check_end_shift;
499 /* end shift should be non negative here */
502 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
504 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
505 (IV)end_shift, prog->precomp);
509 /* Find a possible match in the region s..strend by looking for
510 the "check" substring in the region corrected by start/end_shift. */
513 I32 srch_start_shift = start_shift;
514 I32 srch_end_shift = end_shift;
515 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
516 srch_end_shift -= ((strbeg - s) - srch_start_shift);
517 srch_start_shift = strbeg - s;
519 DEBUG_OPTIMISE_MORE_r({
520 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
521 (IV)prog->check_offset_min,
522 (IV)srch_start_shift,
524 (IV)prog->check_end_shift);
527 if (flags & REXEC_SCREAM) {
528 I32 p = -1; /* Internal iterator of scream. */
529 I32 * const pp = data ? data->scream_pos : &p;
531 if (PL_screamfirst[BmRARE(check)] >= 0
532 || ( BmRARE(check) == '\n'
533 && (BmPREVIOUS(check) == SvCUR(check) - 1)
535 s = screaminstr(sv, check,
536 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
539 /* we may be pointing at the wrong string */
540 if (s && RX_MATCH_COPIED(prog))
541 s = strbeg + (s - SvPVX_const(sv));
543 *data->scream_olds = s;
548 if (prog->extflags & RXf_CANY_SEEN) {
549 start_point= (U8*)(s + srch_start_shift);
550 end_point= (U8*)(strend - srch_end_shift);
552 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
553 end_point= HOP3(strend, -srch_end_shift, strbeg);
555 DEBUG_OPTIMISE_MORE_r({
556 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
557 (int)(end_point - start_point),
558 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
562 s = fbm_instr( start_point, end_point,
563 check, multiline ? FBMrf_MULTILINE : 0);
566 /* Update the count-of-usability, remove useless subpatterns,
570 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
571 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
572 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
573 (s ? "Found" : "Did not find"),
574 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
575 ? "anchored" : "floating"),
578 (s ? " at offset " : "...\n") );
583 /* Finish the diagnostic message */
584 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
586 /* XXX dmq: first branch is for positive lookbehind...
587 Our check string is offset from the beginning of the pattern.
588 So we need to do any stclass tests offset forward from that
597 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
598 Start with the other substr.
599 XXXX no SCREAM optimization yet - and a very coarse implementation
600 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
601 *always* match. Probably should be marked during compile...
602 Probably it is right to do no SCREAM here...
605 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
606 : (prog->float_substr && prog->anchored_substr))
608 /* Take into account the "other" substring. */
609 /* XXXX May be hopelessly wrong for UTF... */
612 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
615 char * const last = HOP3c(s, -start_shift, strbeg);
617 char * const saved_s = s;
620 t = s - prog->check_offset_max;
621 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
623 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
628 t = HOP3c(t, prog->anchored_offset, strend);
629 if (t < other_last) /* These positions already checked */
631 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
634 /* XXXX It is not documented what units *_offsets are in.
635 We assume bytes, but this is clearly wrong.
636 Meaning this code needs to be carefully reviewed for errors.
640 /* On end-of-str: see comment below. */
641 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
642 if (must == &PL_sv_undef) {
644 DEBUG_r(must = prog->anchored_utf8); /* for debug */
649 HOP3(HOP3(last1, prog->anchored_offset, strend)
650 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
652 multiline ? FBMrf_MULTILINE : 0
655 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
656 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
657 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
658 (s ? "Found" : "Contradicts"),
659 quoted, RE_SV_TAIL(must));
664 if (last1 >= last2) {
665 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
666 ", giving up...\n"));
669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
670 ", trying floating at offset %ld...\n",
671 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
672 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
673 s = HOP3c(last, 1, strend);
677 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
678 (long)(s - i_strpos)));
679 t = HOP3c(s, -prog->anchored_offset, strbeg);
680 other_last = HOP3c(s, 1, strend);
688 else { /* Take into account the floating substring. */
690 char * const saved_s = s;
693 t = HOP3c(s, -start_shift, strbeg);
695 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
696 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
697 last = HOP3c(t, prog->float_max_offset, strend);
698 s = HOP3c(t, prog->float_min_offset, strend);
701 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
702 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
703 /* fbm_instr() takes into account exact value of end-of-str
704 if the check is SvTAIL(ed). Since false positives are OK,
705 and end-of-str is not later than strend we are OK. */
706 if (must == &PL_sv_undef) {
708 DEBUG_r(must = prog->float_utf8); /* for debug message */
711 s = fbm_instr((unsigned char*)s,
712 (unsigned char*)last + SvCUR(must)
714 must, multiline ? FBMrf_MULTILINE : 0);
716 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
717 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
718 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
719 (s ? "Found" : "Contradicts"),
720 quoted, RE_SV_TAIL(must));
724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
725 ", giving up...\n"));
728 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
729 ", trying anchored starting at offset %ld...\n",
730 (long)(saved_s + 1 - i_strpos)));
732 s = HOP3c(t, 1, strend);
736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
737 (long)(s - i_strpos)));
738 other_last = s; /* Fix this later. --Hugo */
748 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
750 DEBUG_OPTIMISE_MORE_r(
751 PerlIO_printf(Perl_debug_log,
752 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
753 (IV)prog->check_offset_min,
754 (IV)prog->check_offset_max,
762 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
764 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
767 /* Fixed substring is found far enough so that the match
768 cannot start at strpos. */
770 if (ml_anch && t[-1] != '\n') {
771 /* Eventually fbm_*() should handle this, but often
772 anchored_offset is not 0, so this check will not be wasted. */
773 /* XXXX In the code below we prefer to look for "^" even in
774 presence of anchored substrings. And we search even
775 beyond the found float position. These pessimizations
776 are historical artefacts only. */
778 while (t < strend - prog->minlen) {
780 if (t < check_at - prog->check_offset_min) {
781 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
782 /* Since we moved from the found position,
783 we definitely contradict the found anchored
784 substr. Due to the above check we do not
785 contradict "check" substr.
786 Thus we can arrive here only if check substr
787 is float. Redo checking for "other"=="fixed".
790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
791 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
792 goto do_other_anchored;
794 /* We don't contradict the found floating substring. */
795 /* XXXX Why not check for STCLASS? */
797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
798 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
801 /* Position contradicts check-string */
802 /* XXXX probably better to look for check-string
803 than for "\n", so one should lower the limit for t? */
804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
805 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
806 other_last = strpos = s = t + 1;
811 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
812 PL_colors[0], PL_colors[1]));
816 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
817 PL_colors[0], PL_colors[1]));
821 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
824 /* The found string does not prohibit matching at strpos,
825 - no optimization of calling REx engine can be performed,
826 unless it was an MBOL and we are not after MBOL,
827 or a future STCLASS check will fail this. */
829 /* Even in this situation we may use MBOL flag if strpos is offset
830 wrt the start of the string. */
831 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
832 && (strpos != strbeg) && strpos[-1] != '\n'
833 /* May be due to an implicit anchor of m{.*foo} */
834 && !(prog->intflags & PREGf_IMPLICIT))
839 DEBUG_EXECUTE_r( if (ml_anch)
840 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
841 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
844 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
846 prog->check_utf8 /* Could be deleted already */
847 && --BmUSEFUL(prog->check_utf8) < 0
848 && (prog->check_utf8 == prog->float_utf8)
850 prog->check_substr /* Could be deleted already */
851 && --BmUSEFUL(prog->check_substr) < 0
852 && (prog->check_substr == prog->float_substr)
855 /* If flags & SOMETHING - do not do it many times on the same match */
856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
857 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
858 if (do_utf8 ? prog->check_substr : prog->check_utf8)
859 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
860 prog->check_substr = prog->check_utf8 = NULL; /* disable */
861 prog->float_substr = prog->float_utf8 = NULL; /* clear */
862 check = NULL; /* abort */
864 /* XXXX This is a remnant of the old implementation. It
865 looks wasteful, since now INTUIT can use many
867 prog->extflags &= ~RXf_USE_INTUIT;
874 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
875 /* trie stclasses are too expensive to use here, we are better off to
876 leave it to regmatch itself */
877 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
878 /* minlen == 0 is possible if regstclass is \b or \B,
879 and the fixed substr is ''$.
880 Since minlen is already taken into account, s+1 is before strend;
881 accidentally, minlen >= 1 guaranties no false positives at s + 1
882 even for \b or \B. But (minlen? 1 : 0) below assumes that
883 regstclass does not come from lookahead... */
884 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
885 This leaves EXACTF only, which is dealt with in find_byclass(). */
886 const U8* const str = (U8*)STRING(progi->regstclass);
887 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
888 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
891 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
892 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
893 else if (prog->float_substr || prog->float_utf8)
894 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
898 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
899 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
902 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
905 const char *what = NULL;
907 if (endpos == strend) {
908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
909 "Could not match STCLASS...\n") );
912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
913 "This position contradicts STCLASS...\n") );
914 if ((prog->extflags & RXf_ANCH) && !ml_anch)
916 /* Contradict one of substrings */
917 if (prog->anchored_substr || prog->anchored_utf8) {
918 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
919 DEBUG_EXECUTE_r( what = "anchored" );
921 s = HOP3c(t, 1, strend);
922 if (s + start_shift + end_shift > strend) {
923 /* XXXX Should be taken into account earlier? */
924 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
925 "Could not match STCLASS...\n") );
930 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
931 "Looking for %s substr starting at offset %ld...\n",
932 what, (long)(s + start_shift - i_strpos)) );
935 /* Have both, check_string is floating */
936 if (t + start_shift >= check_at) /* Contradicts floating=check */
937 goto retry_floating_check;
938 /* Recheck anchored substring, but not floating... */
942 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
943 "Looking for anchored substr starting at offset %ld...\n",
944 (long)(other_last - i_strpos)) );
945 goto do_other_anchored;
947 /* Another way we could have checked stclass at the
948 current position only: */
953 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
954 "Looking for /%s^%s/m starting at offset %ld...\n",
955 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
958 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
960 /* Check is floating subtring. */
961 retry_floating_check:
962 t = check_at - start_shift;
963 DEBUG_EXECUTE_r( what = "floating" );
964 goto hop_and_restart;
967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
968 "By STCLASS: moving %ld --> %ld\n",
969 (long)(t - i_strpos), (long)(s - i_strpos))
973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
974 "Does not contradict STCLASS...\n");
979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
980 PL_colors[4], (check ? "Guessed" : "Giving up"),
981 PL_colors[5], (long)(s - i_strpos)) );
984 fail_finish: /* Substring not found */
985 if (prog->check_substr || prog->check_utf8) /* could be removed already */
986 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
988 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
989 PL_colors[4], PL_colors[5]));
995 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
996 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
997 switch (trie_type) { \
998 case trie_utf8_fold: \
1000 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1005 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1006 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1007 foldlen -= UNISKIP( uvc ); \
1008 uscan = foldbuf + UNISKIP( uvc ); \
1012 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1020 charid = trie->charmap[ uvc ]; \
1024 if (widecharmap) { \
1025 SV** const svpp = hv_fetch(widecharmap, \
1026 (char*)&uvc, sizeof(UV), 0); \
1028 charid = (U16)SvIV(*svpp); \
1033 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1036 ibcmp_utf8(s, NULL, 0, do_utf8, \
1037 m, NULL, ln, (bool)UTF)) \
1038 && (!reginfo || regtry(reginfo, &s)) ) \
1041 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1042 uvchr_to_utf8(tmpbuf, c); \
1043 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1045 && (f == c1 || f == c2) \
1046 && (ln == foldlen || \
1047 !ibcmp_utf8((char *) foldbuf, \
1048 NULL, foldlen, do_utf8, \
1050 NULL, ln, (bool)UTF)) \
1051 && (!reginfo || regtry(reginfo, &s)) ) \
1056 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1060 && (ln == 1 || !(OP(c) == EXACTF \
1062 : ibcmp_locale(s, m, ln))) \
1063 && (!reginfo || regtry(reginfo, &s)) ) \
1069 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1071 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1077 #define REXEC_FBC_SCAN(CoDe) \
1079 while (s < strend) { \
1085 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1086 REXEC_FBC_UTF8_SCAN( \
1088 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1097 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1100 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1109 #define REXEC_FBC_TRYIT \
1110 if ((!reginfo || regtry(reginfo, &s))) \
1113 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1115 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1118 REXEC_FBC_CLASS_SCAN(CoNd); \
1122 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1125 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1128 REXEC_FBC_CLASS_SCAN(CoNd); \
1132 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1133 PL_reg_flags |= RF_tainted; \
1135 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1138 REXEC_FBC_CLASS_SCAN(CoNd); \
1142 #define DUMP_EXEC_POS(li,s,doutf8) \
1143 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1145 /* We know what class REx starts with. Try to find this position... */
1146 /* if reginfo is NULL, its a dryrun */
1147 /* annoyingly all the vars in this routine have different names from their counterparts
1148 in regmatch. /grrr */
1151 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1152 const char *strend, regmatch_info *reginfo)
1155 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1159 register STRLEN uskip;
1163 register I32 tmp = 1; /* Scratch variable? */
1164 register const bool do_utf8 = PL_reg_match_utf8;
1165 RXi_GET_DECL(prog,progi);
1167 /* We know what class it must start with. */
1171 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1172 !UTF8_IS_INVARIANT((U8)s[0]) ?
1173 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1174 REGINCLASS(prog, c, (U8*)s));
1177 while (s < strend) {
1180 if (REGINCLASS(prog, c, (U8*)s) ||
1181 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1182 /* The assignment of 2 is intentional:
1183 * for the folded sharp s, the skip is 2. */
1184 (skip = SHARP_S_SKIP))) {
1185 if (tmp && (!reginfo || regtry(reginfo, &s)))
1198 if (tmp && (!reginfo || regtry(reginfo, &s)))
1206 ln = STR_LEN(c); /* length to match in octets/bytes */
1207 lnc = (I32) ln; /* length to match in characters */
1209 STRLEN ulen1, ulen2;
1211 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1212 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1213 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1215 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1216 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1218 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1220 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1223 while (sm < ((U8 *) m + ln)) {
1238 c2 = PL_fold_locale[c1];
1240 e = HOP3c(strend, -((I32)lnc), s);
1242 if (!reginfo && e < s)
1243 e = s; /* Due to minlen logic of intuit() */
1245 /* The idea in the EXACTF* cases is to first find the
1246 * first character of the EXACTF* node and then, if
1247 * necessary, case-insensitively compare the full
1248 * text of the node. The c1 and c2 are the first
1249 * characters (though in Unicode it gets a bit
1250 * more complicated because there are more cases
1251 * than just upper and lower: one needs to use
1252 * the so-called folding case for case-insensitive
1253 * matching (called "loose matching" in Unicode).
1254 * ibcmp_utf8() will do just that. */
1258 U8 tmpbuf [UTF8_MAXBYTES+1];
1259 STRLEN len, foldlen;
1260 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1262 /* Upper and lower of 1st char are equal -
1263 * probably not a "letter". */
1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1267 REXEC_FBC_EXACTISH_CHECK(c == c1);
1272 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1275 /* Handle some of the three Greek sigmas cases.
1276 * Note that not all the possible combinations
1277 * are handled here: some of them are handled
1278 * by the standard folding rules, and some of
1279 * them (the character class or ANYOF cases)
1280 * are handled during compiletime in
1281 * regexec.c:S_regclass(). */
1282 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1283 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1284 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1286 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1292 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1294 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1298 PL_reg_flags |= RF_tainted;
1305 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1306 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1308 tmp = ((OP(c) == BOUND ?
1309 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1310 LOAD_UTF8_CHARCLASS_ALNUM();
1311 REXEC_FBC_UTF8_SCAN(
1312 if (tmp == !(OP(c) == BOUND ?
1313 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1314 isALNUM_LC_utf8((U8*)s)))
1322 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1323 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1326 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1332 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1336 PL_reg_flags |= RF_tainted;
1343 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1344 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1346 tmp = ((OP(c) == NBOUND ?
1347 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1348 LOAD_UTF8_CHARCLASS_ALNUM();
1349 REXEC_FBC_UTF8_SCAN(
1350 if (tmp == !(OP(c) == NBOUND ?
1351 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1352 isALNUM_LC_utf8((U8*)s)))
1354 else REXEC_FBC_TRYIT;
1358 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1359 tmp = ((OP(c) == NBOUND ?
1360 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1363 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1365 else REXEC_FBC_TRYIT;
1368 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1372 REXEC_FBC_CSCAN_PRELOAD(
1373 LOAD_UTF8_CHARCLASS_ALNUM(),
1374 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1378 REXEC_FBC_CSCAN_TAINT(
1379 isALNUM_LC_utf8((U8*)s),
1383 REXEC_FBC_CSCAN_PRELOAD(
1384 LOAD_UTF8_CHARCLASS_ALNUM(),
1385 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1389 REXEC_FBC_CSCAN_TAINT(
1390 !isALNUM_LC_utf8((U8*)s),
1394 REXEC_FBC_CSCAN_PRELOAD(
1395 LOAD_UTF8_CHARCLASS_SPACE(),
1396 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1400 REXEC_FBC_CSCAN_TAINT(
1401 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1405 REXEC_FBC_CSCAN_PRELOAD(
1406 LOAD_UTF8_CHARCLASS_SPACE(),
1407 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1411 REXEC_FBC_CSCAN_TAINT(
1412 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1416 REXEC_FBC_CSCAN_PRELOAD(
1417 LOAD_UTF8_CHARCLASS_DIGIT(),
1418 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1422 REXEC_FBC_CSCAN_TAINT(
1423 isDIGIT_LC_utf8((U8*)s),
1427 REXEC_FBC_CSCAN_PRELOAD(
1428 LOAD_UTF8_CHARCLASS_DIGIT(),
1429 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1433 REXEC_FBC_CSCAN_TAINT(
1434 !isDIGIT_LC_utf8((U8*)s),
1440 is_LNBREAK_latin1(s)
1450 !is_VERTWS_latin1(s)
1455 is_HORIZWS_latin1(s)
1459 !is_HORIZWS_utf8(s),
1460 !is_HORIZWS_latin1(s)
1465 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1466 trie_type = do_utf8 ?
1467 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1469 /* what trie are we using right now */
1471 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1473 = (reg_trie_data*)progi->data->data[ aho->trie ];
1474 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1476 const char *last_start = strend - trie->minlen;
1478 const char *real_start = s;
1480 STRLEN maxlen = trie->maxlen;
1482 U8 **points; /* map of where we were in the input string
1483 when reading a given char. For ASCII this
1484 is unnecessary overhead as the relationship
1485 is always 1:1, but for unicode, especially
1486 case folded unicode this is not true. */
1487 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1491 GET_RE_DEBUG_FLAGS_DECL;
1493 /* We can't just allocate points here. We need to wrap it in
1494 * an SV so it gets freed properly if there is a croak while
1495 * running the match */
1498 sv_points=newSV(maxlen * sizeof(U8 *));
1499 SvCUR_set(sv_points,
1500 maxlen * sizeof(U8 *));
1501 SvPOK_on(sv_points);
1502 sv_2mortal(sv_points);
1503 points=(U8**)SvPV_nolen(sv_points );
1504 if ( trie_type != trie_utf8_fold
1505 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1508 bitmap=(U8*)trie->bitmap;
1510 bitmap=(U8*)ANYOF_BITMAP(c);
1512 /* this is the Aho-Corasick algorithm modified a touch
1513 to include special handling for long "unknown char"
1514 sequences. The basic idea being that we use AC as long
1515 as we are dealing with a possible matching char, when
1516 we encounter an unknown char (and we have not encountered
1517 an accepting state) we scan forward until we find a legal
1519 AC matching is basically that of trie matching, except
1520 that when we encounter a failing transition, we fall back
1521 to the current states "fail state", and try the current char
1522 again, a process we repeat until we reach the root state,
1523 state 1, or a legal transition. If we fail on the root state
1524 then we can either terminate if we have reached an accepting
1525 state previously, or restart the entire process from the beginning
1529 while (s <= last_start) {
1530 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1538 U8 *uscan = (U8*)NULL;
1539 U8 *leftmost = NULL;
1541 U32 accepted_word= 0;
1545 while ( state && uc <= (U8*)strend ) {
1547 U32 word = aho->states[ state ].wordnum;
1551 DEBUG_TRIE_EXECUTE_r(
1552 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1553 dump_exec_pos( (char *)uc, c, strend, real_start,
1554 (char *)uc, do_utf8 );
1555 PerlIO_printf( Perl_debug_log,
1556 " Scanning for legal start char...\n");
1559 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1564 if (uc >(U8*)last_start) break;
1568 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1569 if (!leftmost || lpos < leftmost) {
1570 DEBUG_r(accepted_word=word);
1576 points[pointpos++ % maxlen]= uc;
1577 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1578 uscan, len, uvc, charid, foldlen,
1580 DEBUG_TRIE_EXECUTE_r({
1581 dump_exec_pos( (char *)uc, c, strend, real_start,
1583 PerlIO_printf(Perl_debug_log,
1584 " Charid:%3u CP:%4"UVxf" ",
1590 word = aho->states[ state ].wordnum;
1592 base = aho->states[ state ].trans.base;
1594 DEBUG_TRIE_EXECUTE_r({
1596 dump_exec_pos( (char *)uc, c, strend, real_start,
1598 PerlIO_printf( Perl_debug_log,
1599 "%sState: %4"UVxf", word=%"UVxf,
1600 failed ? " Fail transition to " : "",
1601 (UV)state, (UV)word);
1606 (base + charid > trie->uniquecharcount )
1607 && (base + charid - 1 - trie->uniquecharcount
1609 && trie->trans[base + charid - 1 -
1610 trie->uniquecharcount].check == state
1611 && (tmp=trie->trans[base + charid - 1 -
1612 trie->uniquecharcount ].next))
1614 DEBUG_TRIE_EXECUTE_r(
1615 PerlIO_printf( Perl_debug_log," - legal\n"));
1620 DEBUG_TRIE_EXECUTE_r(
1621 PerlIO_printf( Perl_debug_log," - fail\n"));
1623 state = aho->fail[state];
1627 /* we must be accepting here */
1628 DEBUG_TRIE_EXECUTE_r(
1629 PerlIO_printf( Perl_debug_log," - accepting\n"));
1638 if (!state) state = 1;
1641 if ( aho->states[ state ].wordnum ) {
1642 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1643 if (!leftmost || lpos < leftmost) {
1644 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1649 s = (char*)leftmost;
1650 DEBUG_TRIE_EXECUTE_r({
1652 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1653 (UV)accepted_word, (IV)(s - real_start)
1656 if (!reginfo || regtry(reginfo, &s)) {
1662 DEBUG_TRIE_EXECUTE_r({
1663 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1666 DEBUG_TRIE_EXECUTE_r(
1667 PerlIO_printf( Perl_debug_log,"No match.\n"));
1676 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1685 S_swap_match_buff (pTHX_ regexp *prog) {
1686 regexp_paren_pair *t;
1689 /* We have to be careful. If the previous successful match
1690 was from this regex we don't want a subsequent paritally
1691 successful match to clobber the old results.
1692 So when we detect this possibility we add a swap buffer
1693 to the re, and switch the buffer each match. If we fail
1694 we switch it back, otherwise we leave it swapped.
1696 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
1699 prog->swap = prog->offs;
1705 - regexec_flags - match a regexp against a string
1708 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1709 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1710 /* strend: pointer to null at end of string */
1711 /* strbeg: real beginning of string */
1712 /* minend: end of match must be >=minend after stringarg. */
1713 /* data: May be used for some additional optimizations.
1714 Currently its only used, with a U32 cast, for transmitting
1715 the ganch offset when doing a /g match. This will change */
1716 /* nosave: For optimizations. */
1719 /*register*/ char *s;
1720 register regnode *c;
1721 /*register*/ char *startpos = stringarg;
1722 I32 minlen; /* must match at least this many chars */
1723 I32 dontbother = 0; /* how many characters not to try at end */
1724 I32 end_shift = 0; /* Same for the end. */ /* CC */
1725 I32 scream_pos = -1; /* Internal iterator of scream. */
1726 char *scream_olds = NULL;
1727 const bool do_utf8 = (bool)DO_UTF8(sv);
1729 RXi_GET_DECL(prog,progi);
1730 regmatch_info reginfo; /* create some info to pass to regtry etc */
1731 bool swap_on_fail = 0;
1733 GET_RE_DEBUG_FLAGS_DECL;
1735 PERL_UNUSED_ARG(data);
1737 /* Be paranoid... */
1738 if (prog == NULL || startpos == NULL) {
1739 Perl_croak(aTHX_ "NULL regexp parameter");
1743 multiline = prog->extflags & RXf_PMf_MULTILINE;
1744 reginfo.prog = prog;
1746 RX_MATCH_UTF8_set(prog, do_utf8);
1748 debug_start_match(prog, do_utf8, startpos, strend,
1752 minlen = prog->minlen;
1754 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1755 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1756 "String too short [regexec_flags]...\n"));
1761 /* Check validity of program. */
1762 if (UCHARAT(progi->program) != REG_MAGIC) {
1763 Perl_croak(aTHX_ "corrupted regexp program");
1767 PL_reg_eval_set = 0;
1770 if (prog->extflags & RXf_UTF8)
1771 PL_reg_flags |= RF_utf8;
1773 /* Mark beginning of line for ^ and lookbehind. */
1774 reginfo.bol = startpos; /* XXX not used ??? */
1778 /* Mark end of line for $ (and such) */
1781 /* see how far we have to get to not match where we matched before */
1782 reginfo.till = startpos+minend;
1784 /* If there is a "must appear" string, look for it. */
1787 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1790 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1791 reginfo.ganch = startpos + prog->gofs;
1792 else if (sv && SvTYPE(sv) >= SVt_PVMG
1794 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1795 && mg->mg_len >= 0) {
1796 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1797 if (prog->extflags & RXf_ANCH_GPOS) {
1798 if (s > reginfo.ganch)
1800 s = reginfo.ganch - prog->gofs;
1804 reginfo.ganch = strbeg + PTR2UV(data);
1805 } else /* pos() not defined */
1806 reginfo.ganch = strbeg;
1808 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1810 swap_match_buff(prog); /* do we need a save destructor here for
1813 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1814 re_scream_pos_data d;
1816 d.scream_olds = &scream_olds;
1817 d.scream_pos = &scream_pos;
1818 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1821 goto phooey; /* not present */
1827 /* Simplest case: anchored match need be tried only once. */
1828 /* [unless only anchor is BOL and multiline is set] */
1829 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1830 if (s == startpos && regtry(®info, &startpos))
1832 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1833 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1838 dontbother = minlen - 1;
1839 end = HOP3c(strend, -dontbother, strbeg) - 1;
1840 /* for multiline we only have to try after newlines */
1841 if (prog->check_substr || prog->check_utf8) {
1845 if (regtry(®info, &s))
1850 if (prog->extflags & RXf_USE_INTUIT) {
1851 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1862 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1863 if (regtry(®info, &s))
1870 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1872 /* the warning about reginfo.ganch being used without intialization
1873 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1874 and we only enter this block when the same bit is set. */
1875 char *tmp_s = reginfo.ganch - prog->gofs;
1876 if (regtry(®info, &tmp_s))
1881 /* Messy cases: unanchored match. */
1882 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1883 /* we have /x+whatever/ */
1884 /* it must be a one character string (XXXX Except UTF?) */
1889 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1890 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1891 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1896 DEBUG_EXECUTE_r( did_match = 1 );
1897 if (regtry(®info, &s)) goto got_it;
1899 while (s < strend && *s == ch)
1907 DEBUG_EXECUTE_r( did_match = 1 );
1908 if (regtry(®info, &s)) goto got_it;
1910 while (s < strend && *s == ch)
1915 DEBUG_EXECUTE_r(if (!did_match)
1916 PerlIO_printf(Perl_debug_log,
1917 "Did not find anchored character...\n")
1920 else if (prog->anchored_substr != NULL
1921 || prog->anchored_utf8 != NULL
1922 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1923 && prog->float_max_offset < strend - s)) {
1928 char *last1; /* Last position checked before */
1932 if (prog->anchored_substr || prog->anchored_utf8) {
1933 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1934 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1935 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1936 back_max = back_min = prog->anchored_offset;
1938 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1939 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1940 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1941 back_max = prog->float_max_offset;
1942 back_min = prog->float_min_offset;
1946 if (must == &PL_sv_undef)
1947 /* could not downgrade utf8 check substring, so must fail */
1953 last = HOP3c(strend, /* Cannot start after this */
1954 -(I32)(CHR_SVLEN(must)
1955 - (SvTAIL(must) != 0) + back_min), strbeg);
1958 last1 = HOPc(s, -1);
1960 last1 = s - 1; /* bogus */
1962 /* XXXX check_substr already used to find "s", can optimize if
1963 check_substr==must. */
1965 dontbother = end_shift;
1966 strend = HOPc(strend, -dontbother);
1967 while ( (s <= last) &&
1968 ((flags & REXEC_SCREAM)
1969 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1970 end_shift, &scream_pos, 0))
1971 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1972 (unsigned char*)strend, must,
1973 multiline ? FBMrf_MULTILINE : 0))) ) {
1974 /* we may be pointing at the wrong string */
1975 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1976 s = strbeg + (s - SvPVX_const(sv));
1977 DEBUG_EXECUTE_r( did_match = 1 );
1978 if (HOPc(s, -back_max) > last1) {
1979 last1 = HOPc(s, -back_min);
1980 s = HOPc(s, -back_max);
1983 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1985 last1 = HOPc(s, -back_min);
1989 while (s <= last1) {
1990 if (regtry(®info, &s))
1996 while (s <= last1) {
1997 if (regtry(®info, &s))
2003 DEBUG_EXECUTE_r(if (!did_match) {
2004 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2005 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2006 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2007 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2008 ? "anchored" : "floating"),
2009 quoted, RE_SV_TAIL(must));
2013 else if ( (c = progi->regstclass) ) {
2015 const OPCODE op = OP(progi->regstclass);
2016 /* don't bother with what can't match */
2017 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2018 strend = HOPc(strend, -(minlen - 1));
2021 SV * const prop = sv_newmortal();
2022 regprop(prog, prop, c);
2024 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2026 PerlIO_printf(Perl_debug_log,
2027 "Matching stclass %.*s against %s (%d chars)\n",
2028 (int)SvCUR(prop), SvPVX_const(prop),
2029 quoted, (int)(strend - s));
2032 if (find_byclass(prog, c, s, strend, ®info))
2034 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2038 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2043 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2044 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2045 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2047 if (flags & REXEC_SCREAM) {
2048 last = screaminstr(sv, float_real, s - strbeg,
2049 end_shift, &scream_pos, 1); /* last one */
2051 last = scream_olds; /* Only one occurrence. */
2052 /* we may be pointing at the wrong string */
2053 else if (RX_MATCH_COPIED(prog))
2054 s = strbeg + (s - SvPVX_const(sv));
2058 const char * const little = SvPV_const(float_real, len);
2060 if (SvTAIL(float_real)) {
2061 if (memEQ(strend - len + 1, little, len - 1))
2062 last = strend - len + 1;
2063 else if (!multiline)
2064 last = memEQ(strend - len, little, len)
2065 ? strend - len : NULL;
2071 last = rninstr(s, strend, little, little + len);
2073 last = strend; /* matching "$" */
2078 PerlIO_printf(Perl_debug_log,
2079 "%sCan't trim the tail, match fails (should not happen)%s\n",
2080 PL_colors[4], PL_colors[5]));
2081 goto phooey; /* Should not happen! */
2083 dontbother = strend - last + prog->float_min_offset;
2085 if (minlen && (dontbother < minlen))
2086 dontbother = minlen - 1;
2087 strend -= dontbother; /* this one's always in bytes! */
2088 /* We don't know much -- general case. */
2091 if (regtry(®info, &s))
2100 if (regtry(®info, &s))
2102 } while (s++ < strend);
2110 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2112 if (PL_reg_eval_set)
2113 restore_pos(aTHX_ prog);
2114 if (prog->paren_names)
2115 (void)hv_iterinit(prog->paren_names);
2117 /* make sure $`, $&, $', and $digit will work later */
2118 if ( !(flags & REXEC_NOT_FIRST) ) {
2119 RX_MATCH_COPY_FREE(prog);
2120 if (flags & REXEC_COPY_STR) {
2121 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2122 #ifdef PERL_OLD_COPY_ON_WRITE
2124 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2126 PerlIO_printf(Perl_debug_log,
2127 "Copy on write: regexp capture, type %d\n",
2130 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2131 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2132 assert (SvPOKp(prog->saved_copy));
2136 RX_MATCH_COPIED_on(prog);
2137 s = savepvn(strbeg, i);
2143 prog->subbeg = strbeg;
2144 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2151 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2152 PL_colors[4], PL_colors[5]));
2153 if (PL_reg_eval_set)
2154 restore_pos(aTHX_ prog);
2156 /* we failed :-( roll it back */
2157 swap_match_buff(prog);
2164 - regtry - try match at specific point
2166 STATIC I32 /* 0 failure, 1 success */
2167 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2171 regexp *prog = reginfo->prog;
2172 RXi_GET_DECL(prog,progi);
2173 GET_RE_DEBUG_FLAGS_DECL;
2174 reginfo->cutpoint=NULL;
2176 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2179 PL_reg_eval_set = RS_init;
2180 DEBUG_EXECUTE_r(DEBUG_s(
2181 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2182 (IV)(PL_stack_sp - PL_stack_base));
2185 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2186 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2188 /* Apparently this is not needed, judging by wantarray. */
2189 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2190 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2193 /* Make $_ available to executed code. */
2194 if (reginfo->sv != DEFSV) {
2196 DEFSV = reginfo->sv;
2199 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2200 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2201 /* prepare for quick setting of pos */
2202 #ifdef PERL_OLD_COPY_ON_WRITE
2203 if (SvIsCOW(reginfo->sv))
2204 sv_force_normal_flags(reginfo->sv, 0);
2206 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2207 &PL_vtbl_mglob, NULL, 0);
2211 PL_reg_oldpos = mg->mg_len;
2212 SAVEDESTRUCTOR_X(restore_pos, prog);
2214 if (!PL_reg_curpm) {
2215 Newxz(PL_reg_curpm, 1, PMOP);
2218 SV* const repointer = newSViv(0);
2219 /* so we know which PL_regex_padav element is PL_reg_curpm */
2220 SvFLAGS(repointer) |= SVf_BREAK;
2221 av_push(PL_regex_padav,repointer);
2222 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2223 PL_regex_pad = AvARRAY(PL_regex_padav);
2227 PM_SETRE(PL_reg_curpm, prog);
2228 PL_reg_oldcurpm = PL_curpm;
2229 PL_curpm = PL_reg_curpm;
2230 if (RX_MATCH_COPIED(prog)) {
2231 /* Here is a serious problem: we cannot rewrite subbeg,
2232 since it may be needed if this match fails. Thus
2233 $` inside (?{}) could fail... */
2234 PL_reg_oldsaved = prog->subbeg;
2235 PL_reg_oldsavedlen = prog->sublen;
2236 #ifdef PERL_OLD_COPY_ON_WRITE
2237 PL_nrs = prog->saved_copy;
2239 RX_MATCH_COPIED_off(prog);
2242 PL_reg_oldsaved = NULL;
2243 prog->subbeg = PL_bostr;
2244 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2246 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2247 prog->offs[0].start = *startpos - PL_bostr;
2248 PL_reginput = *startpos;
2249 PL_reglastparen = &prog->lastparen;
2250 PL_reglastcloseparen = &prog->lastcloseparen;
2251 prog->lastparen = 0;
2252 prog->lastcloseparen = 0;
2254 PL_regoffs = prog->offs;
2255 if (PL_reg_start_tmpl <= prog->nparens) {
2256 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2257 if(PL_reg_start_tmp)
2258 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2260 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2263 /* XXXX What this code is doing here?!!! There should be no need
2264 to do this again and again, PL_reglastparen should take care of
2267 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2268 * Actually, the code in regcppop() (which Ilya may be meaning by
2269 * PL_reglastparen), is not needed at all by the test suite
2270 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2271 * enough, for building DynaLoader, or otherwise this
2272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2273 * will happen. Meanwhile, this code *is* needed for the
2274 * above-mentioned test suite tests to succeed. The common theme
2275 * on those tests seems to be returning null fields from matches.
2278 if (prog->nparens) {
2279 regexp_paren_pair *pp = PL_regoffs;
2281 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2289 if (regmatch(reginfo, progi->program + 1)) {
2290 PL_regoffs[0].end = PL_reginput - PL_bostr;
2293 if (reginfo->cutpoint)
2294 *startpos= reginfo->cutpoint;
2295 REGCP_UNWIND(lastcp);
2300 #define sayYES goto yes
2301 #define sayNO goto no
2302 #define sayNO_SILENT goto no_silent
2304 /* we dont use STMT_START/END here because it leads to
2305 "unreachable code" warnings, which are bogus, but distracting. */
2306 #define CACHEsayNO \
2307 if (ST.cache_mask) \
2308 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2311 /* this is used to determine how far from the left messages like
2312 'failed...' are printed. It should be set such that messages
2313 are inline with the regop output that created them.
2315 #define REPORT_CODE_OFF 32
2318 /* Make sure there is a test for this +1 options in re_tests */
2319 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2321 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2322 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2324 #define SLAB_FIRST(s) (&(s)->states[0])
2325 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2327 /* grab a new slab and return the first slot in it */
2329 STATIC regmatch_state *
2332 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2335 regmatch_slab *s = PL_regmatch_slab->next;
2337 Newx(s, 1, regmatch_slab);
2338 s->prev = PL_regmatch_slab;
2340 PL_regmatch_slab->next = s;
2342 PL_regmatch_slab = s;
2343 return SLAB_FIRST(s);
2347 /* push a new state then goto it */
2349 #define PUSH_STATE_GOTO(state, node) \
2351 st->resume_state = state; \
2354 /* push a new state with success backtracking, then goto it */
2356 #define PUSH_YES_STATE_GOTO(state, node) \
2358 st->resume_state = state; \
2359 goto push_yes_state;
2365 regmatch() - main matching routine
2367 This is basically one big switch statement in a loop. We execute an op,
2368 set 'next' to point the next op, and continue. If we come to a point which
2369 we may need to backtrack to on failure such as (A|B|C), we push a
2370 backtrack state onto the backtrack stack. On failure, we pop the top
2371 state, and re-enter the loop at the state indicated. If there are no more
2372 states to pop, we return failure.
2374 Sometimes we also need to backtrack on success; for example /A+/, where
2375 after successfully matching one A, we need to go back and try to
2376 match another one; similarly for lookahead assertions: if the assertion
2377 completes successfully, we backtrack to the state just before the assertion
2378 and then carry on. In these cases, the pushed state is marked as
2379 'backtrack on success too'. This marking is in fact done by a chain of
2380 pointers, each pointing to the previous 'yes' state. On success, we pop to
2381 the nearest yes state, discarding any intermediate failure-only states.
2382 Sometimes a yes state is pushed just to force some cleanup code to be
2383 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2384 it to free the inner regex.
2386 Note that failure backtracking rewinds the cursor position, while
2387 success backtracking leaves it alone.
2389 A pattern is complete when the END op is executed, while a subpattern
2390 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2391 ops trigger the "pop to last yes state if any, otherwise return true"
2394 A common convention in this function is to use A and B to refer to the two
2395 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2396 the subpattern to be matched possibly multiple times, while B is the entire
2397 rest of the pattern. Variable and state names reflect this convention.
2399 The states in the main switch are the union of ops and failure/success of
2400 substates associated with with that op. For example, IFMATCH is the op
2401 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2402 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2403 successfully matched A and IFMATCH_A_fail is a state saying that we have
2404 just failed to match A. Resume states always come in pairs. The backtrack
2405 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2406 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2407 on success or failure.
2409 The struct that holds a backtracking state is actually a big union, with
2410 one variant for each major type of op. The variable st points to the
2411 top-most backtrack struct. To make the code clearer, within each
2412 block of code we #define ST to alias the relevant union.
2414 Here's a concrete example of a (vastly oversimplified) IFMATCH
2420 #define ST st->u.ifmatch
2422 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2423 ST.foo = ...; // some state we wish to save
2425 // push a yes backtrack state with a resume value of
2426 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2428 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2431 case IFMATCH_A: // we have successfully executed A; now continue with B
2433 bar = ST.foo; // do something with the preserved value
2436 case IFMATCH_A_fail: // A failed, so the assertion failed
2437 ...; // do some housekeeping, then ...
2438 sayNO; // propagate the failure
2445 For any old-timers reading this who are familiar with the old recursive
2446 approach, the code above is equivalent to:
2448 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2457 ...; // do some housekeeping, then ...
2458 sayNO; // propagate the failure
2461 The topmost backtrack state, pointed to by st, is usually free. If you
2462 want to claim it, populate any ST.foo fields in it with values you wish to
2463 save, then do one of
2465 PUSH_STATE_GOTO(resume_state, node);
2466 PUSH_YES_STATE_GOTO(resume_state, node);
2468 which sets that backtrack state's resume value to 'resume_state', pushes a
2469 new free entry to the top of the backtrack stack, then goes to 'node'.
2470 On backtracking, the free slot is popped, and the saved state becomes the
2471 new free state. An ST.foo field in this new top state can be temporarily
2472 accessed to retrieve values, but once the main loop is re-entered, it
2473 becomes available for reuse.
2475 Note that the depth of the backtrack stack constantly increases during the
2476 left-to-right execution of the pattern, rather than going up and down with
2477 the pattern nesting. For example the stack is at its maximum at Z at the
2478 end of the pattern, rather than at X in the following:
2480 /(((X)+)+)+....(Y)+....Z/
2482 The only exceptions to this are lookahead/behind assertions and the cut,
2483 (?>A), which pop all the backtrack states associated with A before
2486 Bascktrack state structs are allocated in slabs of about 4K in size.
2487 PL_regmatch_state and st always point to the currently active state,
2488 and PL_regmatch_slab points to the slab currently containing
2489 PL_regmatch_state. The first time regmatch() is called, the first slab is
2490 allocated, and is never freed until interpreter destruction. When the slab
2491 is full, a new one is allocated and chained to the end. At exit from
2492 regmatch(), slabs allocated since entry are freed.
2497 #define DEBUG_STATE_pp(pp) \
2499 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2500 PerlIO_printf(Perl_debug_log, \
2501 " %*s"pp" %s%s%s%s%s\n", \
2503 PL_reg_name[st->resume_state], \
2504 ((st==yes_state||st==mark_state) ? "[" : ""), \
2505 ((st==yes_state) ? "Y" : ""), \
2506 ((st==mark_state) ? "M" : ""), \
2507 ((st==yes_state||st==mark_state) ? "]" : "") \
2512 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2517 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2518 const char *start, const char *end, const char *blurb)
2520 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
2524 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2525 prog->precomp, prog->prelen, 60);
2527 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2528 start, end - start, 60);
2530 PerlIO_printf(Perl_debug_log,
2531 "%s%s REx%s %s against %s\n",
2532 PL_colors[4], blurb, PL_colors[5], s0, s1);
2534 if (do_utf8||utf8_pat)
2535 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2536 utf8_pat ? "pattern" : "",
2537 utf8_pat && do_utf8 ? " and " : "",
2538 do_utf8 ? "string" : ""
2544 S_dump_exec_pos(pTHX_ const char *locinput,
2545 const regnode *scan,
2546 const char *loc_regeol,
2547 const char *loc_bostr,
2548 const char *loc_reg_starttry,
2551 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2552 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2553 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2554 /* The part of the string before starttry has one color
2555 (pref0_len chars), between starttry and current
2556 position another one (pref_len - pref0_len chars),
2557 after the current position the third one.
2558 We assume that pref0_len <= pref_len, otherwise we
2559 decrease pref0_len. */
2560 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2561 ? (5 + taill) - l : locinput - loc_bostr;
2564 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2566 pref0_len = pref_len - (locinput - loc_reg_starttry);
2567 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2568 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2569 ? (5 + taill) - pref_len : loc_regeol - locinput);
2570 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2574 if (pref0_len > pref_len)
2575 pref0_len = pref_len;
2577 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2579 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2580 (locinput - pref_len),pref0_len, 60, 4, 5);
2582 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2583 (locinput - pref_len + pref0_len),
2584 pref_len - pref0_len, 60, 2, 3);
2586 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2587 locinput, loc_regeol - locinput, 10, 0, 1);
2589 const STRLEN tlen=len0+len1+len2;
2590 PerlIO_printf(Perl_debug_log,
2591 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2592 (IV)(locinput - loc_bostr),
2595 (docolor ? "" : "> <"),
2597 (int)(tlen > 19 ? 0 : 19 - tlen),
2604 /* reg_check_named_buff_matched()
2605 * Checks to see if a named buffer has matched. The data array of
2606 * buffer numbers corresponding to the buffer is expected to reside
2607 * in the regexp->data->data array in the slot stored in the ARG() of
2608 * node involved. Note that this routine doesn't actually care about the
2609 * name, that information is not preserved from compilation to execution.
2610 * Returns the index of the leftmost defined buffer with the given name
2611 * or 0 if non of the buffers matched.
2614 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2616 RXi_GET_DECL(rex,rexi);
2617 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2618 I32 *nums=(I32*)SvPVX(sv_dat);
2619 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2620 if ((I32)*PL_reglastparen >= nums[n] &&
2621 PL_regoffs[nums[n]].end != -1)
2630 /* free all slabs above current one - called during LEAVE_SCOPE */
2633 S_clear_backtrack_stack(pTHX_ void *p)
2635 regmatch_slab *s = PL_regmatch_slab->next;
2640 PL_regmatch_slab->next = NULL;
2642 regmatch_slab * const osl = s;
2649 #define SETREX(Re1,Re2) \
2650 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2653 STATIC I32 /* 0 failure, 1 success */
2654 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2656 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2660 register const bool do_utf8 = PL_reg_match_utf8;
2661 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2663 regexp *rex = reginfo->prog;
2664 RXi_GET_DECL(rex,rexi);
2668 /* the current state. This is a cached copy of PL_regmatch_state */
2669 register regmatch_state *st;
2671 /* cache heavy used fields of st in registers */
2672 register regnode *scan;
2673 register regnode *next;
2674 register U32 n = 0; /* general value; init to avoid compiler warning */
2675 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2676 register char *locinput = PL_reginput;
2677 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2679 bool result = 0; /* return value of S_regmatch */
2680 int depth = 0; /* depth of backtrack stack */
2681 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2682 const U32 max_nochange_depth =
2683 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2684 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2686 regmatch_state *yes_state = NULL; /* state to pop to on success of
2688 /* mark_state piggy backs on the yes_state logic so that when we unwind
2689 the stack on success we can update the mark_state as we go */
2690 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2692 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2693 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2695 bool no_final = 0; /* prevent failure from backtracking? */
2696 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2697 char *startpoint = PL_reginput;
2698 SV *popmark = NULL; /* are we looking for a mark? */
2699 SV *sv_commit = NULL; /* last mark name seen in failure */
2700 SV *sv_yes_mark = NULL; /* last mark name we have seen
2701 during a successfull match */
2702 U32 lastopen = 0; /* last open we saw */
2703 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2705 SV* const oreplsv = GvSV(PL_replgv);
2708 /* these three flags are set by various ops to signal information to
2709 * the very next op. They have a useful lifetime of exactly one loop
2710 * iteration, and are not preserved or restored by state pushes/pops
2712 bool sw = 0; /* the condition value in (?(cond)a|b) */
2713 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2714 int logical = 0; /* the following EVAL is:
2718 or the following IFMATCH/UNLESSM is:
2719 false: plain (?=foo)
2720 true: used as a condition: (?(?=foo))
2724 GET_RE_DEBUG_FLAGS_DECL;
2727 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2728 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2730 /* on first ever call to regmatch, allocate first slab */
2731 if (!PL_regmatch_slab) {
2732 Newx(PL_regmatch_slab, 1, regmatch_slab);
2733 PL_regmatch_slab->prev = NULL;
2734 PL_regmatch_slab->next = NULL;
2735 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2738 oldsave = PL_savestack_ix;
2739 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2740 SAVEVPTR(PL_regmatch_slab);
2741 SAVEVPTR(PL_regmatch_state);
2743 /* grab next free state slot */
2744 st = ++PL_regmatch_state;
2745 if (st > SLAB_LAST(PL_regmatch_slab))
2746 st = PL_regmatch_state = S_push_slab(aTHX);
2748 /* Note that nextchr is a byte even in UTF */
2749 nextchr = UCHARAT(locinput);
2751 while (scan != NULL) {
2754 SV * const prop = sv_newmortal();
2755 regnode *rnext=regnext(scan);
2756 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2757 regprop(rex, prop, scan);
2759 PerlIO_printf(Perl_debug_log,
2760 "%3"IVdf":%*s%s(%"IVdf")\n",
2761 (IV)(scan - rexi->program), depth*2, "",
2763 (PL_regkind[OP(scan)] == END || !rnext) ?
2764 0 : (IV)(rnext - rexi->program));
2767 next = scan + NEXT_OFF(scan);
2770 state_num = OP(scan);
2773 switch (state_num) {
2775 if (locinput == PL_bostr)
2777 /* reginfo->till = reginfo->bol; */
2782 if (locinput == PL_bostr ||
2783 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2789 if (locinput == PL_bostr)
2793 if (locinput == reginfo->ganch)
2798 /* update the startpoint */
2799 st->u.keeper.val = PL_regoffs[0].start;
2800 PL_reginput = locinput;
2801 PL_regoffs[0].start = locinput - PL_bostr;
2802 PUSH_STATE_GOTO(KEEPS_next, next);
2804 case KEEPS_next_fail:
2805 /* rollback the start point change */
2806 PL_regoffs[0].start = st->u.keeper.val;
2812 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2817 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2819 if (PL_regeol - locinput > 1)
2823 if (PL_regeol != locinput)
2827 if (!nextchr && locinput >= PL_regeol)
2830 locinput += PL_utf8skip[nextchr];
2831 if (locinput > PL_regeol)
2833 nextchr = UCHARAT(locinput);
2836 nextchr = UCHARAT(++locinput);
2839 if (!nextchr && locinput >= PL_regeol)
2841 nextchr = UCHARAT(++locinput);
2844 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2847 locinput += PL_utf8skip[nextchr];
2848 if (locinput > PL_regeol)
2850 nextchr = UCHARAT(locinput);
2853 nextchr = UCHARAT(++locinput);
2857 #define ST st->u.trie
2859 /* In this case the charclass data is available inline so
2860 we can fail fast without a lot of extra overhead.
2862 if (scan->flags == EXACT || !do_utf8) {
2863 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2865 PerlIO_printf(Perl_debug_log,
2866 "%*s %sfailed to match trie start class...%s\n",
2867 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2876 /* what type of TRIE am I? (utf8 makes this contextual) */
2877 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2878 trie_type = do_utf8 ?
2879 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2882 /* what trie are we using right now */
2883 reg_trie_data * const trie
2884 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2885 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2886 U32 state = trie->startstate;
2888 if (trie->bitmap && trie_type != trie_utf8_fold &&
2889 !TRIE_BITMAP_TEST(trie,*locinput)
2891 if (trie->states[ state ].wordnum) {
2893 PerlIO_printf(Perl_debug_log,
2894 "%*s %smatched empty string...%s\n",
2895 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2900 PerlIO_printf(Perl_debug_log,
2901 "%*s %sfailed to match trie start class...%s\n",
2902 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2909 U8 *uc = ( U8* )locinput;
2913 U8 *uscan = (U8*)NULL;
2915 SV *sv_accept_buff = NULL;
2916 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2918 ST.accepted = 0; /* how many accepting states we have seen */
2920 ST.jump = trie->jump;
2923 traverse the TRIE keeping track of all accepting states
2924 we transition through until we get to a failing node.
2927 while ( state && uc <= (U8*)PL_regeol ) {
2928 U32 base = trie->states[ state ].trans.base;
2931 /* We use charid to hold the wordnum as we don't use it
2932 for charid until after we have done the wordnum logic.
2933 We define an alias just so that the wordnum logic reads
2936 #define got_wordnum charid
2937 got_wordnum = trie->states[ state ].wordnum;
2939 if ( got_wordnum ) {
2940 if ( ! ST.accepted ) {
2943 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2944 sv_accept_buff=newSV(bufflen *
2945 sizeof(reg_trie_accepted) - 1);
2946 SvCUR_set(sv_accept_buff, 0);
2947 SvPOK_on(sv_accept_buff);
2948 sv_2mortal(sv_accept_buff);
2951 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2954 if (ST.accepted >= bufflen) {
2956 ST.accept_buff =(reg_trie_accepted*)
2957 SvGROW(sv_accept_buff,
2958 bufflen * sizeof(reg_trie_accepted));
2960 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2961 + sizeof(reg_trie_accepted));
2964 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2965 ST.accept_buff[ST.accepted].endpos = uc;
2967 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2971 DEBUG_TRIE_EXECUTE_r({
2972 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2973 PerlIO_printf( Perl_debug_log,
2974 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2975 2+depth * 2, "", PL_colors[4],
2976 (UV)state, (UV)ST.accepted );
2980 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2981 uscan, len, uvc, charid, foldlen,
2985 (base + charid > trie->uniquecharcount )
2986 && (base + charid - 1 - trie->uniquecharcount
2988 && trie->trans[base + charid - 1 -
2989 trie->uniquecharcount].check == state)
2991 state = trie->trans[base + charid - 1 -
2992 trie->uniquecharcount ].next;
3003 DEBUG_TRIE_EXECUTE_r(
3004 PerlIO_printf( Perl_debug_log,
3005 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3006 charid, uvc, (UV)state, PL_colors[5] );
3013 PerlIO_printf( Perl_debug_log,
3014 "%*s %sgot %"IVdf" possible matches%s\n",
3015 REPORT_CODE_OFF + depth * 2, "",
3016 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3019 goto trie_first_try; /* jump into the fail handler */
3021 case TRIE_next_fail: /* we failed - try next alterative */
3023 REGCP_UNWIND(ST.cp);
3024 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3025 PL_regoffs[n].end = -1;
3026 *PL_reglastparen = n;
3035 ST.lastparen = *PL_reglastparen;
3038 if ( ST.accepted == 1 ) {
3039 /* only one choice left - just continue */
3041 AV *const trie_words
3042 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3043 SV ** const tmp = av_fetch( trie_words,
3044 ST.accept_buff[ 0 ].wordnum-1, 0 );
3045 SV *sv= tmp ? sv_newmortal() : NULL;
3047 PerlIO_printf( Perl_debug_log,
3048 "%*s %sonly one match left: #%d <%s>%s\n",
3049 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3050 ST.accept_buff[ 0 ].wordnum,
3051 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3052 PL_colors[0], PL_colors[1],
3053 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3055 : "not compiled under -Dr",
3058 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3059 /* in this case we free tmps/leave before we call regmatch
3060 as we wont be using accept_buff again. */
3062 locinput = PL_reginput;
3063 nextchr = UCHARAT(locinput);
3064 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3067 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3068 if (!has_cutgroup) {
3073 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3076 continue; /* execute rest of RE */
3079 if ( !ST.accepted-- ) {
3081 PerlIO_printf( Perl_debug_log,
3082 "%*s %sTRIE failed...%s\n",
3083 REPORT_CODE_OFF+depth*2, "",
3094 There are at least two accepting states left. Presumably
3095 the number of accepting states is going to be low,
3096 typically two. So we simply scan through to find the one
3097 with lowest wordnum. Once we find it, we swap the last
3098 state into its place and decrement the size. We then try to
3099 match the rest of the pattern at the point where the word
3100 ends. If we succeed, control just continues along the
3101 regex; if we fail we return here to try the next accepting
3108 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3109 DEBUG_TRIE_EXECUTE_r(
3110 PerlIO_printf( Perl_debug_log,
3111 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3112 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3113 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3114 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3117 if (ST.accept_buff[cur].wordnum <
3118 ST.accept_buff[best].wordnum)
3123 AV *const trie_words
3124 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3125 SV ** const tmp = av_fetch( trie_words,
3126 ST.accept_buff[ best ].wordnum - 1, 0 );
3127 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3129 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3130 SV *sv= tmp ? sv_newmortal() : NULL;
3132 PerlIO_printf( Perl_debug_log,
3133 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3134 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3135 ST.accept_buff[best].wordnum,
3136 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3137 PL_colors[0], PL_colors[1],
3138 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3139 ) : "not compiled under -Dr",
3140 REG_NODE_NUM(nextop),
3144 if ( best<ST.accepted ) {
3145 reg_trie_accepted tmp = ST.accept_buff[ best ];
3146 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3147 ST.accept_buff[ ST.accepted ] = tmp;
3150 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3151 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3155 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3159 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3162 PUSH_STATE_GOTO(TRIE_next, scan);
3175 char *s = STRING(scan);
3177 if (do_utf8 != UTF) {
3178 /* The target and the pattern have differing utf8ness. */
3180 const char * const e = s + ln;
3183 /* The target is utf8, the pattern is not utf8. */
3188 if (NATIVE_TO_UNI(*(U8*)s) !=
3189 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3197 /* The target is not utf8, the pattern is utf8. */
3202 if (NATIVE_TO_UNI(*((U8*)l)) !=
3203 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3211 nextchr = UCHARAT(locinput);
3214 /* The target and the pattern have the same utf8ness. */
3215 /* Inline the first character, for speed. */
3216 if (UCHARAT(s) != nextchr)
3218 if (PL_regeol - locinput < ln)
3220 if (ln > 1 && memNE(s, locinput, ln))
3223 nextchr = UCHARAT(locinput);
3227 PL_reg_flags |= RF_tainted;
3230 char * const s = STRING(scan);
3233 if (do_utf8 || UTF) {
3234 /* Either target or the pattern are utf8. */
3235 const char * const l = locinput;
3236 char *e = PL_regeol;
3238 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3239 l, &e, 0, do_utf8)) {
3240 /* One more case for the sharp s:
3241 * pack("U0U*", 0xDF) =~ /ss/i,
3242 * the 0xC3 0x9F are the UTF-8
3243 * byte sequence for the U+00DF. */
3246 toLOWER(s[0]) == 's' &&
3248 toLOWER(s[1]) == 's' &&
3255 nextchr = UCHARAT(locinput);
3259 /* Neither the target and the pattern are utf8. */
3261 /* Inline the first character, for speed. */
3262 if (UCHARAT(s) != nextchr &&
3263 UCHARAT(s) != ((OP(scan) == EXACTF)
3264 ? PL_fold : PL_fold_locale)[nextchr])
3266 if (PL_regeol - locinput < ln)
3268 if (ln > 1 && (OP(scan) == EXACTF
3269 ? ibcmp(s, locinput, ln)
3270 : ibcmp_locale(s, locinput, ln)))
3273 nextchr = UCHARAT(locinput);
3278 STRLEN inclasslen = PL_regeol - locinput;
3280 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3282 if (locinput >= PL_regeol)
3284 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3285 nextchr = UCHARAT(locinput);
3290 nextchr = UCHARAT(locinput);
3291 if (!REGINCLASS(rex, scan, (U8*)locinput))
3293 if (!nextchr && locinput >= PL_regeol)
3295 nextchr = UCHARAT(++locinput);
3299 /* If we might have the case of the German sharp s
3300 * in a casefolding Unicode character class. */
3302 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3303 locinput += SHARP_S_SKIP;
3304 nextchr = UCHARAT(locinput);
3310 PL_reg_flags |= RF_tainted;
3316 LOAD_UTF8_CHARCLASS_ALNUM();
3317 if (!(OP(scan) == ALNUM
3318 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3319 : isALNUM_LC_utf8((U8*)locinput)))
3323 locinput += PL_utf8skip[nextchr];
3324 nextchr = UCHARAT(locinput);
3327 if (!(OP(scan) == ALNUM
3328 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3330 nextchr = UCHARAT(++locinput);
3333 PL_reg_flags |= RF_tainted;
3336 if (!nextchr && locinput >= PL_regeol)
3339 LOAD_UTF8_CHARCLASS_ALNUM();
3340 if (OP(scan) == NALNUM
3341 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3342 : isALNUM_LC_utf8((U8*)locinput))
3346 locinput += PL_utf8skip[nextchr];
3347 nextchr = UCHARAT(locinput);
3350 if (OP(scan) == NALNUM
3351 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3353 nextchr = UCHARAT(++locinput);
3357 PL_reg_flags |= RF_tainted;
3361 /* was last char in word? */
3363 if (locinput == PL_bostr)
3366 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3368 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3370 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3371 ln = isALNUM_uni(ln);
3372 LOAD_UTF8_CHARCLASS_ALNUM();
3373 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3376 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3377 n = isALNUM_LC_utf8((U8*)locinput);
3381 ln = (locinput != PL_bostr) ?
3382 UCHARAT(locinput - 1) : '\n';
3383 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3385 n = isALNUM(nextchr);
3388 ln = isALNUM_LC(ln);
3389 n = isALNUM_LC(nextchr);
3392 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3393 OP(scan) == BOUNDL))
3397 PL_reg_flags |= RF_tainted;
3403 if (UTF8_IS_CONTINUED(nextchr)) {
3404 LOAD_UTF8_CHARCLASS_SPACE();
3405 if (!(OP(scan) == SPACE
3406 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3407 : isSPACE_LC_utf8((U8*)locinput)))
3411 locinput += PL_utf8skip[nextchr];
3412 nextchr = UCHARAT(locinput);
3415 if (!(OP(scan) == SPACE
3416 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3418 nextchr = UCHARAT(++locinput);
3421 if (!(OP(scan) == SPACE
3422 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3424 nextchr = UCHARAT(++locinput);
3428 PL_reg_flags |= RF_tainted;
3431 if (!nextchr && locinput >= PL_regeol)
3434 LOAD_UTF8_CHARCLASS_SPACE();
3435 if (OP(scan) == NSPACE
3436 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3437 : isSPACE_LC_utf8((U8*)locinput))
3441 locinput += PL_utf8skip[nextchr];
3442 nextchr = UCHARAT(locinput);
3445 if (OP(scan) == NSPACE
3446 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3448 nextchr = UCHARAT(++locinput);
3451 PL_reg_flags |= RF_tainted;
3457 LOAD_UTF8_CHARCLASS_DIGIT();
3458 if (!(OP(scan) == DIGIT
3459 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3460 : isDIGIT_LC_utf8((U8*)locinput)))
3464 locinput += PL_utf8skip[nextchr];
3465 nextchr = UCHARAT(locinput);
3468 if (!(OP(scan) == DIGIT
3469 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3471 nextchr = UCHARAT(++locinput);
3474 PL_reg_flags |= RF_tainted;
3477 if (!nextchr && locinput >= PL_regeol)
3480 LOAD_UTF8_CHARCLASS_DIGIT();
3481 if (OP(scan) == NDIGIT
3482 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3483 : isDIGIT_LC_utf8((U8*)locinput))
3487 locinput += PL_utf8skip[nextchr];
3488 nextchr = UCHARAT(locinput);
3491 if (OP(scan) == NDIGIT
3492 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3494 nextchr = UCHARAT(++locinput);
3497 if (locinput >= PL_regeol)
3500 LOAD_UTF8_CHARCLASS_MARK();
3501 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3503 locinput += PL_utf8skip[nextchr];
3504 while (locinput < PL_regeol &&
3505 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3506 locinput += UTF8SKIP(locinput);
3507 if (locinput > PL_regeol)
3512 nextchr = UCHARAT(locinput);
3519 PL_reg_flags |= RF_tainted;
3524 n = reg_check_named_buff_matched(rex,scan);
3527 type = REF + ( type - NREF );
3534 PL_reg_flags |= RF_tainted;
3538 n = ARG(scan); /* which paren pair */
3541 ln = PL_regoffs[n].start;
3542 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3543 if (*PL_reglastparen < n || ln == -1)
3544 sayNO; /* Do not match unless seen CLOSEn. */
3545 if (ln == PL_regoffs[n].end)
3549 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3551 const char *e = PL_bostr + PL_regoffs[n].end;
3553 * Note that we can't do the "other character" lookup trick as
3554 * in the 8-bit case (no pun intended) because in Unicode we
3555 * have to map both upper and title case to lower case.
3559 STRLEN ulen1, ulen2;
3560 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3561 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3565 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3566 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3567 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3574 nextchr = UCHARAT(locinput);
3578 /* Inline the first character, for speed. */
3579 if (UCHARAT(s) != nextchr &&
3581 (UCHARAT(s) != (type == REFF
3582 ? PL_fold : PL_fold_locale)[nextchr])))
3584 ln = PL_regoffs[n].end - ln;
3585 if (locinput + ln > PL_regeol)
3587 if (ln > 1 && (type == REF
3588 ? memNE(s, locinput, ln)
3590 ? ibcmp(s, locinput, ln)
3591 : ibcmp_locale(s, locinput, ln))))
3594 nextchr = UCHARAT(locinput);
3604 #define ST st->u.eval
3608 regexp_internal *rei;
3609 regnode *startpoint;
3612 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3613 if (cur_eval && cur_eval->locinput==locinput) {
3614 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3615 Perl_croak(aTHX_ "Infinite recursion in regex");
3616 if ( ++nochange_depth > max_nochange_depth )
3618 "Pattern subroutine nesting without pos change"
3619 " exceeded limit in regex");
3625 (void)ReREFCNT_inc(rex);
3626 if (OP(scan)==GOSUB) {
3627 startpoint = scan + ARG2L(scan);
3628 ST.close_paren = ARG(scan);
3630 startpoint = rei->program+1;
3633 goto eval_recurse_doit;
3635 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3636 if (cur_eval && cur_eval->locinput==locinput) {
3637 if ( ++nochange_depth > max_nochange_depth )
3638 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3643 /* execute the code in the {...} */
3645 SV ** const before = SP;
3646 OP_4tree * const oop = PL_op;
3647 COP * const ocurcop = PL_curcop;
3651 PL_op = (OP_4tree*)rexi->data->data[n];
3652 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3653 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3654 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3655 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3658 SV *sv_mrk = get_sv("REGMARK", 1);
3659 sv_setsv(sv_mrk, sv_yes_mark);
3662 CALLRUNOPS(aTHX); /* Scalar context. */
3665 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3672 PAD_RESTORE_LOCAL(old_comppad);
3673 PL_curcop = ocurcop;
3676 sv_setsv(save_scalar(PL_replgv), ret);
3680 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3683 /* extract RE object from returned value; compiling if
3688 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3689 mg = mg_find(sv, PERL_MAGIC_qr);
3690 else if (SvSMAGICAL(ret)) {
3691 if (SvGMAGICAL(ret))
3692 sv_unmagic(ret, PERL_MAGIC_qr);
3694 mg = mg_find(ret, PERL_MAGIC_qr);
3698 re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
3702 const I32 osize = PL_regsize;
3704 if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
3705 re = CALLREGCOMP(ret, pm_flags);
3707 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3709 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3714 RX_MATCH_COPIED_off(re);
3715 re->subbeg = rex->subbeg;
3716 re->sublen = rex->sublen;
3719 debug_start_match(re, do_utf8, locinput, PL_regeol,
3720 "Matching embedded");
3722 startpoint = rei->program + 1;
3723 ST.close_paren = 0; /* only used for GOSUB */
3724 /* borrowed from regtry */
3725 if (PL_reg_start_tmpl <= re->nparens) {
3726 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3727 if(PL_reg_start_tmp)
3728 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3730 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3733 eval_recurse_doit: /* Share code with GOSUB below this line */
3734 /* run the pattern returned from (??{...}) */
3735 ST.cp = regcppush(0); /* Save *all* the positions. */
3736 REGCP_SET(ST.lastcp);
3738 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
3740 *PL_reglastparen = 0;
3741 *PL_reglastcloseparen = 0;
3742 PL_reginput = locinput;
3745 /* XXXX This is too dramatic a measure... */
3748 ST.toggle_reg_flags = PL_reg_flags;
3749 if (re->extflags & RXf_UTF8)
3750 PL_reg_flags |= RF_utf8;
3752 PL_reg_flags &= ~RF_utf8;
3753 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3756 ST.prev_curlyx = cur_curlyx;
3761 ST.prev_eval = cur_eval;
3763 /* now continue from first node in postoned RE */
3764 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3767 /* logical is 1, /(?(?{...})X|Y)/ */
3768 sw = (bool)SvTRUE(ret);
3773 case EVAL_AB: /* cleanup after a successful (??{A})B */
3774 /* note: this is called twice; first after popping B, then A */
3775 PL_reg_flags ^= ST.toggle_reg_flags;
3777 SETREX(rex,ST.prev_rex);
3778 rexi = RXi_GET(rex);
3780 cur_eval = ST.prev_eval;
3781 cur_curlyx = ST.prev_curlyx;
3782 /* XXXX This is too dramatic a measure... */
3784 if ( nochange_depth )
3789 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3790 /* note: this is called twice; first after popping B, then A */
3791 PL_reg_flags ^= ST.toggle_reg_flags;
3793 SETREX(rex,ST.prev_rex);
3794 rexi = RXi_GET(rex);
3795 PL_reginput = locinput;
3796 REGCP_UNWIND(ST.lastcp);
3798 cur_eval = ST.prev_eval;
3799 cur_curlyx = ST.prev_curlyx;
3800 /* XXXX This is too dramatic a measure... */
3802 if ( nochange_depth )
3808 n = ARG(scan); /* which paren pair */
3809 PL_reg_start_tmp[n] = locinput;
3815 n = ARG(scan); /* which paren pair */
3816 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3817 PL_regoffs[n].end = locinput - PL_bostr;
3818 /*if (n > PL_regsize)
3820 if (n > *PL_reglastparen)
3821 *PL_reglastparen = n;
3822 *PL_reglastcloseparen = n;
3823 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3831 cursor && OP(cursor)!=END;
3832 cursor=regnext(cursor))
3834 if ( OP(cursor)==CLOSE ){
3836 if ( n <= lastopen ) {
3838 = PL_reg_start_tmp[n] - PL_bostr;
3839 PL_regoffs[n].end = locinput - PL_bostr;
3840 /*if (n > PL_regsize)
3842 if (n > *PL_reglastparen)
3843 *PL_reglastparen = n;
3844 *PL_reglastcloseparen = n;
3845 if ( n == ARG(scan) || (cur_eval &&
3846 cur_eval->u.eval.close_paren == n))
3855 n = ARG(scan); /* which paren pair */
3856 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
3859 /* reg_check_named_buff_matched returns 0 for no match */
3860 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3864 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3870 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3872 next = NEXTOPER(NEXTOPER(scan));
3874 next = scan + ARG(scan);
3875 if (OP(next) == IFTHEN) /* Fake one. */
3876 next = NEXTOPER(NEXTOPER(next));
3880 logical = scan->flags;
3883 /*******************************************************************
3885 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3886 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3887 STAR/PLUS/CURLY/CURLYN are used instead.)
3889 A*B is compiled as <CURLYX><A><WHILEM><B>
3891 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3892 state, which contains the current count, initialised to -1. It also sets
3893 cur_curlyx to point to this state, with any previous value saved in the
3896 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3897 since the pattern may possibly match zero times (i.e. it's a while {} loop
3898 rather than a do {} while loop).
3900 Each entry to WHILEM represents a successful match of A. The count in the
3901 CURLYX block is incremented, another WHILEM state is pushed, and execution
3902 passes to A or B depending on greediness and the current count.
3904 For example, if matching against the string a1a2a3b (where the aN are
3905 substrings that match /A/), then the match progresses as follows: (the
3906 pushed states are interspersed with the bits of strings matched so far):
3909 <CURLYX cnt=0><WHILEM>
3910 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3911 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3912 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3913 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3915 (Contrast this with something like CURLYM, which maintains only a single
3919 a1 <CURLYM cnt=1> a2
3920 a1 a2 <CURLYM cnt=2> a3
3921 a1 a2 a3 <CURLYM cnt=3> b
3924 Each WHILEM state block marks a point to backtrack to upon partial failure
3925 of A or B, and also contains some minor state data related to that
3926 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3927 overall state, such as the count, and pointers to the A and B ops.
3929 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3930 must always point to the *current* CURLYX block, the rules are:
3932 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3933 and set cur_curlyx to point the new block.
3935 When popping the CURLYX block after a successful or unsuccessful match,
3936 restore the previous cur_curlyx.
3938 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3939 to the outer one saved in the CURLYX block.
3941 When popping the WHILEM block after a successful or unsuccessful B match,
3942 restore the previous cur_curlyx.
3944 Here's an example for the pattern (AI* BI)*BO
3945 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3948 curlyx backtrack stack
3949 ------ ---------------
3951 CO <CO prev=NULL> <WO>
3952 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3953 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3954 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3956 At this point the pattern succeeds, and we work back down the stack to
3957 clean up, restoring as we go:
3959 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3960 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3961 CO <CO prev=NULL> <WO>
3964 *******************************************************************/
3966 #define ST st->u.curlyx
3968 case CURLYX: /* start of /A*B/ (for complex A) */
3970 /* No need to save/restore up to this paren */
3971 I32 parenfloor = scan->flags;
3973 assert(next); /* keep Coverity happy */
3974 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3977 /* XXXX Probably it is better to teach regpush to support
3978 parenfloor > PL_regsize... */
3979 if (parenfloor > (I32)*PL_reglastparen)
3980 parenfloor = *PL_reglastparen; /* Pessimization... */
3982 ST.prev_curlyx= cur_curlyx;
3984 ST.cp = PL_savestack_ix;
3986 /* these fields contain the state of the current curly.
3987 * they are accessed by subsequent WHILEMs */
3988 ST.parenfloor = parenfloor;
3989 ST.min = ARG1(scan);
3990 ST.max = ARG2(scan);
3991 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3995 ST.count = -1; /* this will be updated by WHILEM */
3996 ST.lastloc = NULL; /* this will be updated by WHILEM */
3998 PL_reginput = locinput;
3999 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4003 case CURLYX_end: /* just finished matching all of A*B */
4004 cur_curlyx = ST.prev_curlyx;
4008 case CURLYX_end_fail: /* just failed to match all of A*B */
4010 cur_curlyx = ST.prev_curlyx;
4016 #define ST st->u.whilem
4018 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4020 /* see the discussion above about CURLYX/WHILEM */
4022 assert(cur_curlyx); /* keep Coverity happy */
4023 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4024 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4025 ST.cache_offset = 0;
4028 PL_reginput = locinput;
4030 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4031 "%*s whilem: matched %ld out of %ld..%ld\n",
4032 REPORT_CODE_OFF+depth*2, "", (long)n,
4033 (long)cur_curlyx->u.curlyx.min,
4034 (long)cur_curlyx->u.curlyx.max)
4037 /* First just match a string of min A's. */
4039 if (n < cur_curlyx->u.curlyx.min) {
4040 cur_curlyx->u.curlyx.lastloc = locinput;
4041 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4045 /* If degenerate A matches "", assume A done. */
4047 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4048 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4049 "%*s whilem: empty match detected, trying continuation...\n",
4050 REPORT_CODE_OFF+depth*2, "")
4052 goto do_whilem_B_max;
4055 /* super-linear cache processing */
4059 if (!PL_reg_maxiter) {
4060 /* start the countdown: Postpone detection until we
4061 * know the match is not *that* much linear. */
4062 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4063 /* possible overflow for long strings and many CURLYX's */
4064 if (PL_reg_maxiter < 0)
4065 PL_reg_maxiter = I32_MAX;
4066 PL_reg_leftiter = PL_reg_maxiter;
4069 if (PL_reg_leftiter-- == 0) {
4070 /* initialise cache */
4071 const I32 size = (PL_reg_maxiter + 7)/8;
4072 if (PL_reg_poscache) {
4073 if ((I32)PL_reg_poscache_size < size) {
4074 Renew(PL_reg_poscache, size, char);
4075 PL_reg_poscache_size = size;
4077 Zero(PL_reg_poscache, size, char);
4080 PL_reg_poscache_size = size;
4081 Newxz(PL_reg_poscache, size, char);
4083 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4084 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4085 PL_colors[4], PL_colors[5])
4089 if (PL_reg_leftiter < 0) {
4090 /* have we already failed at this position? */
4092 offset = (scan->flags & 0xf) - 1
4093 + (locinput - PL_bostr) * (scan->flags>>4);
4094 mask = 1 << (offset % 8);
4096 if (PL_reg_poscache[offset] & mask) {
4097 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4098 "%*s whilem: (cache) already tried at this position...\n",
4099 REPORT_CODE_OFF+depth*2, "")
4101 sayNO; /* cache records failure */
4103 ST.cache_offset = offset;
4104 ST.cache_mask = mask;
4108 /* Prefer B over A for minimal matching. */
4110 if (cur_curlyx->u.curlyx.minmod) {
4111 ST.save_curlyx = cur_curlyx;
4112 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4113 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4114 REGCP_SET(ST.lastcp);
4115 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4119 /* Prefer A over B for maximal matching. */
4121 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4122 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4123 cur_curlyx->u.curlyx.lastloc = locinput;
4124 REGCP_SET(ST.lastcp);
4125 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4128 goto do_whilem_B_max;
4132 case WHILEM_B_min: /* just matched B in a minimal match */
4133 case WHILEM_B_max: /* just matched B in a maximal match */
4134 cur_curlyx = ST.save_curlyx;
4138 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4139 cur_curlyx = ST.save_curlyx;
4140 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4141 cur_curlyx->u.curlyx.count--;
4145 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4146 REGCP_UNWIND(ST.lastcp);
4149 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4150 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4151 cur_curlyx->u.curlyx.count--;
4155 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4156 REGCP_UNWIND(ST.lastcp);
4157 regcppop(rex); /* Restore some previous $<digit>s? */
4158 PL_reginput = locinput;
4159 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4160 "%*s whilem: failed, trying continuation...\n",
4161 REPORT_CODE_OFF+depth*2, "")
4164 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4165 && ckWARN(WARN_REGEXP)
4166 && !(PL_reg_flags & RF_warned))
4168 PL_reg_flags |= RF_warned;
4169 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4170 "Complex regular subexpression recursion",
4175 ST.save_curlyx = cur_curlyx;
4176 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4177 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4180 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4181 cur_curlyx = ST.save_curlyx;
4182 REGCP_UNWIND(ST.lastcp);
4185 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4186 /* Maximum greed exceeded */
4187 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4188 && ckWARN(WARN_REGEXP)
4189 && !(PL_reg_flags & RF_warned))
4191 PL_reg_flags |= RF_warned;
4192 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4193 "%s limit (%d) exceeded",
4194 "Complex regular subexpression recursion",
4197 cur_curlyx->u.curlyx.count--;
4201 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4202 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4204 /* Try grabbing another A and see if it helps. */
4205 PL_reginput = locinput;
4206 cur_curlyx->u.curlyx.lastloc = locinput;
4207 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4208 REGCP_SET(ST.lastcp);
4209 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4213 #define ST st->u.branch
4215 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4216 next = scan + ARG(scan);
4219 scan = NEXTOPER(scan);
4222 case BRANCH: /* /(...|A|...)/ */
4223 scan = NEXTOPER(scan); /* scan now points to inner node */
4224 if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4227 /* last branch; skip state push and jump direct to node */
4230 ST.lastparen = *PL_reglastparen;
4231 ST.next_branch = next;
4233 PL_reginput = locinput;
4235 /* Now go into the branch */
4237 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4239 PUSH_STATE_GOTO(BRANCH_next, scan);
4243 PL_reginput = locinput;
4244 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4245 (SV*)rexi->data->data[ ARG( scan ) ];
4246 PUSH_STATE_GOTO(CUTGROUP_next,next);
4248 case CUTGROUP_next_fail:
4251 if (st->u.mark.mark_name)
4252 sv_commit = st->u.mark.mark_name;
4258 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4263 REGCP_UNWIND(ST.cp);
4264 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4265 PL_regoffs[n].end = -1;
4266 *PL_reglastparen = n;
4267 /*dmq: *PL_reglastcloseparen = n; */
4268 scan = ST.next_branch;
4269 /* no more branches? */
4270 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4272 PerlIO_printf( Perl_debug_log,
4273 "%*s %sBRANCH failed...%s\n",
4274 REPORT_CODE_OFF+depth*2, "",
4280 continue; /* execute next BRANCH[J] op */
4288 #define ST st->u.curlym
4290 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4292 /* This is an optimisation of CURLYX that enables us to push
4293 * only a single backtracking state, no matter now many matches
4294 * there are in {m,n}. It relies on the pattern being constant
4295 * length, with no parens to influence future backrefs
4299 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4301 /* if paren positive, emulate an OPEN/CLOSE around A */
4303 U32 paren = ST.me->flags;
4304 if (paren > PL_regsize)
4306 if (paren > *PL_reglastparen)
4307 *PL_reglastparen = paren;
4308 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4316 ST.c1 = CHRTEST_UNINIT;
4319 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4322 curlym_do_A: /* execute the A in /A{m,n}B/ */
4323 PL_reginput = locinput;
4324 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4327 case CURLYM_A: /* we've just matched an A */
4328 locinput = st->locinput;
4329 nextchr = UCHARAT(locinput);
4332 /* after first match, determine A's length: u.curlym.alen */
4333 if (ST.count == 1) {
4334 if (PL_reg_match_utf8) {
4336 while (s < PL_reginput) {
4342 ST.alen = PL_reginput - locinput;
4345 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4348 PerlIO_printf(Perl_debug_log,
4349 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4350 (int)(REPORT_CODE_OFF+(depth*2)), "",
4351 (IV) ST.count, (IV)ST.alen)
4354 locinput = PL_reginput;
4356 if (cur_eval && cur_eval->u.eval.close_paren &&
4357 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4360 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4361 goto curlym_do_A; /* try to match another A */
4362 goto curlym_do_B; /* try to match B */
4364 case CURLYM_A_fail: /* just failed to match an A */
4365 REGCP_UNWIND(ST.cp);
4367 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4368 || (cur_eval && cur_eval->u.eval.close_paren &&
4369 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4372 curlym_do_B: /* execute the B in /A{m,n}B/ */
4373 PL_reginput = locinput;
4374 if (ST.c1 == CHRTEST_UNINIT) {
4375 /* calculate c1 and c2 for possible match of 1st char
4376 * following curly */
4377 ST.c1 = ST.c2 = CHRTEST_VOID;
4378 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4379 regnode *text_node = ST.B;
4380 if (! HAS_TEXT(text_node))
4381 FIND_NEXT_IMPT(text_node);
4384 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4386 But the former is redundant in light of the latter.
4388 if this changes back then the macro for
4389 IS_TEXT and friends need to change.
4391 if (PL_regkind[OP(text_node)] == EXACT)
4394 ST.c1 = (U8)*STRING(text_node);
4396 (IS_TEXTF(text_node))
4398 : (IS_TEXTFL(text_node))
4399 ? PL_fold_locale[ST.c1]
4406 PerlIO_printf(Perl_debug_log,
4407 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4408 (int)(REPORT_CODE_OFF+(depth*2)),
4411 if (ST.c1 != CHRTEST_VOID
4412 && UCHARAT(PL_reginput) != ST.c1
4413 && UCHARAT(PL_reginput) != ST.c2)
4415 /* simulate B failing */
4417 PerlIO_printf(Perl_debug_log,
4418 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4419 (int)(REPORT_CODE_OFF+(depth*2)),"",
4422 state_num = CURLYM_B_fail;
4423 goto reenter_switch;
4427 /* mark current A as captured */
4428 I32 paren = ST.me->flags;
4430 PL_regoffs[paren].start
4431 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4432 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4433 /*dmq: *PL_reglastcloseparen = paren; */
4436 PL_regoffs[paren].end = -1;
4437 if (cur_eval && cur_eval->u.eval.close_paren &&
4438 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4447 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4450 case CURLYM_B_fail: /* just failed to match a B */
4451 REGCP_UNWIND(ST.cp);
4453 if (ST.count == ARG2(ST.me) /* max */)
4455 goto curlym_do_A; /* try to match a further A */
4457 /* backtrack one A */
4458 if (ST.count == ARG1(ST.me) /* min */)
4461 locinput = HOPc(locinput, -ST.alen);
4462 goto curlym_do_B; /* try to match B */
4465 #define ST st->u.curly
4467 #define CURLY_SETPAREN(paren, success) \
4470 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4471 PL_regoffs[paren].end = locinput - PL_bostr; \
4472 *PL_reglastcloseparen = paren; \
4475 PL_regoffs[paren].end = -1; \
4478 case STAR: /* /A*B/ where A is width 1 */
4482 scan = NEXTOPER(scan);
4484 case PLUS: /* /A+B/ where A is width 1 */
4488 scan = NEXTOPER(scan);
4490 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4491 ST.paren = scan->flags; /* Which paren to set */
4492 if (ST.paren > PL_regsize)
4493 PL_regsize = ST.paren;
4494 if (ST.paren > *PL_reglastparen)
4495 *PL_reglastparen = ST.paren;
4496 ST.min = ARG1(scan); /* min to match */
4497 ST.max = ARG2(scan); /* max to match */
4498 if (cur_eval && cur_eval->u.eval.close_paren &&
4499 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4503 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4505 case CURLY: /* /A{m,n}B/ where A is width 1 */
4507 ST.min = ARG1(scan); /* min to match */
4508 ST.max = ARG2(scan); /* max to match */
4509 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4512 * Lookahead to avoid useless match attempts
4513 * when we know what character comes next.
4515 * Used to only do .*x and .*?x, but now it allows
4516 * for )'s, ('s and (?{ ... })'s to be in the way
4517 * of the quantifier and the EXACT-like node. -- japhy
4520 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4522 if (HAS_TEXT(next) || JUMPABLE(next)) {
4524 regnode *text_node = next;
4526 if (! HAS_TEXT(text_node))
4527 FIND_NEXT_IMPT(text_node);
4529 if (! HAS_TEXT(text_node))
4530 ST.c1 = ST.c2 = CHRTEST_VOID;
4532 if ( PL_regkind[OP(text_node)] != EXACT ) {
4533 ST.c1 = ST.c2 = CHRTEST_VOID;
4534 goto assume_ok_easy;
4537 s = (U8*)STRING(text_node);
4539 /* Currently we only get here when
4541 PL_rekind[OP(text_node)] == EXACT
4543 if this changes back then the macro for IS_TEXT and
4544 friends need to change. */
4547 if (IS_TEXTF(text_node))
4548 ST.c2 = PL_fold[ST.c1];
4549 else if (IS_TEXTFL(text_node))
4550 ST.c2 = PL_fold_locale[ST.c1];
4553 if (IS_TEXTF(text_node)) {
4554 STRLEN ulen1, ulen2;
4555 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4556 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4558 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4559 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4561 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4563 0 : UTF8_ALLOW_ANY);
4564 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4566 0 : UTF8_ALLOW_ANY);
4568 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4570 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4575 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4582 ST.c1 = ST.c2 = CHRTEST_VOID;
4587 PL_reginput = locinput;
4590 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4593 locinput = PL_reginput;
4595 if (ST.c1 == CHRTEST_VOID)
4596 goto curly_try_B_min;
4598 ST.oldloc = locinput;
4600 /* set ST.maxpos to the furthest point along the
4601 * string that could possibly match */
4602 if (ST.max == REG_INFTY) {
4603 ST.maxpos = PL_regeol - 1;
4605 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4609 int m = ST.max - ST.min;
4610 for (ST.maxpos = locinput;
4611 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4612 ST.maxpos += UTF8SKIP(ST.maxpos);
4615 ST.maxpos = locinput + ST.max - ST.min;
4616 if (ST.maxpos >= PL_regeol)
4617 ST.maxpos = PL_regeol - 1;
4619 goto curly_try_B_min_known;
4623 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4624 locinput = PL_reginput;
4625 if (ST.count < ST.min)
4627 if ((ST.count > ST.min)
4628 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4630 /* A{m,n} must come at the end of the string, there's
4631 * no point in backing off ... */
4633 /* ...except that $ and \Z can match before *and* after
4634 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4635 We may back off by one in this case. */
4636 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4640 goto curly_try_B_max;
4645 case CURLY_B_min_known_fail:
4646 /* failed to find B in a non-greedy match where c1,c2 valid */
4647 if (ST.paren && ST.count)
4648 PL_regoffs[ST.paren].end = -1;
4650 PL_reginput = locinput; /* Could be reset... */
4651 REGCP_UNWIND(ST.cp);
4652 /* Couldn't or didn't -- move forward. */
4653 ST.oldloc = locinput;
4655 locinput += UTF8SKIP(locinput);
4659 curly_try_B_min_known:
4660 /* find the next place where 'B' could work, then call B */
4664 n = (ST.oldloc == locinput) ? 0 : 1;
4665 if (ST.c1 == ST.c2) {
4667 /* set n to utf8_distance(oldloc, locinput) */
4668 while (locinput <= ST.maxpos &&
4669 utf8n_to_uvchr((U8*)locinput,
4670 UTF8_MAXBYTES, &len,
4671 uniflags) != (UV)ST.c1) {
4677 /* set n to utf8_distance(oldloc, locinput) */
4678 while (locinput <= ST.maxpos) {
4680 const UV c = utf8n_to_uvchr((U8*)locinput,
4681 UTF8_MAXBYTES, &len,
4683 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4691 if (ST.c1 == ST.c2) {
4692 while (locinput <= ST.maxpos &&
4693 UCHARAT(locinput) != ST.c1)
4697 while (locinput <= ST.maxpos
4698 && UCHARAT(locinput) != ST.c1
4699 && UCHARAT(locinput) != ST.c2)
4702 n = locinput - ST.oldloc;
4704 if (locinput > ST.maxpos)
4706 /* PL_reginput == oldloc now */
4709 if (regrepeat(rex, ST.A, n, depth) < n)
4712 PL_reginput = locinput;
4713 CURLY_SETPAREN(ST.paren, ST.count);
4714 if (cur_eval && cur_eval->u.eval.close_paren &&
4715 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4718 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4723 case CURLY_B_min_fail:
4724 /* failed to find B in a non-greedy match where c1,c2 invalid */
4725 if (ST.paren && ST.count)
4726 PL_regoffs[ST.paren].end = -1;
4728 REGCP_UNWIND(ST.cp);
4729 /* failed -- move forward one */
4730 PL_reginput = locinput;
4731 if (regrepeat(rex, ST.A, 1, depth)) {
4733 locinput = PL_reginput;
4734 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4735 ST.count > 0)) /* count overflow ? */
4738 CURLY_SETPAREN(ST.paren, ST.count);
4739 if (cur_eval && cur_eval->u.eval.close_paren &&
4740 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4743 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4751 /* a successful greedy match: now try to match B */
4752 if (cur_eval && cur_eval->u.eval.close_paren &&
4753 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4758 if (ST.c1 != CHRTEST_VOID)
4759 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4760 UTF8_MAXBYTES, 0, uniflags)
4761 : (UV) UCHARAT(PL_reginput);
4762 /* If it could work, try it. */
4763 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4764 CURLY_SETPAREN(ST.paren, ST.count);
4765 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4770 case CURLY_B_max_fail:
4771 /* failed to find B in a greedy match */
4772 if (ST.paren && ST.count)
4773 PL_regoffs[ST.paren].end = -1;
4775 REGCP_UNWIND(ST.cp);
4777 if (--ST.count < ST.min)
4779 PL_reginput = locinput = HOPc(locinput, -1);
4780 goto curly_try_B_max;
4787 /* we've just finished A in /(??{A})B/; now continue with B */
4789 st->u.eval.toggle_reg_flags
4790 = cur_eval->u.eval.toggle_reg_flags;
4791 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4793 st->u.eval.prev_rex = rex; /* inner */
4794 SETREX(rex,cur_eval->u.eval.prev_rex);
4795 rexi = RXi_GET(rex);
4796 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4798 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4799 REGCP_SET(st->u.eval.lastcp);
4800 PL_reginput = locinput;
4802 /* Restore parens of the outer rex without popping the
4804 tmpix = PL_savestack_ix;
4805 PL_savestack_ix = cur_eval->u.eval.lastcp;
4807 PL_savestack_ix = tmpix;
4809 st->u.eval.prev_eval = cur_eval;
4810 cur_eval = cur_eval->u.eval.prev_eval;
4812 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4813 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4814 if ( nochange_depth )
4817 PUSH_YES_STATE_GOTO(EVAL_AB,
4818 st->u.eval.prev_eval->u.eval.B); /* match B */
4821 if (locinput < reginfo->till) {
4822 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4823 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4825 (long)(locinput - PL_reg_starttry),
4826 (long)(reginfo->till - PL_reg_starttry),
4829 sayNO_SILENT; /* Cannot match: too short. */
4831 PL_reginput = locinput; /* put where regtry can find it */
4832 sayYES; /* Success! */
4834 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4836 PerlIO_printf(Perl_debug_log,
4837 "%*s %ssubpattern success...%s\n",
4838 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4839 PL_reginput = locinput; /* put where regtry can find it */
4840 sayYES; /* Success! */
4843 #define ST st->u.ifmatch
4845 case SUSPEND: /* (?>A) */
4847 PL_reginput = locinput;
4850 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4852 goto ifmatch_trivial_fail_test;
4854 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4856 ifmatch_trivial_fail_test:
4858 char * const s = HOPBACKc(locinput, scan->flags);
4863 sw = 1 - (bool)ST.wanted;
4867 next = scan + ARG(scan);
4875 PL_reginput = locinput;
4879 ST.logical = logical;
4880 /* execute body of (?...A) */
4881 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4884 case IFMATCH_A_fail: /* body of (?...A) failed */
4885 ST.wanted = !ST.wanted;
4888 case IFMATCH_A: /* body of (?...A) succeeded */
4890 sw = (bool)ST.wanted;
4892 else if (!ST.wanted)
4895 if (OP(ST.me) == SUSPEND)
4896 locinput = PL_reginput;
4898 locinput = PL_reginput = st->locinput;
4899 nextchr = UCHARAT(locinput);
4901 scan = ST.me + ARG(ST.me);
4904 continue; /* execute B */
4909 next = scan + ARG(scan);
4914 reginfo->cutpoint = PL_regeol;
4917 PL_reginput = locinput;
4919 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
4920 PUSH_STATE_GOTO(COMMIT_next,next);
4922 case COMMIT_next_fail:
4929 #define ST st->u.mark
4931 ST.prev_mark = mark_state;
4932 ST.mark_name = sv_commit = sv_yes_mark
4933 = (SV*)rexi->data->data[ ARG( scan ) ];
4935 ST.mark_loc = PL_reginput = locinput;
4936 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4938 case MARKPOINT_next:
4939 mark_state = ST.prev_mark;
4942 case MARKPOINT_next_fail:
4943 if (popmark && sv_eq(ST.mark_name,popmark))
4945 if (ST.mark_loc > startpoint)
4946 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4947 popmark = NULL; /* we found our mark */
4948 sv_commit = ST.mark_name;
4951 PerlIO_printf(Perl_debug_log,
4952 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4953 REPORT_CODE_OFF+depth*2, "",
4954 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
4957 mark_state = ST.prev_mark;
4958 sv_yes_mark = mark_state ?
4959 mark_state->u.mark.mark_name : NULL;
4963 PL_reginput = locinput;
4965 /* (*SKIP) : if we fail we cut here*/
4966 ST.mark_name = NULL;
4967 ST.mark_loc = locinput;
4968 PUSH_STATE_GOTO(SKIP_next,next);
4970 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
4971 otherwise do nothing. Meaning we need to scan
4973 regmatch_state *cur = mark_state;
4974 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
4977 if ( sv_eq( cur->u.mark.mark_name,
4980 ST.mark_name = find;
4981 PUSH_STATE_GOTO( SKIP_next, next );
4983 cur = cur->u.mark.prev_mark;
4986 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
4988 case SKIP_next_fail:
4990 /* (*CUT:NAME) - Set up to search for the name as we
4991 collapse the stack*/
4992 popmark = ST.mark_name;
4994 /* (*CUT) - No name, we cut here.*/
4995 if (ST.mark_loc > startpoint)
4996 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4997 /* but we set sv_commit to latest mark_name if there
4998 is one so they can test to see how things lead to this
5001 sv_commit=mark_state->u.mark.mark_name;
5010 locinput += UTF8SKIP(locinput);
5013 /* This malarky is to handle LATIN SMALL LETTER SHARP S
5015 if (0xDF==n && (UTF||do_utf8) &&
5016 toLOWER(locinput[0])=='s' && toLOWER(locinput[1])=='s')
5019 } else if (do_utf8) {
5020 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5022 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5024 to_uni_fold(n, tmpbuf1, &tmplen1);
5025 to_utf8_fold(locinput, tmpbuf2, &tmplen2);
5026 if (tmplen1!=tmplen2 || !strnEQ(tmpbuf1,tmpbuf2,tmplen1))
5029 locinput += UTF8SKIP(locinput);
5033 nextchr = UCHARAT(locinput);
5036 if ((n=is_LNBREAK(locinput,do_utf8))) {
5038 nextchr = UCHARAT(locinput);
5043 #define CASE_CLASS(nAmE) \
5045 if ((n=is_##nAmE(locinput,do_utf8))) { \
5047 nextchr = UCHARAT(locinput); \
5052 if ((n=is_##nAmE(locinput,do_utf8))) { \
5055 locinput += UTF8SKIP(locinput); \
5056 nextchr = UCHARAT(locinput); \
5061 CASE_CLASS(HORIZWS);
5065 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5066 PTR2UV(scan), OP(scan));
5067 Perl_croak(aTHX_ "regexp memory corruption");
5071 /* switch break jumps here */
5072 scan = next; /* prepare to execute the next op and ... */
5073 continue; /* ... jump back to the top, reusing st */
5077 /* push a state that backtracks on success */
5078 st->u.yes.prev_yes_state = yes_state;
5082 /* push a new regex state, then continue at scan */
5084 regmatch_state *newst;
5087 regmatch_state *cur = st;
5088 regmatch_state *curyes = yes_state;
5090 regmatch_slab *slab = PL_regmatch_slab;
5091 for (;curd > -1;cur--,curd--) {
5092 if (cur < SLAB_FIRST(slab)) {
5094 cur = SLAB_LAST(slab);
5096 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5097 REPORT_CODE_OFF + 2 + depth * 2,"",
5098 curd, PL_reg_name[cur->resume_state],
5099 (curyes == cur) ? "yes" : ""
5102 curyes = cur->u.yes.prev_yes_state;
5105 DEBUG_STATE_pp("push")
5108 st->locinput = locinput;
5110 if (newst > SLAB_LAST(PL_regmatch_slab))
5111 newst = S_push_slab(aTHX);
5112 PL_regmatch_state = newst;
5114 locinput = PL_reginput;
5115 nextchr = UCHARAT(locinput);
5123 * We get here only if there's trouble -- normally "case END" is
5124 * the terminating point.
5126 Perl_croak(aTHX_ "corrupted regexp pointers");
5132 /* we have successfully completed a subexpression, but we must now
5133 * pop to the state marked by yes_state and continue from there */
5134 assert(st != yes_state);
5136 while (st != yes_state) {
5138 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5139 PL_regmatch_slab = PL_regmatch_slab->prev;
5140 st = SLAB_LAST(PL_regmatch_slab);
5144 DEBUG_STATE_pp("pop (no final)");
5146 DEBUG_STATE_pp("pop (yes)");
5152 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5153 || yes_state > SLAB_LAST(PL_regmatch_slab))
5155 /* not in this slab, pop slab */
5156 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5157 PL_regmatch_slab = PL_regmatch_slab->prev;
5158 st = SLAB_LAST(PL_regmatch_slab);
5160 depth -= (st - yes_state);
5163 yes_state = st->u.yes.prev_yes_state;
5164 PL_regmatch_state = st;
5167 locinput= st->locinput;
5168 nextchr = UCHARAT(locinput);
5170 state_num = st->resume_state + no_final;
5171 goto reenter_switch;
5174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5175 PL_colors[4], PL_colors[5]));
5177 if (PL_reg_eval_set) {
5178 /* each successfully executed (?{...}) block does the equivalent of
5179 * local $^R = do {...}
5180 * When popping the save stack, all these locals would be undone;
5181 * bypass this by setting the outermost saved $^R to the latest
5183 if (oreplsv != GvSV(PL_replgv))
5184 sv_setsv(oreplsv, GvSV(PL_replgv));
5191 PerlIO_printf(Perl_debug_log,
5192 "%*s %sfailed...%s\n",
5193 REPORT_CODE_OFF+depth*2, "",
5194 PL_colors[4], PL_colors[5])
5206 /* there's a previous state to backtrack to */
5208 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5209 PL_regmatch_slab = PL_regmatch_slab->prev;
5210 st = SLAB_LAST(PL_regmatch_slab);
5212 PL_regmatch_state = st;
5213 locinput= st->locinput;
5214 nextchr = UCHARAT(locinput);
5216 DEBUG_STATE_pp("pop");
5218 if (yes_state == st)
5219 yes_state = st->u.yes.prev_yes_state;
5221 state_num = st->resume_state + 1; /* failure = success + 1 */
5222 goto reenter_switch;
5227 if (rex->intflags & PREGf_VERBARG_SEEN) {
5228 SV *sv_err = get_sv("REGERROR", 1);
5229 SV *sv_mrk = get_sv("REGMARK", 1);
5231 sv_commit = &PL_sv_no;
5233 sv_yes_mark = &PL_sv_yes;
5236 sv_commit = &PL_sv_yes;
5237 sv_yes_mark = &PL_sv_no;
5239 sv_setsv(sv_err, sv_commit);
5240 sv_setsv(sv_mrk, sv_yes_mark);
5243 /* clean up; in particular, free all slabs above current one */
5244 LEAVE_SCOPE(oldsave);
5250 - regrepeat - repeatedly match something simple, report how many
5253 * [This routine now assumes that it will only match on things of length 1.
5254 * That was true before, but now we assume scan - reginput is the count,
5255 * rather than incrementing count on every character. [Er, except utf8.]]
5258 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5261 register char *scan;
5263 register char *loceol = PL_regeol;
5264 register I32 hardcount = 0;
5265 register bool do_utf8 = PL_reg_match_utf8;
5267 PERL_UNUSED_ARG(depth);
5271 if (max == REG_INFTY)
5273 else if (max < loceol - scan)
5274 loceol = scan + max;
5279 while (scan < loceol && hardcount < max && *scan != '\n') {
5280 scan += UTF8SKIP(scan);
5284 while (scan < loceol && *scan != '\n')
5291 while (scan < loceol && hardcount < max) {
5292 scan += UTF8SKIP(scan);
5302 case EXACT: /* length of string is 1 */
5304 while (scan < loceol && UCHARAT(scan) == c)
5307 case EXACTF: /* length of string is 1 */
5309 while (scan < loceol &&
5310 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5313 case EXACTFL: /* length of string is 1 */
5314 PL_reg_flags |= RF_tainted;
5316 while (scan < loceol &&
5317 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5323 while (hardcount < max && scan < loceol &&
5324 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5325 scan += UTF8SKIP(scan);
5329 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5336 LOAD_UTF8_CHARCLASS_ALNUM();
5337 while (hardcount < max && scan < loceol &&
5338 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5339 scan += UTF8SKIP(scan);
5343 while (scan < loceol && isALNUM(*scan))
5348 PL_reg_flags |= RF_tainted;
5351 while (hardcount < max && scan < loceol &&
5352 isALNUM_LC_utf8((U8*)scan)) {
5353 scan += UTF8SKIP(scan);
5357 while (scan < loceol && isALNUM_LC(*scan))
5364 LOAD_UTF8_CHARCLASS_ALNUM();
5365 while (hardcount < max && scan < loceol &&
5366 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5367 scan += UTF8SKIP(scan);
5371 while (scan < loceol && !isALNUM(*scan))
5376 PL_reg_flags |= RF_tainted;
5379 while (hardcount < max && scan < loceol &&
5380 !isALNUM_LC_utf8((U8*)scan)) {
5381 scan += UTF8SKIP(scan);
5385 while (scan < loceol && !isALNUM_LC(*scan))
5392 LOAD_UTF8_CHARCLASS_SPACE();
5393 while (hardcount < max && scan < loceol &&
5395 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5396 scan += UTF8SKIP(scan);
5400 while (scan < loceol && isSPACE(*scan))
5405 PL_reg_flags |= RF_tainted;
5408 while (hardcount < max && scan < loceol &&
5409 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5410 scan += UTF8SKIP(scan);
5414 while (scan < loceol && isSPACE_LC(*scan))
5421 LOAD_UTF8_CHARCLASS_SPACE();
5422 while (hardcount < max && scan < loceol &&
5424 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5425 scan += UTF8SKIP(scan);
5429 while (scan < loceol && !isSPACE(*scan))
5434 PL_reg_flags |= RF_tainted;
5437 while (hardcount < max && scan < loceol &&
5438 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5439 scan += UTF8SKIP(scan);
5443 while (scan < loceol && !isSPACE_LC(*scan))
5450 LOAD_UTF8_CHARCLASS_DIGIT();
5451 while (hardcount < max && scan < loceol &&
5452 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5453 scan += UTF8SKIP(scan);
5457 while (scan < loceol && isDIGIT(*scan))
5464 LOAD_UTF8_CHARCLASS_DIGIT();
5465 while (hardcount < max && scan < loceol &&
5466 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5467 scan += UTF8SKIP(scan);
5471 while (scan < loceol && !isDIGIT(*scan))
5477 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5483 LNBREAK can match two latin chars, which is ok,
5484 because we have a null terminated string, but we
5485 have to use hardcount in this situation
5487 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5496 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5501 while (scan < loceol && is_HORIZWS_latin1(scan))
5508 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5509 scan += UTF8SKIP(scan);
5513 while (scan < loceol && !is_HORIZWS_latin1(scan))
5521 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5526 while (scan < loceol && is_VERTWS_latin1(scan))
5534 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5535 scan += UTF8SKIP(scan);
5539 while (scan < loceol && !is_VERTWS_latin1(scan))
5545 default: /* Called on something of 0 width. */
5546 break; /* So match right here or not at all. */
5552 c = scan - PL_reginput;
5556 GET_RE_DEBUG_FLAGS_DECL;
5558 SV * const prop = sv_newmortal();
5559 regprop(prog, prop, p);
5560 PerlIO_printf(Perl_debug_log,
5561 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5562 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5570 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5572 - regclass_swash - prepare the utf8 swash
5576 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5582 RXi_GET_DECL(prog,progi);
5583 const struct reg_data * const data = prog ? progi->data : NULL;
5585 if (data && data->count) {
5586 const U32 n = ARG(node);
5588 if (data->what[n] == 's') {
5589 SV * const rv = (SV*)data->data[n];
5590 AV * const av = (AV*)SvRV((SV*)rv);
5591 SV **const ary = AvARRAY(av);
5594 /* See the end of regcomp.c:S_regclass() for
5595 * documentation of these array elements. */
5598 a = SvROK(ary[1]) ? &ary[1] : 0;
5599 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5603 else if (si && doinit) {
5604 sw = swash_init("utf8", "", si, 1, 0);
5605 (void)av_store(av, 1, sw);
5622 - reginclass - determine if a character falls into a character class
5624 The n is the ANYOF regnode, the p is the target string, lenp
5625 is pointer to the maximum length of how far to go in the p
5626 (if the lenp is zero, UTF8SKIP(p) is used),
5627 do_utf8 tells whether the target string is in UTF-8.
5632 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5635 const char flags = ANYOF_FLAGS(n);
5641 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5642 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5643 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5644 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5645 if (len == (STRLEN)-1)
5646 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5649 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5650 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5653 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5654 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5657 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5661 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5664 if (swash_fetch(sw, p, do_utf8))
5666 else if (flags & ANYOF_FOLD) {
5667 if (!match && lenp && av) {
5669 for (i = 0; i <= av_len(av); i++) {
5670 SV* const sv = *av_fetch(av, i, FALSE);
5672 const char * const s = SvPV_const(sv, len);
5674 if (len <= plen && memEQ(s, (char*)p, len)) {
5682 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5685 to_utf8_fold(p, tmpbuf, &tmplen);
5686 if (swash_fetch(sw, tmpbuf, do_utf8))
5692 if (match && lenp && *lenp == 0)
5693 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5695 if (!match && c < 256) {
5696 if (ANYOF_BITMAP_TEST(n, c))
5698 else if (flags & ANYOF_FOLD) {
5701 if (flags & ANYOF_LOCALE) {
5702 PL_reg_flags |= RF_tainted;
5703 f = PL_fold_locale[c];
5707 if (f != c && ANYOF_BITMAP_TEST(n, f))
5711 if (!match && (flags & ANYOF_CLASS)) {
5712 PL_reg_flags |= RF_tainted;
5714 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5715 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5716 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5717 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5718 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5719 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5720 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5721 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5722 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5723 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5724 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5725 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5726 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5727 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5728 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5729 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5730 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5731 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5732 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5733 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5734 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5735 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5736 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5737 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5738 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5739 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5740 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5741 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5742 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5743 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5744 ) /* How's that for a conditional? */
5751 return (flags & ANYOF_INVERT) ? !match : match;
5755 S_reghop3(U8 *s, I32 off, const U8* lim)
5759 while (off-- && s < lim) {
5760 /* XXX could check well-formedness here */
5765 while (off++ && s > lim) {
5767 if (UTF8_IS_CONTINUED(*s)) {
5768 while (s > lim && UTF8_IS_CONTINUATION(*s))
5771 /* XXX could check well-formedness here */
5778 /* there are a bunch of places where we use two reghop3's that should
5779 be replaced with this routine. but since thats not done yet
5780 we ifdef it out - dmq
5783 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5787 while (off-- && s < rlim) {
5788 /* XXX could check well-formedness here */
5793 while (off++ && s > llim) {
5795 if (UTF8_IS_CONTINUED(*s)) {
5796 while (s > llim && UTF8_IS_CONTINUATION(*s))
5799 /* XXX could check well-formedness here */
5807 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5811 while (off-- && s < lim) {
5812 /* XXX could check well-formedness here */
5819 while (off++ && s > lim) {
5821 if (UTF8_IS_CONTINUED(*s)) {
5822 while (s > lim && UTF8_IS_CONTINUATION(*s))
5825 /* XXX could check well-formedness here */
5834 restore_pos(pTHX_ void *arg)
5837 regexp * const rex = (regexp *)arg;
5838 if (PL_reg_eval_set) {
5839 if (PL_reg_oldsaved) {
5840 rex->subbeg = PL_reg_oldsaved;
5841 rex->sublen = PL_reg_oldsavedlen;
5842 #ifdef PERL_OLD_COPY_ON_WRITE
5843 rex->saved_copy = PL_nrs;
5845 RX_MATCH_COPIED_on(rex);
5847 PL_reg_magic->mg_len = PL_reg_oldpos;
5848 PL_reg_eval_set = 0;
5849 PL_curpm = PL_reg_oldcurpm;
5854 S_to_utf8_substr(pTHX_ register regexp *prog)
5858 if (prog->substrs->data[i].substr
5859 && !prog->substrs->data[i].utf8_substr) {
5860 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5861 prog->substrs->data[i].utf8_substr = sv;
5862 sv_utf8_upgrade(sv);
5863 if (SvVALID(prog->substrs->data[i].substr)) {
5864 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5865 if (flags & FBMcf_TAIL) {
5866 /* Trim the trailing \n that fbm_compile added last
5868 SvCUR_set(sv, SvCUR(sv) - 1);
5869 /* Whilst this makes the SV technically "invalid" (as its
5870 buffer is no longer followed by "\0") when fbm_compile()
5871 adds the "\n" back, a "\0" is restored. */
5873 fbm_compile(sv, flags);
5875 if (prog->substrs->data[i].substr == prog->check_substr)
5876 prog->check_utf8 = sv;
5882 S_to_byte_substr(pTHX_ register regexp *prog)
5887 if (prog->substrs->data[i].utf8_substr
5888 && !prog->substrs->data[i].substr) {
5889 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5890 if (sv_utf8_downgrade(sv, TRUE)) {
5891 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5893 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5894 if (flags & FBMcf_TAIL) {
5895 /* Trim the trailing \n that fbm_compile added last
5897 SvCUR_set(sv, SvCUR(sv) - 1);
5899 fbm_compile(sv, flags);
5905 prog->substrs->data[i].substr = sv;
5906 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5907 prog->check_substr = sv;
5914 * c-indentation-style: bsd
5916 * indent-tabs-mode: t
5919 * ex: set ts=8 sts=4 sw=4 noet: