From: Ævar Arnfjörð Bjarmason Date: Thu, 28 Jun 2007 20:06:50 +0000 (+0000) Subject: Move the RXf_WHITE logic for split " " into the regex engine X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=0ac6acaed7c2092a5668c6b70ddeaf3003e989d8;p=p5sagit%2Fp5-mst-13.2.git Move the RXf_WHITE logic for split " " into the regex engine From: "Ævar Arnfjörð Bjarmason" Message-ID: <51dd1af80706281306i4dbba39em3eeb8da1d67ea27c@mail.gmail.com> (with tweaks) p4raw-id: //depot/perl@31495 --- diff --git a/ext/B/t/concise-xs.t b/ext/B/t/concise-xs.t index 9430830..e4a0241 100644 --- a/ext/B/t/concise-xs.t +++ b/ext/B/t/concise-xs.t @@ -117,7 +117,7 @@ use Getopt::Std; use Carp; use Test::More tests => ( # per-pkg tests (function ct + require_ok) 40 + 16 # Data::Dumper, Digest::MD5 - + 521 + 276 # B::Deparse, B + + 521 + 277 # B::Deparse, B + 595 + 190 # POSIX, IO::Socket - 6); # fudge diff --git a/op.c b/op.c index 613cc2e..cdd7aaf 100644 --- a/op.c +++ b/op.c @@ -3424,32 +3424,15 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, bool isreg) pm = (PMOP*)o; if (expr->op_type == OP_CONST) { - STRLEN plen; SV * const pat = ((SVOP*)expr)->op_sv; - const char *p = SvPV_const(pat, plen); U32 pm_flags = pm->op_pmflags & PMf_COMPILETIME; - if ((o->op_flags & OPf_SPECIAL) && (plen == 1 && *p == ' ')) { - U32 was_readonly = SvREADONLY(pat); - if (was_readonly) { - if (SvFAKE(pat)) { - sv_force_normal_flags(pat, 0); - assert(!SvREADONLY(pat)); - was_readonly = 0; - } else { - SvREADONLY_off(pat); - } - } + if (o->op_flags & OPf_SPECIAL) + pm_flags |= RXf_SPLIT; - sv_setpvn(pat, "\\s+", 3); - - SvFLAGS(pat) |= was_readonly; - - p = SvPV_const(pat, plen); - pm_flags |= RXf_SKIPWHITE; - } - if (DO_UTF8(pat)) + if (DO_UTF8(pat)) pm_flags |= RXf_UTF8; + PM_SETRE(pm, CALLREGCOMP(pat, pm_flags)); #ifdef PERL_MAD diff --git a/op.h b/op.h index f9147cd..65102ba 100644 --- a/op.h +++ b/op.h @@ -120,7 +120,7 @@ Deprecated. Use C instead. /* On OP_EXISTS, treat av as av, not avhv. */ /* On OP_(ENTER|LEAVE)EVAL, don't clear $@ */ /* On OP_ENTERITER, loop var is per-thread */ - /* On pushre, re is /\s+/ imp. by split " " */ + /* On pushre, rx is used as part of split, e.g. split " " */ /* On regcomp, "use re 'eval'" was in scope */ /* On OP_READLINE, was <$filehandle> */ /* On RV2[ACGHS]V, don't create GV--in diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod index 3b5dc85..084762d 100644 --- a/pod/perlreapi.pod +++ b/pod/perlreapi.pod @@ -124,20 +124,6 @@ Additional flags: =over 4 -=item RXf_SKIPWHITE - -If C is invoked as C or with no arguments (which -really means C, see L), perl will set -this flag and change the pattern from C<" "> to C<"\s+"> before it's -passed to the comp routine. - -If the flag is present in C<< rx->extflags >> C to delete -whitespace from the start of the subject string before it's operated -on. What is considered whitespace depends on whether the subject is a -UTF-8 string and whether the C flag is set. - -This probably always be preserved verbatim in C<< rx->extflags >>. - =item RXf_PMf_LOCALE Set if C is in effect. If present in C<< rx->extflags >> @@ -156,6 +142,16 @@ compilation. The perl engine for instance may upgrade non-UTF-8 strings to UTF-8 if the pattern includes constructs such as C<\x{...}> that can only match Unicode values. +=item RXf_SPLIT + +If C is invoked as C or with no arguments (which +really means C, see L), perl will +set this flag. The regex engine can then check for it and set the +SKIPWHITE and WHITE extflags. To do this the perl engine does: + + if (flags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + =back These flags can be set during compilation to enable optimizations in @@ -163,6 +159,16 @@ the C operator. =over 4 +=item RXf_SKIPWHITE + +If the flag is present in C<< rx->extflags >> C will delete +whitespace from the start of the subject string before it's operated +on. What is considered whitespace depends on whether the subject is a +UTF-8 string and whether the C flag is set. + +If RXf_WHITE is set in addition to this flag C will behave like +C under the perl engine. + =item RXf_START_ONLY Tells the split operator to split the target string on newlines @@ -180,9 +186,7 @@ without invoking the regex engine. The definition of whitespace varies depending on whether the target string is a UTF-8 string and on whether RXf_PMf_LOCALE is set. -Perl's engine sets this flag if the pattern is C<\s+>, which it will be if -the pattern actually was C<\s+> or if it was originally C<" "> (see -C above). +Perl's engine sets this flag if the pattern is C<\s+>. =back diff --git a/regcomp.c b/regcomp.c index 4e146b7..d7b9981 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4751,8 +4751,12 @@ reStudy: r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); else r->paren_names = NULL; - if (r->prelen == 3 && strnEQ("\\s+", r->precomp, 3)) /* precomp = "\\s+)" */ - r->extflags |= RXf_WHITE; + + if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3)) + r->extflags |= RXf_WHITE; else if (r->prelen == 1 && r->precomp[0] == '^') r->extflags |= RXf_START_ONLY; diff --git a/regexp.h b/regexp.h index 3ec8fb4..bf4c57d 100644 --- a/regexp.h +++ b/regexp.h @@ -307,7 +307,14 @@ and check for NULL. #define RXf_USE_INTUIT_NOML 0x01000000 #define RXf_USE_INTUIT_ML 0x02000000 #define RXf_INTUIT_TAIL 0x04000000 -/* one bit here */ + +/* + Set in Perl_pmruntime if op_flags & OPf_SPECIAL, i.e. split. Will + be used by regex engines to check whether they should set + RXf_SKIPWHITE +*/ +#define RXf_SPLIT 0x08000000 + #define RXf_USE_INTUIT (RXf_USE_INTUIT_NOML|RXf_USE_INTUIT_ML) /* Copy and tainted info */