From: Yves Orton Date: Sun, 22 Apr 2007 23:34:55 +0000 (+0200) Subject: Change meaning of \v, \V, and add \h, \H to match Perl6, add \R to match PCRE and... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e1d1eefb8c88e0dcaf2bb9e6c04d7f6192be966f;p=p5sagit%2Fp5-mst-13.2.git Change meaning of \v, \V, and add \h, \H to match Perl6, add \R to match PCRE and unicode tr18 Message-ID: <9b18b3110704221434g43457742p28cab00289f83639@mail.gmail.com> p4raw-id: //depot/perl@31026 --- diff --git a/MANIFEST b/MANIFEST index baf1e13..d997f37 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3297,6 +3297,7 @@ regcomp.c Regular expression compiler regcomp.h Private declarations for above regcomp.pl Builder of regnodes.h regcomp.sym Data for regnodes.h +regcharclass.h Match various character classes efficiently regen_lib.pl Common file routines for generator scripts regen_perly.pl generate perly.{act,h,tab} from perly.y regen.pl Run all scripts that (re)generate files diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 72f33cb..64de8b1 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -829,7 +829,10 @@ sub UnicodeData_Txt() Table->New(Is => 'Word', Desc => "[[:Word:]]", Fuzzy => 0); $Cat{SpacePerl} = Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0); - + $Cat{VertSpace} = + Table->New(Is => 'VertSpace', Desc => '\v', Fuzzy => 0); + $Cat{HorizSpace} = + Table->New(Is => 'HorizSpace', Desc => '\h', Fuzzy => 0); my %To; $To{Upper} = Table->New(); $To{Lower} = Table->New(); @@ -886,6 +889,15 @@ sub UnicodeData_Txt() $Cat{SpacePerl}->$op($code) if $isspace && $code != 0x000B; # Backward compat. 
+ $Cat{VertSpace}->$op($code) if grep {$code == $_} + ( 0x0A..0x0D,0x85,0x2028,0x2029 ); + + $Cat{HorizSpace}->$op($code) if grep {$code == $_} ( + 0x09, 0x20, 0xa0, 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, + 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a, + 0x202f, 0x205f, 0x3000 + ); + $Cat{Blank}->$op($code) if $isspace && !($code == 0x000A || $code == 0x000B || diff --git a/pod/perlre.pod b/pod/perlre.pod index 66935d2..1865232 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -233,7 +233,7 @@ An unescaped C<$> or C<@> interpolates the corresponding variable, while escaping will cause the literal string C<\$> to be matched. You'll need to write something like C. -=head3 Character classes +=head3 Character Classes and other Special Escapes In addition, Perl defines the following: X<\w> X<\W> X<\s> X<\S> X<\d> X<\D> X<\X> X<\p> X<\P> X<\C> @@ -265,8 +265,11 @@ X X X X \x12 Hexadecimal escape sequence \x{1234} Long hexadecimal escape sequence \K Keep the stuff left of the \K, don't include it in $& - \v Shortcut for (*PRUNE) - \V Shortcut for (*SKIP) + \v Vertical whitespace + \V Not vertical whitespace + \h Horizontal whitespace + \H Not horizontal whitespace + \R Linebreak (matches like \v inside of a charclass) A C<\w> matches a single alphanumeric character (an alphabetic character, or a decimal digit) or C<_>, not a whole word. Use C<\w+> @@ -283,6 +286,15 @@ your own C<\p> and C<\P> properties, and L about Unicode in general. X<\w> X<\W> X +C<\R> will atomically match a linebreak, including the network line-ending +"\x0D\x0A". Specifically, C<\R> is exactly equivalent to + + (?>\x0D\x0A?|[\x0A-\x0C\x85\x{2028}\x{2029}]) + +B<Note:> C<\R> has no special meaning inside of a character class; +use C<\v> instead (vertical whitespace). 
+X<\R> + The POSIX character class syntax X diff --git a/regcharclass.h b/regcharclass.h new file mode 100644 index 0000000..3fc92d7 --- /dev/null +++ b/regcharclass.h @@ -0,0 +1,250 @@ +/****** WARNING WARNING WARNING ********/ +/* */ +/* Autogenerated code, do not modify! */ +/* */ +/****** WARNING WARNING WARNING ********/ + +/* LNBREAK Line Break: \j \J + Codepoints: 0x0A, 0x0B, 0x0C, 0x0D, 0x0D.0x0A, 0x85, 0x2028, 0x2029 + */ +#define is_LNBREAK(s,is_utf8) /*** Line Break: \j \J ***/ \ +( ( ((U8*)s)[0]==13 ) ? \ + ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 : \ +( (is_utf8) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\ + ( ((U8*)s)[0]==133 ) ) ) ) + +#define is_LNBREAK_safe(s,e,is_utf8) /*** Line Break: \j \J ***/ \ +( ( (e) - (s) > 2 ) ? \ + ( ( ((U8*)s)[0]==13 ) ? \ + ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 : \ +( (is_utf8) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\ + ( ((U8*)s)[0]==133 ) ) ) ) : \ +( ( (e) - (s) > 1 ) ? \ + ( ( ((U8*)s)[0]==13 ) ? \ + ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 : \ +( (is_utf8) ? \ + ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==133 )) ? 2 : 0) : \ + ( ((U8*)s)[0]==133 ) ) ) ) : \ +( ( (e) - (s) > 0 ) ? \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \ +( (!is_utf8) ? \ + ( ((U8*)s)[0]==133 ) : 0 ) ) : 0 ) ) ) + +#define is_LNBREAK_utf8(s) /*** Line Break: \j \J ***/ \ +( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + ( ( ((U8*)s)[0]==226 ) ? \ + ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\ + ( ( ((U8*)s)[0]==13 ) ? \ + ( ( ((U8*)s)[1]==10 ) ? 
2 : 1 ) : \ + (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) ) + +#define is_LNBREAK_utf8_safe(s,e) /*** Line Break: \j \J ***/ \ +( ( (e) - (s) > 2 ) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + ( ( ((U8*)s)[0]==226 ) ? \ + ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\ + ( ( ((U8*)s)[0]==13 ) ? \ + ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ + (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) ) : \ +( ( (e) - (s) > 1 ) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + ( ( ((U8*)s)[0]==13 ) ? \ + ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ + (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) : \ +( ( (e) - (s) > 0 ) ? \ + (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) : 0 ) ) ) + +#define is_LNBREAK_latin1(s) /*** Line Break: \j \J ***/ \ +( ( ((U8*)s)[0]==13 ) ? \ + ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) || ((U8*)s)[0]==133 ) ) + +#define is_LNBREAK_latin1_safe(s,e) /*** Line Break: \j \J ***/ \ +( ( (e) - (s) > 1 ) ? \ + ( ( ((U8*)s)[0]==13 ) ? \ + ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) || ((U8*)s)[0]==133 ) ) : \ +( ( (e) - (s) > 0 ) ? \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) : 0 ) ) + +#define is_LNBREAK_cp(cp) /*** Line Break: \j \J ***/ \ +( (10<=cp && cp<=13) || ( cp > 13 && ( cp==133 || ( cp > 133 && ( cp==8232 || ( cp > 8232 && cp==8233 ) ) ) ) ) ) + +/****** WARNING WARNING WARNING ********/ +/* */ +/* Autogenerated code, do not modify! */ +/* */ +/****** WARNING WARNING WARNING ********/ + +/* HORIZWS Horizontal Whitespace: \h \H + Codepoints: 0x09, 0x20, 0xA0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, + 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, + 0x205F, 0x3000 + */ +#define is_HORIZWS(s,is_utf8) /*** Horizontal Whitespace: \h \H ***/ \ +( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \ +( (is_utf8) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? 
2 : 0 ) : \ + ( ( ((U8*)s)[0]==225 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? \ + ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) : \ + ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) : \ + ( ( ((U8*)s)[0]==226 ) ? \ + ( ( ((U8*)s)[1]==129 ) ? \ + ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) : \ + ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\ + (((( ((U8*)s)[0]==227 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) ) ) :\ + ( ((U8*)s)[0]==160 ) ) ) + +#define is_HORIZWS_safe(s,e,is_utf8) /*** Horizontal Whitespace: \h \H ***/ \ +( ( (e) - (s) > 2 ) ? \ + ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \ +( (is_utf8) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \ + ( ( ((U8*)s)[0]==225 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? \ + ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) : \ + ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) : \ + ( ( ((U8*)s)[0]==226 ) ? \ + ( ( ((U8*)s)[1]==129 ) ? \ + ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) : \ + ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\ + (((( ((U8*)s)[0]==227 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) ) ) :\ + ( ((U8*)s)[0]==160 ) ) ) : \ +( ( (e) - (s) > 1 ) ? \ + ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \ +( (is_utf8) ? \ + ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==160 )) ? 2 : 0) : \ + ( ((U8*)s)[0]==160 ) ) ) : \ +( ( (e) - (s) > 0 ) ? \ + ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \ +( (!is_utf8) ? \ + ( ((U8*)s)[0]==160 ) : 0 ) ) : 0 ) ) ) + +#define is_HORIZWS_utf8(s) /*** Horizontal Whitespace: \h \H ***/ \ +( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \ + ( ( ((U8*)s)[0]==225 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? \ + ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) : \ + ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) : \ + ( ( ((U8*)s)[0]==226 ) ? \ + ( ( ((U8*)s)[1]==129 ) ? \ + ( ( ((U8*)s)[2]==159 ) ? 
3 : 0 ) : \ + ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\ + ( ( ((U8*)s)[0]==227 ) ? \ + ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) : \ + ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) ) ) ) + +#define is_HORIZWS_utf8_safe(s,e) /*** Horizontal Whitespace: \h \H ***/ \ +( ( (e) - (s) > 2 ) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \ + ( ( ((U8*)s)[0]==225 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? \ + ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) : \ + ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) : \ + ( ( ((U8*)s)[0]==226 ) ? \ + ( ( ((U8*)s)[1]==129 ) ? \ + ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) : \ + ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\ + ( ( ((U8*)s)[0]==227 ) ? \ + ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) : \ + ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) ) ) ) : \ +( ( (e) - (s) > 1 ) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \ + ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) : \ +( ( (e) - (s) > 0 ) ? \ + ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) : 0 ) ) ) + +#define is_HORIZWS_latin1(s) /*** Horizontal Whitespace: \h \H ***/ \ +( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 || ((U8*)s)[0]==160 ) + +#define is_HORIZWS_latin1_safe(s,e) /*** Horizontal Whitespace: \h \H ***/ \ +( ( (e) - (s) > 0 ) ? \ + ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 || ((U8*)s)[0]==160 ) : 0 ) + +#define is_HORIZWS_cp(cp) /*** Horizontal Whitespace: \h \H ***/ \ +( cp==9 || ( cp > 9 && ( cp==32 || ( cp > 32 && ( cp==160 || ( cp > 160 && ( cp==5760 || ( cp > 5760 && ( cp==6158 || ( cp > 6158 && ( (8192<=cp && cp<=8202) || ( cp > 8202 && ( cp==8239 || ( cp > 8239 && ( cp==8287 || ( cp > 8287 && cp==12288 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) + +/****** WARNING WARNING WARNING ********/ +/* */ +/* Autogenerated code, do not modify! 
*/ +/* */ +/****** WARNING WARNING WARNING ********/ + +/* VERTWS Vertical Whitespace: \v \V + Codepoints: 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029 + */ +#define is_VERTWS(s,is_utf8) /*** Vertical Whitespace: \v \V ***/ \ +( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \ +( (is_utf8) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\ + ( ((U8*)s)[0]==133 ) ) ) + +#define is_VERTWS_safe(s,e,is_utf8) /*** Vertical Whitespace: \v \V ***/ \ +( ( (e) - (s) > 2 ) ? \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \ +( (is_utf8) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\ + ( ((U8*)s)[0]==133 ) ) ) : \ +( ( (e) - (s) > 1 ) ? \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \ +( (is_utf8) ? \ + ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==133 )) ? 2 : 0) : \ + ( ((U8*)s)[0]==133 ) ) ) : \ +( ( (e) - (s) > 0 ) ? \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \ +( (!is_utf8) ? \ + ( ((U8*)s)[0]==133 ) : 0 ) ) : 0 ) ) ) + +#define is_VERTWS_utf8(s) /*** Vertical Whitespace: \v \V ***/ \ +( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + ( ( ((U8*)s)[0]==226 ) ? \ + ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\ + (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) ) + +#define is_VERTWS_utf8_safe(s,e) /*** Vertical Whitespace: \v \V ***/ \ +( ( (e) - (s) > 2 ) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + ( ( ((U8*)s)[0]==226 ) ? \ + ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\ + (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) ) : \ +( ( (e) - (s) > 1 ) ? \ + ( ( ((U8*)s)[0]==194 ) ? \ + ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \ + (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) : \ +( ( (e) - (s) > 0 ) ? 
\ + (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) : 0 ) ) ) + +#define is_VERTWS_latin1(s) /*** Vertical Whitespace: \v \V ***/ \ +( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) + +#define is_VERTWS_latin1_safe(s,e) /*** Vertical Whitespace: \v \V ***/ \ +( ( (e) - (s) > 0 ) ? \ + ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) : 0 ) + +#define is_VERTWS_cp(cp) /*** Vertical Whitespace: \v \V ***/ \ +( (10<=cp && cp<=13) || ( cp > 13 && ( cp==133 || ( cp > 133 && ( cp==8232 || ( cp > 8232 && cp==8233 ) ) ) ) ) ) + diff --git a/regcomp.c b/regcomp.c index e24d146..48a8a30 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2400,6 +2400,34 @@ typedef struct scan_frame { #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) +#define CASE_SYNST_FNC(nAmE) \ +case nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break; \ +case N ## nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break + + + STATIC I32 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, @@ -3330,6 +3358,34 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; } } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + int value = 0; + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + if (flags & SCF_DO_STCLASS_AND) { + for (value = 0; value < 256; value++) + if (!is_LNBREAK_cp(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + for (value = 
0; value < 256; value++) + if (is_LNBREAK_cp(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + min += 1; + delta += 2; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += 2; + data->longest = &(data->longest_float); + } + + } else if (strchr((const char*)PL_simple,OP(scan))) { int value = 0; @@ -3524,6 +3580,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } break; + CASE_SYNST_FNC(VERTWS); + CASE_SYNST_FNC(HORIZWS); + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -3894,6 +3953,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ + /* Else: zero-length, ignore. */ scan = regnext(scan); } @@ -6585,15 +6645,25 @@ tryagain: ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'h': + ret = reg_node(pRExC_state, HORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'H': + ret = reg_node(pRExC_state, NHORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; case 'v': - ret = reganode(pRExC_state, PRUNE, 0); - ret->flags = 1; - *flagp |= SIMPLE; + ret = reg_node(pRExC_state, VERTWS); + *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'V': - ret = reganode(pRExC_state, SKIP, 0); - ret->flags = 1; - *flagp |= SIMPLE; + ret = reg_node(pRExC_state, NVERTWS); + *flagp |= HASWIDTH|SIMPLE; finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ @@ -6815,11 +6885,13 @@ tryagain: case 'C': /* Single char !DANGEROUS! 
*/ case 'd': case 'D': /* digit class */ case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ case 'k': case 'K': /* named backref, keep marker */ case 'N': /* named char sequence */ case 'p': case 'P': /* unicode property */ + case 'R': /* LNBREAK */ case 's': case 'S': /* space class */ - case 'v': case 'V': /* (*PRUNE) and (*SKIP) */ + case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ case 'X': /* eXtended Unicode "combining character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ @@ -7242,6 +7314,21 @@ case ANYOF_N##NAME: \ what = WORD; \ break +#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ +ANYOF_##NAME: \ + for (value = 0; value < 256; value++) \ + if (TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '+'; \ + what = WORD; \ + break; \ +case ANYOF_N##NAME: \ + for (value = 0; value < 256; value++) \ + if (!TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '!'; \ + what = WORD; \ + break /* parse a class specification and produce either an ANYOF node that @@ -7254,10 +7341,10 @@ STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) { dVAR; - register UV value = 0; register UV nextvalue; register IV prevvalue = OOB_UNICODE; register IV range = 0; + UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ register regnode *ret; STRLEN numlen; IV namedclass; @@ -7360,6 +7447,10 @@ parseit: case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { /* We only pay attention to the first char of @@ -7538,6 +7629,8 @@ parseit: case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); case _C_C_T_(UPPER, isUPPER(value), "Upper"); case _C_C_T_(XDIGIT, isXDIGIT(value), 
"XDigit"); + case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); + case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); case ANYOF_ASCII: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ASCII); diff --git a/regcomp.h b/regcomp.h index 1a0916a..70fdeb5 100644 --- a/regcomp.h +++ b/regcomp.h @@ -7,6 +7,7 @@ * License or the Artistic License, as specified in the README file. * */ +#include "regcharclass.h" typedef OP OP_4tree; /* Will be redefined later. */ @@ -177,7 +178,7 @@ struct regnode_2 { #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ -#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ +#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 40 (8*5) named classes */ /* also used by trie */ struct regnode_charclass { @@ -345,6 +346,14 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ANYOF_MAX 32 +/* pseudo classes, not stored in the class bitmap, but used as flags + during compilation of char classes */ + +#define ANYOF_VERTWS (ANYOF_MAX+1) +#define ANYOF_NVERTWS (ANYOF_MAX+2) +#define ANYOF_HORIZWS (ANYOF_MAX+3) +#define ANYOF_NHORIZWS (ANYOF_MAX+4) + /* Backward source code compatibility. */ #define ANYOF_ALNUML ANYOF_ALNUM @@ -444,6 +453,8 @@ EXTCONST U8 PL_simple[] = { SPACE, SPACEL, NSPACE, NSPACEL, DIGIT, NDIGIT, + VERTWS, NVERTWS, + HORIZWS, NHORIZWS, 0 }; #endif @@ -799,3 +810,4 @@ re.pm, especially to the documentation. #endif /* DEBUG RELATED DEFINES */ + diff --git a/regcomp.sym b/regcomp.sym index c57a386..070fe98 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -185,7 +185,14 @@ CUTGROUP VERB, no-sv 1 On failure go to the next alternation in the group #*Control what to keep in $&. KEEPS KEEPS, no $& begins here. -# NEW STUFF ABOVE THIS LINE -- Please update counts below. 
+#*New charclass like patterns +LNBREAK LNBREAK, none generic newline pattern +VERTWS VERTWS, none vertical whitespace (Perl 6) +NVERTWS NVERTWS, none not vertical whitespace (Perl 6) +HORIZWS HORIZWS, none horizontal whitespace (Perl 6) +NHORIZWS NHORIZWS, none not horizontal whitespace (Perl 6) + +# NEW STUFF ABOVE THIS LINE ################################################################################ diff --git a/regexec.c b/regexec.c index 1eb7ff2..fa853a4 100644 --- a/regexec.c +++ b/regexec.c @@ -1110,6 +1110,15 @@ REXEC_FBC_SCAN( \ if ((!reginfo || regtry(reginfo, &s))) \ goto got_it +#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ + if (do_utf8) { \ + REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ + } \ + else { \ + REXEC_FBC_CLASS_SCAN(CoNd); \ + } \ + break + #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \ if (do_utf8) { \ UtFpReLoAd; \ @@ -1425,6 +1434,31 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, !isDIGIT_LC_utf8((U8*)s), !isDIGIT_LC(*s) ); + case LNBREAK: + REXEC_FBC_CSCAN( + is_LNBREAK_utf8(s), + is_LNBREAK_latin1(s) + ); + case VERTWS: + REXEC_FBC_CSCAN( + is_VERTWS_utf8(s), + is_VERTWS_latin1(s) + ); + case NVERTWS: + REXEC_FBC_CSCAN( + !is_VERTWS_utf8(s), + !is_VERTWS_latin1(s) + ); + case HORIZWS: + REXEC_FBC_CSCAN( + is_HORIZWS_utf8(s), + is_HORIZWS_latin1(s) + ); + case NHORIZWS: + REXEC_FBC_CSCAN( + !is_HORIZWS_utf8(s), + !is_HORIZWS_latin1(s) + ); case AHOCORASICKC: case AHOCORASICK: { @@ -3207,8 +3241,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * pack("U0U*", 0xDF) =~ /ss/i, * the 0xC3 0x9F are the UTF-8 * byte sequence for the U+00DF. 
*/ + if (!(do_utf8 && - toLOWER(s[0]) == 's' && + toLOWER(s[0]) == 's' && ln >= 2 && toLOWER(s[1]) == 's' && (U8)l[0] == 0xC3 && @@ -4972,6 +5007,35 @@ NULL /* NOTREACHED */ #undef ST + case LNBREAK: + if ((n=is_LNBREAK(locinput,do_utf8))) { + locinput += n; + nextchr = UCHARAT(locinput); + } else + sayNO; + break; + +#define CASE_CLASS(nAmE) \ + case nAmE: \ + if ((n=is_##nAmE(locinput,do_utf8))) { \ + locinput += n; \ + nextchr = UCHARAT(locinput); \ + } else \ + sayNO; \ + break; \ + case N##nAmE: \ + if ((n=is_##nAmE(locinput,do_utf8))) { \ + sayNO; \ + } else { \ + locinput += UTF8SKIP(locinput); \ + nextchr = UCHARAT(locinput); \ + } \ + break + + CASE_CLASS(VERTWS); + CASE_CLASS(HORIZWS); +#undef CASE_CLASS + default: PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", PTR2UV(scan), OP(scan)); @@ -5382,7 +5446,77 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) while (scan < loceol && !isDIGIT(*scan)) scan++; } + case LNBREAK: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) { + scan += c; + hardcount++; + } + } else { + /* + LNBREAK can match two latin chars, which is ok, + because we have a null terminated string, but we + have to use hardcount in this situation + */ + while (scan < loceol && (c=is_LNBREAK_latin1(scan))) { + scan+=c; + hardcount++; + } + } + break; + case HORIZWS: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) { + scan += c; + hardcount++; + } + } else { + while (scan < loceol && is_HORIZWS_latin1(scan)) + scan++; + } break; + case NHORIZWS: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !is_HORIZWS_latin1(scan)) + scan++; + + } + break; + case VERTWS: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && 
(c=is_VERTWS_utf8(scan))) { + scan += c; + hardcount++; + } + } else { + while (scan < loceol && is_VERTWS_latin1(scan)) + scan++; + + } + break; + case NVERTWS: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !is_VERTWS_latin1(scan)) + scan++; + + } + break; + default: /* Called on something of 0 width. */ break; /* So match right here or not at all. */ } diff --git a/regnodes.h b/regnodes.h index e704427..3c3a5d6 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 84 -#define REGMATCH_STATE_MAX 124 +#define REGNODE_MAX 89 +#define REGMATCH_STATE_MAX 129 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -92,8 +92,13 @@ #define COMMIT 80 /* 0x50 Pattern fails outright if backtracking through this */ #define CUTGROUP 81 /* 0x51 On failure go to the next alternation in the group */ #define KEEPS 82 /* 0x52 $& begins here. */ -#define OPTIMIZED 83 /* 0x53 Placeholder for dump. */ -#define PSEUDO 84 /* 0x54 Pseudo opcode for internal use. */ +#define LNBREAK 83 /* 0x53 generic newline pattern */ +#define VERTWS 84 /* 0x54 vertical whitespace (Perl 6) */ +#define NVERTWS 85 /* 0x55 not vertical whitespace (Perl 6) */ +#define HORIZWS 86 /* 0x56 horizontal whitespace (Perl 6) */ +#define NHORIZWS 87 /* 0x57 not horizontal whitespace (Perl 6) */ +#define OPTIMIZED 88 /* 0x58 Placeholder for dump. */ +#define PSEUDO 89 /* 0x59 Pseudo opcode for internal use. 
*/ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -225,6 +230,11 @@ EXTCONST U8 PL_regkind[] = { VERB, /* COMMIT */ VERB, /* CUTGROUP */ KEEPS, /* KEEPS */ + LNBREAK, /* LNBREAK */ + VERTWS, /* VERTWS */ + NVERTWS, /* NVERTWS */ + HORIZWS, /* HORIZWS */ + NHORIZWS, /* NHORIZWS */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -358,6 +368,11 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* COMMIT */ EXTRA_SIZE(struct regnode_1), /* CUTGROUP */ 0, /* KEEPS */ + 0, /* LNBREAK */ + 0, /* VERTWS */ + 0, /* NVERTWS */ + 0, /* HORIZWS */ + 0, /* NHORIZWS */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -448,6 +463,11 @@ static const char reg_off_by_arg[] = { 0, /* COMMIT */ 0, /* CUTGROUP */ 0, /* KEEPS */ + 0, /* LNBREAK */ + 0, /* VERTWS */ + 0, /* NVERTWS */ + 0, /* HORIZWS */ + 0, /* NHORIZWS */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -543,8 +563,13 @@ EXTCONST char * const PL_reg_name[] = { "COMMIT", /* 0x50 */ "CUTGROUP", /* 0x51 */ "KEEPS", /* 0x52 */ - "OPTIMIZED", /* 0x53 */ - "PSEUDO", /* 0x54 */ + "LNBREAK", /* 0x53 */ + "VERTWS", /* 0x54 */ + "NVERTWS", /* 0x55 */ + "HORIZWS", /* 0x56 */ + "NHORIZWS", /* 0x57 */ + "OPTIMIZED", /* 0x58 */ + "PSEUDO", /* 0x59 */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ diff --git a/t/op/pat.t b/t/op/pat.t index 1af8fb3..a5b98f6 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3386,7 +3386,7 @@ ok(("foba ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i) } - +print "# set PERL_SKIP_PSYCHO_TEST to skip this test\n"; if (!$ENV{PERL_SKIP_PSYCHO_TEST}){ my @normal=qw(these are some normal words); my $psycho=join "|",@normal,map chr $_,255..20000; @@ -3773,6 +3773,7 @@ sub iseq($$;$) { if ($ENV{PERL_SKIP_PSYCHO_TEST}){ printf "ok %d Skip: No psycho tests\n", $test++; } else { + 
print "# set PERL_SKIP_PSYCHO_TEST to skip this test\n"; my $r = qr/^ (?: ( (?:a|z+)+ ) @@ -3913,25 +3914,6 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { 1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g; iseq($count,4,"/.(*PRUNE)/"); } -{ # Test the \v form of the (*PRUNE) pattern - our $count = 0; - 'aaab'=~/a+b?(?{$count++})(*FAIL)/; - iseq($count,9,"expect 9 for no \\v"); - $count = 0; - 'aaab'=~/a+b?\v(?{$count++})(*FAIL)/; - iseq($count,3,"expect 3 with \\v"); - local $_='aaab'; - $count=0; - 1 while /.\v(?{$count++})(*FAIL)/g; - iseq($count,4,"/.\\v/"); - $count = 0; - 'aaab'=~/a+b?(??{'\v'})(?{$count++})(*FAIL)/; - iseq($count,3,"expect 3 with \\v"); - local $_='aaab'; - $count=0; - 1 while /.(??{'\v'})(?{$count++})(*FAIL)/g; - iseq($count,4,"/.\\v/"); -} { # Test the (*SKIP) pattern our $count = 0; 'aaab'=~/a+b?(*SKIP)(?{$count++})(*FAIL)/; @@ -3947,21 +3929,6 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { iseq($count,2,"Expect 2 with (*SKIP)" ); iseq("@res","aaab aaab","adjacent (*SKIP) works as expected" ); } -{ # Test the \V form of the (*SKIP) pattern - our $count = 0; - 'aaab'=~/a+b?\V(?{$count++})(*FAIL)/; - iseq($count,1,"expect 1 with \\V"); - local $_='aaab'; - $count=0; - 1 while /.\V(?{$count++})(*FAIL)/g; - iseq($count,4,"/.\\V/"); - $_='aaabaaab'; - $count=0; - our @res=(); - 1 while /(a+b?)\V(?{$count++; push @res,$1})(*FAIL)/g; - iseq($count,2,"Expect 2 with \\V" ); - iseq("@res","aaab aaab","adjacent \\V works as expected" ); -} { # Test the (*SKIP) pattern our $count = 0; 'aaab'=~/a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/; @@ -4345,7 +4312,41 @@ sub kt iseq("$1$2",'foooooobaaaaar'); } iseq("$1$2","foobar"); +} +{ + local $_="\t \r\n \n \t".chr(11)."\n"; + s/\H/H/g; + s/\h/h/g; + iseq($_,"hhHHhHhhHH"); + $_="\t \r\n \n \t".chr(11)."\n"; + utf8::upgrade($_); + s/\H/H/g; + s/\h/h/g; + iseq($_,"hhHHhHhhHH"); } +{ + my @h=map { chr( $_ ) } ( + 0x09, 0x20, 0xa0, 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, + 0x2003, 0x2004, 0x2005, 
0x2006, 0x2007, 0x2008, 0x2009, 0x200a, + 0x202f, 0x205f, 0x3000 + ); + my @v=map { chr( $_ ) } ( 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029 ); + my @lb=( "\x0D\x0A", + map { chr( $_ ) } ( 0x0A..0x0D,0x85,0x2028,0x2029 )); + foreach my $t ([\@h,qr/\h/,qr/\h+/],[\@v,qr/\v/,qr/\v+/],[\@lb,qr/\R/,qr/\R+/],){ + my $ary=shift @$t; + foreach my $pat (@$t) { + foreach my $str (@$ary) { + ok($str=~/($pat)/); + iseq($1,$str); + utf8::upgrade($str); + ok($str=~/($pat)/); + iseq($1,$str); + } + } + } +} + # Test counter is at bottom of file. Put new tests above here. #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4427,7 +4428,8 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/); iseq(0+$::test,$::TestCount,"Got the right number of tests!"); # Don't forget to update this! BEGIN { - $::TestCount = 1663; + $::TestCount = 1928; print "1..$::TestCount\n"; } + diff --git a/t/op/re_tests b/t/op/re_tests index aa07b56..6eb03a2 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -1283,3 +1283,44 @@ X(\w+)(?=\s)|X(\w+) Xab y [$1-$2] [-ab] (?(DEFINE)(?(?&B)+)(?a))(?&A) a y $& a (?(DEFINE)(?(?&B)+)(?a))(?&A) aa y $& aa \x{100}?(??{""})xxx xxx y $& xxx + +foo(\R)bar foo\r\nbar y $1 \r\n +foo(\R)bar foo\nbar y $1 \n +foo(\R)bar foo\rbar y $1 \r + +foo(\R+)bar foo\r\n\x{85}\r\n\nbar y $1 \r\n\x{85}\r\n\n +(\V+)(\R) foo\r\n\x{85}\r\n\nbar y $1-$2 foo-\r\n +(\R+)(\V) foo\r\n\x{85}\r\n\nbar y $1-$2 \r\n\x{85}\r\n\n-b +foo(\R)bar foo\x{85}bar y $1 \x{85} +(\V)(\R) foo\x{85}bar y $1-$2 o-\x{85} +(\R)(\V) foo\x{85}bar y $1-$2 \x{85}-b +foo(\R)bar foo\r\nbar y $1 \r\n +(\V)(\R) foo\r\nbar y $1-$2 o-\r\n +(\R)(\V) foo\r\nbar y $1-$2 \r\n-b +foo(\R)bar foo\r\nbar y $1 \r\n +(\V)(\R) foo\r\nbar y $1-$2 o-\r\n +(\R)(\V) foo\r\nbar y $1-$2 \r\n-b +foo(\R)bar foo\rbar y $1 \r +(\V)(\R) foo\rbar y $1-$2 o-\r +(\R)(\V) foo\rbar y $1-$2 \r-b + +foo(\v+)bar foo\r\n\x{85}\r\n\nbar y $1 \r\n\x{85}\r\n\n +(\V+)(\v) 
foo\r\n\x{85}\r\n\nbar y $1-$2 foo-\r +(\v+)(\V) foo\r\n\x{85}\r\n\nbar y $1-$2 \r\n\x{85}\r\n\n-b +foo(\v)bar foo\x{85}bar y $1 \x{85} +(\V)(\v) foo\x{85}bar y $1-$2 o-\x{85} +(\v)(\V) foo\x{85}bar y $1-$2 \x{85}-b +foo(\v)bar foo\rbar y $1 \r +(\V)(\v) foo\rbar y $1-$2 o-\r +(\v)(\V) foo\rbar y $1-$2 \r-b + + +foo(\h+)bar foo\t\x{A0}bar y $1 \t\x{A0} +(\H+)(\h) foo\t\x{A0}bar y $1-$2 foo-\t +(\h+)(\H) foo\t\x{A0}bar y $1-$2 \t\x{A0}-b +foo(\h)bar foo\x{A0}bar y $1 \x{A0} +(\H)(\h) foo\x{A0}bar y $1-$2 o-\x{A0} +(\h)(\H) foo\x{A0}bar y $1-$2 \x{A0}-b +foo(\h)bar foo\tbar y $1 \t +(\H)(\h) foo\tbar y $1-$2 o-\t +(\h)(\H) foo\tbar y $1-$2 \t-b diff --git a/t/op/regexp.t b/t/op/regexp.t index 919a239..7ad7d89 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -127,6 +127,9 @@ EOFCODE \$got = "$repl"; EOFCODE } + #$code.=qq[\n\$expect="$expect";\n]; + #use Devel::Peek; + #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; { # Probably we should annotate specific tests with which warnings # categories they're known to trigger, and hence should be