From: Jeffrey Friedl Date: Sun, 16 Jul 2000 17:55:29 +0000 (-0700) Subject: Add [[:blank:]] as suggested in X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=aaa51d5e11b8b0db616a7f939c784733b4cfef87;p=p5sagit%2Fp5-mst-13.2.git Add [[:blank:]] as suggested in Subject: [ID 20000716.024] [=cc=] / [:blank:] Message-Id: <200007170055.RAA23528@fummy.dsl.yahoo.com> (the [=cc=] has already been taken care of by #6439 so the whole bug report can be closed) and make [[:space:]] to be equivalent to isspace(3) (as opposed to \s, which is isSPACE()). The difference is that now [[:space:]] matches the mythical vertical tab, while \s doesn't. p4raw-id: //depot/perl@6703 --- diff --git a/handy.h b/handy.h index 9e6f223..d82b1c6 100644 --- a/handy.h +++ b/handy.h @@ -296,6 +296,8 @@ Converts the specified character to lowercase. #define isALPHA(c) (isUPPER(c) || isLOWER(c)) #define isSPACE(c) \ ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) =='\r' || (c) == '\f') +#define isPSXSPC(c) (isSPACE(c) || (c) == '\v') +#define isBLANK(c) ((c) == ' ' || (c) == '\t') #define isDIGIT(c) ((c) >= '0' && (c) <= '9') #ifdef EBCDIC /* In EBCDIC we do not do locales: therefore() isupper() is fine. */ @@ -382,6 +384,9 @@ Converts the specified character to lowercase. # endif #endif /* USE_NEXT_CTYPE */ +#define isPSXSPC_LC(c) (isSPACE_LC(c) || (c) == '\v') +#define isBLANK_LC(c) isBLANK(c) /* could be wrong */ + #define isALNUM_uni(c) is_uni_alnum(c) #define isIDFIRST_uni(c) is_uni_idfirst(c) #define isALPHA_uni(c) is_uni_alpha(c) @@ -400,6 +405,9 @@ Converts the specified character to lowercase. #define toTITLE_uni(c) to_uni_title(c) #define toLOWER_uni(c) to_uni_lower(c) +#define isPSXSPC_uni(c) (isSPACE_uni(c) ||(c) == '\f') +#define isBLANK_uni(c) isBLANK(c) /* could be wrong */ + #define isALNUM_LC_uni(c) (c < 256 ? isALNUM_LC(c) : is_uni_alnum_lc(c)) #define isIDFIRST_LC_uni(c) (c < 256 ? isIDFIRST_LC(c) : is_uni_idfirst_lc(c)) #define isALPHA_LC_uni(c) (c < 256 ? 
isALPHA_LC(c) : is_uni_alpha_lc(c)) @@ -416,6 +424,9 @@ Converts the specified character to lowercase. #define toTITLE_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c)) #define toLOWER_LC_uni(c) (c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c)) +#define isPSXSPC_LC_uni(c) (isSPACE_LC_uni(c) ||(c) == '\f') +#define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */ + #define isALNUM_utf8(p) is_utf8_alnum(p) #define isIDFIRST_utf8(p) is_utf8_idfirst(p) #define isALPHA_utf8(p) is_utf8_alpha(p) @@ -434,6 +445,9 @@ Converts the specified character to lowercase. #define toTITLE_utf8(p) to_utf8_title(p) #define toLOWER_utf8(p) to_utf8_lower(p) +#define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f') +#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */ + #define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv(p, 0)) #define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv(p, 0)) #define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv(p, 0)) @@ -450,6 +464,9 @@ Converts the specified character to lowercase. #define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, 0)) #define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, 0)) +#define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f') +#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */ + #ifdef EBCDIC EXT int ebcdic_control (int); # define toCTRL(c) ebcdic_control(c) diff --git a/pod/perlre.pod b/pod/perlre.pod index c964be8..fa4aad2 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -199,20 +199,26 @@ equivalents (if available) are as follows: alpha alnum ascii + blank [1] cntrl digit \d graph lower print punct - space \s + space \s [2] upper - word \w + word \w [3] xdigit + [1] A GNU extension equivalent to C<[ \t]>, `all horizontal whitespace'. + [2] Not I<exactly equivalent> to C<\s> since the C<[[:space:]]> includes + also the (very rare) `vertical tabulator', "\ck", chr(11). + [3] A Perl extension. + For example use C<[:upper:]> to match all the uppercase characters.
-Note that the C<[]> are part of the C<[::]> construct, not part of the whole -character class. For example: +Note that the C<[]> are part of the C<[::]> construct, not part of the +whole character class. For example: [01[:alpha:]%] @@ -224,6 +230,7 @@ If the C<utf8> pragma is used, the following equivalences to Unicode alpha IsAlpha alnum IsAlnum ascii IsASCII + blank IsSpace cntrl IsCntrl digit IsDigit graph IsGraph @@ -238,8 +245,8 @@ If the C<utf8> pragma is used, the following equivalences to Unicode For example C<[:lower:]> and C<\p{IsLower}> are equivalent. If the C<utf8> pragma is not used but the C<locale> pragma is, the -classes correlate with the isalpha(3) interface (except for `word', -which is a Perl extension, mirroring C<\w>). +classes correlate with the usual isalpha(3) interface (except for +`word' and `blank'). The assumedly non-obviously named classes are: diff --git a/pod/perlretut.pod b/pod/perlretut.pod index 66f8179..87669e5 100644 --- a/pod/perlretut.pod +++ b/pod/perlretut.pod @@ -1672,15 +1672,17 @@ i.e., a non-mark followed by one or more marks. As if all those classes weren't enough, Perl also defines POSIX style character classes. These have the form C<[:name:]>, with C<name> the -name of the POSIX class. The POSIX classes are alpha, alnum, ascii, -cntrl, digit, graph, lower, print, punct, space, upper, word, and -xdigit. If C<utf8> is being used, then these classes are defined the -same as their corresponding perl Unicode classes: C<[:upper:]> is the -same as C<\p{IsUpper}>, etc. The POSIX character classes, however, -don't require using C<utf8>. The C<[:digit:]>, C<[:word:]>, and +name of the POSIX class. The POSIX classes are C<alpha>, C<alnum>, +C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>, +C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl +extension to match C<\w>), and C<blank> (a GNU extension). If C<utf8> +is being used, then these classes are defined the same as their +corresponding perl Unicode classes: C<[:upper:]> is the same as +C<\p{IsUpper}>, etc. The POSIX character classes, however, don't +require using C<utf8>.
The C<[:digit:]>, C<[:word:]>, and C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s> -character classes. To negate a POSIX class, put a C<^> in front of the -name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under +character classes. To negate a POSIX class, put a C<^> in front of +the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under C<utf8>, C<\P{IsDigit}>. The Unicode and POSIX character classes can be used just like C<\d>, both inside and outside of character classes: diff --git a/regcomp.c b/regcomp.c index f0b7c5c..d2195b0 100644 --- a/regcomp.c +++ b/regcomp.c @@ -443,7 +443,7 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) { int value; - for (value = 0; value < ANYOF_MAX; value += 2) + for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; for (value = 0; value < 256; ++value) @@ -3004,6 +3004,11 @@ S_regpposixcc(pTHX_ I32 value) namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; break; + case 'b': + if (strnEQ(posixcc, "blank", 5)) + namedclass = + complement ? ANYOF_NBLANK : ANYOF_BLANK; + break; case 'c': if (strnEQ(posixcc, "cntrl", 5)) namedclass = @@ -3035,7 +3040,7 @@ S_regpposixcc(pTHX_ I32 value) case 's': if (strnEQ(posixcc, "space", 5)) namedclass = - complement ? ANYOF_NSPACE : ANYOF_SPACE; + complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; break; case 'u': if (strnEQ(posixcc, "upper", 5)) @@ -3160,7 +3165,7 @@ S_regclass(pTHX) else if (value == '\\') { value = UCHARAT(PL_regcomp_parse++); /* Some compilers cannot handle switching on 64-bit integer - * values, therefore value cannot be an UV. --jhi */ + * values, therefore the 'value' cannot be an UV.
--jhi */ switch (value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; @@ -3339,6 +3344,24 @@ S_regclass(pTHX) #endif /* EBCDIC */ } break; + case ANYOF_BLANK: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_BLANK); + else { + for (value = 0; value < 256; value++) + if (isBLANK(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; + case ANYOF_NBLANK: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NBLANK); + else { + for (value = 0; value < 256; value++) + if (!isBLANK(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; case ANYOF_CNTRL: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_CNTRL); @@ -3412,6 +3435,24 @@ S_regclass(pTHX) ANYOF_BITMAP_SET(ret, value); } break; + case ANYOF_PSXSPC: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_PSXSPC); + else { + for (value = 0; value < 256; value++) + if (isPSXSPC(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; + case ANYOF_NPSXSPC: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC); + else { + for (value = 0; value < 256; value++) + if (!isPSXSPC(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; case ANYOF_PUNCT: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_PUNCT); @@ -3739,8 +3780,12 @@ S_regclassutf8(pTHX) case ANYOF_NPUNCT: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; case ANYOF_SPACE: + case ANYOF_PSXSPC: + case ANYOF_BLANK: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; case ANYOF_NSPACE: + case ANYOF_NPSXSPC: + case ANYOF_NBLANK: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_UPPER: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; @@ -4193,7 +4238,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) else if (k == ANYOF) { int i, rangestart = -1; const char * const out[] = { /* Should be syncronized with - a table in regcomp.h */ + ANYOF_ #xdefines in regcomp.h */ "\\w", "\\W", "\\s", @@ -4217,9 +4262,13 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) "[:punct:]", "[:^punct:]", "[:upper:]", - "[:!upper:]", + "[:^upper:]", "[:xdigit:]", - "[:^xdigit:]" + "[:^xdigit:]", +
"[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" }; if (o->flags & ANYOF_LOCALE) diff --git a/regcomp.h b/regcomp.h index 34c5c25..e30e67f 100644 --- a/regcomp.h +++ b/regcomp.h @@ -194,7 +194,7 @@ struct regnode_charclass_class { #define ANYOF_ALNUM 0 /* \w, utf8::IsWord, isALNUM() */ #define ANYOF_NALNUM 1 -#define ANYOF_SPACE 2 +#define ANYOF_SPACE 2 /* \s */ #define ANYOF_NSPACE 3 #define ANYOF_DIGIT 4 #define ANYOF_NDIGIT 5 @@ -218,8 +218,12 @@ struct regnode_charclass_class { #define ANYOF_NUPPER 23 #define ANYOF_XDIGIT 24 #define ANYOF_NXDIGIT 25 +#define ANYOF_PSXSPC 26 /* POSIX space: \s plus the vertical tab */ +#define ANYOF_NPSXSPC 27 +#define ANYOF_BLANK 28 +#define ANYOF_NBLANK 29 /* GNU extension: space and tab */ -#define ANYOF_MAX 31 +#define ANYOF_MAX 32 /* Backward source code compatibility. */ diff --git a/regexec.c b/regexec.c index cbc8c19..2004cc4 100644 --- a/regexec.c +++ b/regexec.c @@ -3625,7 +3625,11 @@ S_reginclass(pTHX_ register regnode *p, register I32 c) (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) || (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) || (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) + (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC) && isPSXSPC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_BLANK) && isBLANK(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NBLANK) && !isBLANK(c)) ) /* How's that for a conditional? */ { match = TRUE; diff --git a/t/op/pat.t b/t/op/pat.t index 91c4b7d..2ba6d93 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4,7 +4,7 @@ # the format supported by op/regexp.t. If you want to add a test # that does fit that format, add it to op/re_tests, not here. -print "1..220\n"; +print "1..223\n"; BEGIN { chdir 't' if -d 't'; @@ -1058,3 +1058,29 @@ $w = 0; } print $w ?
"not " : "", "ok $test\n"; $test++; + +my %space = ( spc => " ", + tab => "\t", + cr => "\r", + lf => "\n", + ff => "\f", +# The vertical tabulator seems miraculously to be 11 in both ASCII and EBCDIC. + vt => chr(11), + false => "space" ); + +my @space0 = sort grep { $space{$_} =~ /\s/ } keys %space; +my @space1 = sort grep { $space{$_} =~ /[[:space:]]/ } keys %space; +my @space2 = sort grep { $space{$_} =~ /[[:blank:]]/ } keys %space; + +print "not " unless "@space0" eq "cr ff lf spc tab"; +print "ok $test\n"; +$test++; + +print "not " unless "@space1" eq "cr ff lf spc tab vt"; +print "ok $test\n"; +$test++; + +print "not " unless "@space2" eq "spc tab"; +print "ok $test\n"; +$test++; +