From: Robin Barker Date: Fri, 25 Apr 2008 14:21:06 +0000 (+0100) Subject: another go; was RE: [perl #49302] [[:print:]] v \p{Print} X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=fdf0a293a88d8a14c42b43c2f82c991c50f7dc39;p=p5sagit%2Fp5-mst-13.2.git another go; was RE: [perl #49302] [[:print:]] v \p{Print} From: "Robin Barker" Message-ID: <46A0F33545E63740BC7563DE59CA9C6D093B12@exchsvr2.npl.ad.local> p4raw-id: //depot/perl@33752 --- diff --git a/pod/perlre.pod b/pod/perlre.pod index 04c7b8e..a076d3a 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -375,20 +375,60 @@ X X<\p> X<\p{}> digit IsDigit \d graph IsGraph lower IsLower - print IsPrint - punct IsPunct + print IsPrint (but see [2] below) + punct IsPunct (but see [3] below) space IsSpace IsSpacePerl \s upper IsUpper - word IsWord + word IsWord \w xdigit IsXDigit For example C<[[:lower:]]> and C<\p{IsLower}> are equivalent. +However, the equivalence between C<[[:xxxxx:]]> and C<\p{IsXxxxx}> +is not exact. + +=over 4 + +=item [1] + If the C<utf8> pragma is not used but the C<locale> pragma is, the classes correlate with the usual isalpha(3) interface (except for "word" and "blank"). +But if the C<locale> or C<encoding> pragmas are not used and +the string is not C<utf8>, then C<[[:xxxxx:]]> (and C<\w>, etc.) +will not match characters 0x80-0xff; whereas C<\p{IsXxxxx}> will +force the string to C<utf8> and can match these characters +(as Unicode). + +=item [2] + +C<\p{IsPrint}> matches characters 0x09-0x0d but C<[[:print:]]> does not. + +=item [3] + +C<[[:punct:]]> matches the following but C<\p{IsPunct}> does not, +because they are classed as symbols (not punctuation) in Unicode. 
+ +=over 4 + +=item C<$> + +Currency symbol + +=item C<+> C<< < >> C<=> C<< > >> C<|> C<~> + +Mathematical symbols + +=item C<^> C<`> + +Modifier symbols (accents) + +=back + +=back + The other named classes are: =over 4 diff --git a/t/op/pat.t b/t/op/pat.t index 82cf498..5ff4b92 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4604,6 +4604,32 @@ sub kt iseq($te[0], '../'); } +SKIP: { + if (ordA == 193) { skip("Assumes ASCII", 4) } + + my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/} + map {chr} 0x20..0x7f; + iseq( join('', @notIsPunct), '$+<=>^`|~', + '[:punct:] disagrees with IsPunct on Symbols'); + + my @isPrint = grep {not/[[:print:]]/ and /\p{IsPrint}/} + map {chr} 0..0x1f, 0x7f..0x9f; + iseq( join('', @isPrint), "\x09\x0a\x0b\x0c\x0d\x85", + 'IsPrint disagrees with [:print:] on control characters'); + + my @isPunct = grep {/[[:punct:]]/ != /\p{IsPunct}/} + map {chr} 0x80..0xff; + iseq( join('', @isPunct), "\xa1\xab\xb7\xbb\xbf", # ¡ « · » ¿ + 'IsPunct disagrees with [:punct:] outside ASCII'); + + my @isPunctLatin1 = eval q{ + use encoding 'latin1'; + grep {/[[:punct:]]/ != /\p{IsPunct}/} map {chr} 0x80..0xff; + }; + if( $@ ){ skip( $@, 1); } + iseq( join('', @isPunctLatin1), '', + 'IsPunct agrees with [:punct:] with explicit Latin1'); +} # Test counter is at bottom of file. Put new tests above here. @@ -4667,7 +4693,7 @@ iseq(0+$::test,$::TestCount,"Got the right number of tests!"); # Don't forget to update this! BEGIN { - $::TestCount = 4031; + $::TestCount = 4035; print "1..$::TestCount\n"; }