From: Jarkko Hietaniemi Date: Mon, 1 Oct 2001 02:02:08 +0000 (+0000) Subject: Further tweaks to the Unicode properties. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4193bef7c285e086ba2845c76259534be73b65fa;p=p5sagit%2Fp5-mst-13.2.git Further tweaks to the Unicode properties. p4raw-id: //depot/perl@12286 --- diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index 3e145de..025a70b 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -53,6 +53,7 @@ sub SWASHNEW { { $list ||= do "$file.pl" + || do "unicore/Is/$type.pl" || croak("Can't find Unicode character property \"$type\""); } diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 4864909..4d6be20 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -169,17 +169,27 @@ character with the Unicode uppercase property, while C<\p{M}> matches any mark character. Single letter properties may omit the brackets, so that can be written C<\pM> also. Many predefined character classes are available, such as C<\p{IsMirrored}> and C<\p{InTibetan}>. -The recommended naming convention of the C<In> classes are the -official Unicode script and block names, but with all non-alphanumeric -characters removed, for example the block name C<"Latin-1 Supplement"> -becomes C<\p{InLatin1Supplement}>. Perl will ignore the case of -letters, and any space or dash can be a space, dash, underbar, or be -missing altogether, so C<\p{ in latin 1 supplement }> will work, too. + +The C<\p{Is...}> test for "general properties" such as "letter", +"digit", while the C<\p{In...}> test for Unicode scripts and blocks. + +The official Unicode script and block names have spaces and +dashes as separators, but for convenience you can have +dashes, spaces, and underbars at every word division, and +you need not care about correct casing. 
It is recommended, +however, that for consistency you use the following naming: +the official Unicode script or block name (see below for +the additional rules that apply to block names), with the whitespace +and dashes removed, and the words "uppercase-first-lowercase-otherwise". +That is, "Latin-1 Supplement" becomes "Latin1Supplement". + You can also negate both C<\p{}> and C<\P{}> by introducing a caret (^) between the first curly and the property name: C<\p{^InTamil}> is -equal to C<\P{Tamil}>. +equal to C<\P{InTamil}>. + +The C<In> can be left out: C<\p{Greek}> is equal to C<\p{InGreek}>. -Here is the list as of Unicode 3.1.0 (the two-letter classes) and +Here is the list as of Unicode 3.1.1 (the two-letter classes) and as defined by Perl (the one-letter classes) (in Unicode materials what Perl calls C<L> is often called C<L&>): diff --git a/regcomp.c b/regcomp.c index dda273d..96bafd3 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2881,7 +2881,7 @@ tryagain: if (!RExC_end) { RExC_parse += 2; RExC_end = oldregxend; - vFAIL("Missing right brace on \\p{}"); + vFAIL2("Missing right brace on \\%c{}", UCHARAT(RExC_parse - 2)); } RExC_end++; } @@ -3085,7 +3085,7 @@ tryagain: /* FALL THROUGH */ default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) - vWARN2(p +1, "Unrecognized escape \\%c passed through", *p); + vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); goto normal_default; } break; diff --git a/t/op/pat.t b/t/op/pat.t index a3f6522..fa4d1b3 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..717\n"; +print "1..722\n"; BEGIN { chdir 't' if -d 't'; @@ -2127,12 +2127,42 @@ sub ok ($$) { print "ok 715\n"; } +print "# some Unicode properties\n"; + { + # Dashes, underbars, case. print "not " unless "\x80" =~ /\p{in-latin1_SUPPLEMENT}/; print "ok 716\n"; } { + # Complement, leading and trailing whitespace. print "not " unless "\x80" =~ /\P{ ^ In Latin 1 Supplement }/; print "ok 717\n"; } + +{ + # No ^In, dashes, case. 
+ print "not " unless "\x80" =~ /\p{latin-1-supplement}/; + print "ok 718\n"; +} + +{ + print "not " unless "a" =~ /\pL/; + print "ok 719\n"; +} + +{ + print "not " unless "a" =~ /\p{IsLl}/; + print "ok 720\n"; +} + +{ + print "not " unless "A" =~ /\pL/; + print "ok 721\n"; +} + +{ + print "not " unless "A" =~ /\p{IsLu}/; + print "ok 722\n"; +}