From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Mon, 1 Oct 2001 02:02:08 +0000 (+0000)
Subject: Further tweaks to the Unicode properties.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4193bef7c285e086ba2845c76259534be73b65fa;p=p5sagit%2Fp5-mst-13.2.git

Further tweaks to the Unicode properties.

p4raw-id: //depot/perl@12286
---

diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index 3e145de..025a70b 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -53,6 +53,7 @@ sub SWASHNEW {
 
     {
         $list ||= do "$file.pl"
+	      ||  do "unicore/Is/$type.pl"
 	      ||  croak("Can't find Unicode character property \"$type\"");
     }
 
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 4864909..4d6be20 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -169,17 +169,27 @@ character with the Unicode uppercase property, while C<\p{M}> matches
 any mark character.  Single letter properties may omit the brackets,
 so that can be written C<\pM> also.  Many predefined character classes
 are available, such as C<\p{IsMirrored}> and C<\p{InTibetan}>.
-The recommended naming convention of the C<In> classes are the
-official Unicode script and block names, but with all non-alphanumeric
-characters removed, for example the block name C<"Latin-1 Supplement">
-becomes C<\p{InLatin1Supplement}>.  Perl will ignore the case of
-letters, and any space or dash can be a space, dash, underbar, or be
-missing altogether, so C<\p{ in latin 1 supplement }> will work, too.
+
+The C<\p{Is...}> tests are for "general properties" such as "letter" or
+"digit", while the C<\p{In...}> tests are for Unicode scripts and blocks.
+
+The official Unicode script and block names have spaces and
+dashes as separators, but for convenience you can have
+dashes, spaces, and underbars at every word division, and
+you need not care about correct casing.  It is recommended,
+however, that for consistency you use the following naming:
+the official Unicode script or block name (see below for
+the additional rules that apply to block names), with the whitespace
+and dashes removed, and the words "uppercase-first-lowercase-otherwise".
+That is, "Latin-1 Supplement" becomes "Latin1Supplement".
+
 You can also negate both C<\p{}> and C<\P{}> by introducing a caret
 (^) between the first curly and the property name: C<\p{^InTamil}> is
-equal to C<\P{Tamil}>.
+equal to C<\P{InTamil}>.
+
+The C<In> can be left out: C<\p{Greek}> is equal to C<\p{InGreek}>.
 
-Here is the list as of Unicode 3.1.0 (the two-letter classes) and
+Here is the list as of Unicode 3.1.1 (the two-letter classes) and
 as defined by Perl (the one-letter classes) (in Unicode materials
 what Perl calls C<L> is often called C<L&>):
 
diff --git a/regcomp.c b/regcomp.c
index dda273d..96bafd3 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2881,7 +2881,7 @@ tryagain:
 		    if (!RExC_end) {
 			RExC_parse += 2;
 			RExC_end = oldregxend;
-			vFAIL("Missing right brace on \\p{}");
+			vFAIL2("Missing right brace on \\%c{}", UCHARAT(RExC_parse - 2));
 		    }
 		    RExC_end++;
 		}
@@ -3085,7 +3085,7 @@ tryagain:
 			/* FALL THROUGH */
 		    default:
 			if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
-			    vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
+			    vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
 			goto normal_default;
 		    }
 		    break;
diff --git a/t/op/pat.t b/t/op/pat.t
index a3f6522..fa4d1b3 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..717\n";
+print "1..722\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -2127,12 +2127,42 @@ sub ok ($$) {
     print "ok 715\n";
 }
 
+print "# some Unicode properties\n";
+
 {
+    # Dashes, underbars, case.
     print "not " unless "\x80" =~ /\p{in-latin1_SUPPLEMENT}/;
     print "ok 716\n";
 }
 
 {
+    # Complement, leading and trailing whitespace.
     print "not " unless "\x80" =~ /\P{  ^  In Latin 1 Supplement  }/;
     print "ok 717\n";
 }
+
+{
+    # No "In" prefix, dashes, case.
+    print "not " unless "\x80" =~ /\p{latin-1-supplement}/;
+    print "ok 718\n";
+}
+
+{
+    print "not " unless "a" =~ /\pL/;
+    print "ok 719\n";
+}
+
+{
+    print "not " unless "a" =~ /\p{IsLl}/;
+    print "ok 720\n";
+}
+
+{
+    print "not " unless "A" =~ /\pL/;
+    print "ok 721\n";
+}
+
+{
+    print "not " unless "A" =~ /\p{IsLu}/;
+    print "ok 722\n";
+}