Get the three different space character classes right under utf8.
Jarkko Hietaniemi [Fri, 1 Dec 2000 06:45:53 +0000 (06:45 +0000)]
p4raw-id: //depot/perl@7940

MANIFEST
lib/unicode/Is/Blank.pl [new file with mode: 0644]
lib/unicode/Is/SpacePerl.pl [new file with mode: 0644]
lib/unicode/mktables.PL
pod/perlre.pod
regcomp.c
regexec.c
t/op/pat.t
utf8.c

index 4607250..7da209e 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -922,6 +922,7 @@ lib/unicode/Is/BidiRLE.pl                   Unicode character database
 lib/unicode/Is/BidiRLO.pl                      Unicode character database
 lib/unicode/Is/BidiS.pl                                Unicode character database
 lib/unicode/Is/BidiWS.pl                       Unicode character database
+lib/unicode/Is/Blank.pl                                Unicode character database
 lib/unicode/Is/C.pl                            Unicode character database
 lib/unicode/Is/Cc.pl                           Unicode character database
 lib/unicode/Is/Cf.pl                           Unicode character database
@@ -1010,6 +1011,7 @@ lib/unicode/Is/Sk.pl                              Unicode character database
 lib/unicode/Is/Sm.pl                           Unicode character database
 lib/unicode/Is/So.pl                           Unicode character database
 lib/unicode/Is/Space.pl                                Unicode character database
+lib/unicode/Is/SpacePerl.pl                    Unicode character database
 lib/unicode/Is/SylA.pl                         Unicode character database
 lib/unicode/Is/SylAA.pl                                Unicode character database
 lib/unicode/Is/SylAAI.pl                       Unicode character database
diff --git a/lib/unicode/Is/Blank.pl b/lib/unicode/Is/Blank.pl
new file mode 100644 (file)
index 0000000..8642921
--- /dev/null
@@ -0,0 +1,12 @@
+# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
+# This file is built by mktables.PL from e.g. Unicode.301.
+# Any changes made here will be lost!
+return <<'END';
+0009   
+0020   
+00a0   
+1680   
+2000   200b
+202f   
+3000   
+END
diff --git a/lib/unicode/Is/SpacePerl.pl b/lib/unicode/Is/SpacePerl.pl
new file mode 100644 (file)
index 0000000..2bb74de
--- /dev/null
@@ -0,0 +1,14 @@
+# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
+# This file is built by mktables.PL from e.g. Unicode.301.
+# Any changes made here will be lost!
+return <<'END';
+0009   000a
+000c   000d
+0020   
+00a0   
+1680   
+2000   200b
+2028   2029
+202f   
+3000   
+END
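diff --git a/lib/unicode/mktables.PL b/lib/unicode/mktables.PL
index d8b57b6..82b35ef 100755 (executable)
--- a/lib/unicode/mktables.PL
+++ b/lib/unicode/mktables.PL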
@@ -25,8 +25,13 @@ mkdir "To", 0755;
     # 000B: VERTICAL TABULATION
     # 000C: FORM FEED
     # 000D: CARRIAGE RETURN
+    # 0020: SPACE
     ['IsSpace',  '$cat  =~ /^Z/ ||
                   $code =~ /^(0009|000A|000B|000C|000D)$/',    ''],
+    ['IsSpacePerl',
+                 '$cat  =~ /^Z/ ||
+                  $code =~ /^(0009|000A|000C|000D)$/',         ''],
+    ['IsBlank',  '$cat  =~ /^Z[^lp]$/ ||  $code eq "0009"',    ''],
     ['IsDigit',  '$cat =~ /^Nd$/',     ''],
     ['IsUpper',  '$cat =~ /^L[ut]$/',  ''],
     ['IsLower',  '$cat =~ /^Ll$/',     ''],
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 182f5bd..c5ecb13 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -225,19 +225,21 @@ whole character class.  For example:
 matches zero, one, any alphabetic character, and the percentage sign.
 
 If the C<utf8> pragma is used, the following equivalences to Unicode
-\p{} constructs hold:
+\p{} constructs and equivalent backslash character classes (if available),
+will hold:
 
     alpha       IsAlpha
     alnum       IsAlnum
     ascii       IsASCII
     blank      IsSpace
     cntrl       IsCntrl
-    digit       IsDigit
+    digit       IsDigit        \d
     graph       IsGraph
     lower       IsLower
     print       IsPrint
     punct       IsPunct
     space       IsSpace
+                IsSpacePerl    \s
     upper       IsUpper
     word        IsWord
     xdigit      IsXDigit
index 784e83e..3b4f481 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3705,7 +3705,7 @@ S_regclassutf8(pTHX_ RExC_state_t *pRExC_state)
            flags |= ANYOF_FOLD;
        if (LOC)
            flags |= ANYOF_LOCALE;
-       listsv = newSVpvn("# comment\n",10);
+       listsv = newSVpvn("# comment\n", 10);
     }
 
     if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
@@ -3868,15 +3868,16 @@ S_regclassutf8(pTHX_ RExC_state_t *pRExC_state)
                case ANYOF_NPUNCT:
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");   break;
                case ANYOF_SPACE:
-               case ANYOF_PSXSPC:
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break;
+               case ANYOF_NSPACE:
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break;
                case ANYOF_BLANK:
-                   /* Not very true for PSXSPC and BLANK
-                    * but not feeling like creating IsPOSIXSpace and
-                    * IsBlank right now. --jhi */
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");   break;
+               case ANYOF_NBLANK:
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");   break;
+               case ANYOF_PSXSPC:
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");   break;
-               case ANYOF_NSPACE:
                case ANYOF_NPSXSPC:
-               case ANYOF_NBLANK:
                    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");   break;
                case ANYOF_UPPER:
                    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");   break;
index 018c6c8..18c06d5 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -3773,7 +3773,7 @@ S_reginclass(pTHX_ register regnode *p, register I32 c)
            (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
            (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
            (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_BLANK)   &&  isBLANK(c))    ||
+           (ANYOF_CLASS_TEST(p, ANYOF_BLANK)   &&  isBLANK(c))     ||
            (ANYOF_CLASS_TEST(p, ANYOF_NBLANK)  && !isBLANK(c))
            ) /* How's that for a conditional? */
        {
diff --git a/t/op/pat.t b/t/op/pat.t
index 8c3638c..aaec39d 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -1107,15 +1107,15 @@ my @space1 = sort grep { $space{$_} =~ /[[:space:]]/ } keys %space;
 my @space2 = sort grep { $space{$_} =~ /[[:blank:]]/ } keys %space;
 
 print "not " unless "@space0" eq "cr ff lf spc tab";
-print "ok $test\n";
+print "ok $test # @space0\n";
 $test++;
 
 print "not " unless "@space1" eq "cr ff lf spc tab vt";
-print "ok $test\n";
+print "ok $test # @space1\n";
 $test++;
 
 print "not " unless "@space2" eq "spc tab";
-print "ok $test\n";
+print "ok $test # @space2\n";
 $test++;
  
 # bugid 20001021.005 - this caused a SEGV
diff --git a/utf8.c b/utf8.c
index e313258..9e943ac 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -899,7 +899,7 @@ Perl_is_utf8_space(pTHX_ U8 *p)
     if (!is_utf8_char(p))
        return FALSE;
     if (!PL_utf8_space)
-       PL_utf8_space = swash_init("utf8", "IsSpace", &PL_sv_undef, 0, 0);
+       PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
     return swash_fetch(PL_utf8_space, p);
 }