create new unicode props as defined in POSIX spec (optionally use them in the regex...

diff --git a/MANIFEST b/MANIFEST

index 8af2e5e..97705e2 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -4032,6 +4032,7 @@ t/op/read.t                       See if read() works
 t/op/recurse.t                 See if deep recursion works
 t/op/ref.t                     See if refs and objects work
 t/op/reg_email.t               See if regex recursion works by parsing email addresses
+t/op/reg_posixcc.t             See if posix characterclasses behave consistantly
 t/op/reg_email_thr.t           See if regex recursion works by parsing email addresses in another thread
 t/op/regexp_noamp.t            See if regular expressions work with optimizations
 t/op/regexp_notrie.t           See if regular expressions work without trie optimisation
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 64de8b1..242465d 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -779,6 +779,31 @@ sub simple_dumper {
 ##
 ## Process UnicodeData.txt (Categories, etc.)
 ##
+# These are the character mappings as defined in the POSIX standard
+# and in the case of PerlSpace and PerlWord as is defined in the test macros
+# for binary strings. IOW, PerlWord is [A-Za-z_] and PerlSpace is [\f\r\n\t ]
+# This differs from Word and the existing SpacePerl (note the prefix/suffix difference)
+# which is basically the Unicode WhiteSpace without the vertical tab included
+#
+my %TRUE_POSIX_PERL_CC= (
+    PosixAlnum => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x0061..0x007a )},
+    PosixAlpha => { map { $_ => 1 } ( 0x0041..0x005a, 0x0061..0x007a )},
+    # Not Needed: Ascii => { map { $_ => 1 } ( 0x0000..0x007f )},
+    PosixBlank => { map { $_ => 1 } ( 0x0009, 0x0020 )},
+    PosixCntrl => { map { $_ => 1 } ( 0x0000..0x001f, 0x007f )},
+    PosixGraph => { map { $_ => 1 } ( 0x0021..0x007e )},
+    PosixLower => { map { $_ => 1 } ( 0x0061..0x007a )},
+    PosixPrint => { map { $_ => 1 } ( 0x0020..0x007e )},
+    PosixPunct => { map { $_ => 1 } ( 0x0021..0x002f, 0x003a..0x0040, 0x005b..0x0060, 0x007b..0x007e )},
+    PosixSpace => { map { $_ => 1 } ( 0x0009..0x000d, 0x0020 )},
+    PosixUpper => { map { $_ => 1 } ( 0x0041..0x005a )},
+    # Not needed:  PosixXdigit => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x0046, 0x0061..0x0066 )},
+    PosixDigit => { map { $_ => 1 } ( 0x0030..0x0039 )},
+    
+    PerlSpace  => { map { $_ => 1 } ( 0x0009..0x000a, 0x000c..0x000d, 0x0020 )},
+    PerlWord   => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x005f, 0x0061..0x007a )},
+);
+
 sub UnicodeData_Txt()
 {
     my $Bidi     = Table->New();
@@ -795,7 +820,7 @@ sub UnicodeData_Txt()
     $DC{can} = Table->New();
     $DC{com} = Table->New();
 
-    ## Initialize Perl-generated categories
+    ## Initialize Broken Perl-generated categories
     ## (Categories from UnicodeData.txt are auto-initialized in gencat)
     $Cat{Alnum}  =
        Table->New(Is => 'Alnum',  Desc => "[[:Alnum:]]",  Fuzzy => 0);
@@ -839,6 +864,10 @@ sub UnicodeData_Txt()
     $To{Title} = Table->New();
     $To{Digit} = Table->New();
 
+    foreach my $cat (keys %TRUE_POSIX_PERL_CC) {
+        $Cat{$cat} = Table->New(Is=>$cat, Fuzzy => 0);
+    }
+
     sub gencat($$$$)
     {
         my ($name, ## Name ("LATIN CAPITAL LETTER A")
@@ -920,6 +949,13 @@ sub UnicodeData_Txt()
         $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39)  ## 0..9
                                  || ($code >= 0x41 && $code <= 0x46)  ## A..F
                                  || ($code >= 0x61 && $code <= 0x66); ## a..f
+        if ($code<=0x7F) {
+            foreach my $cat (keys %TRUE_POSIX_PERL_CC) {
+                if ($TRUE_POSIX_PERL_CC{$cat}{$code}) {
+                    $Cat{$cat}->$op($code);
+                }
+            }
+        }
     }
 
     ## open ane read file.....
diff --git a/regcomp.c b/regcomp.c

index 016e099..3ad1f3b 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -7804,6 +7804,22 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* 
+   We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+   so that it is possible to override the option here without having to 
+   rebuild the entire core. as we are required to do if we change regcomp.h
+   which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
 /*
    parse a class specification and produce either an ANYOF node that
    matches the pattern or if the pattern matches a single char only and
@@ -8092,18 +8108,24 @@ parseit:
                 * A similar issue a little earlier when switching on value.
                 * --jhi */
                switch ((I32)namedclass) {
+               
+               case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+               case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+               case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+               case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+               case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+               case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+               case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+               case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+               case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+               case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
                case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
-               case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
-               case _C_C_T_(BLANK, isBLANK(value), "Blank");
-               case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
-               case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
-               case _C_C_T_(LOWER, isLOWER(value), "Lower");
-               case _C_C_T_(PRINT, isPRINT(value), "Print");
-               case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
-               case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
                case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
-               case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif         
                case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
                case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
@@ -8150,7 +8172,7 @@ parseit:
                            ANYOF_BITMAP_SET(ret, value);
                    }
                    yesno = '+';
-                   what = "Digit";
+                   what = POSIX_CC_UNI_NAME("Digit");
                    break;
                case ANYOF_NDIGIT:
                    if (LOC)
@@ -8163,7 +8185,7 @@ parseit:
                            ANYOF_BITMAP_SET(ret, value);
                    }
                    yesno = '!';
-                   what = "Digit";
+                   what = POSIX_CC_UNI_NAME("Digit");
                    break;              
                case ANYOF_MAX:
                    /* this is to handle \p and \P */
diff --git a/regcomp.h b/regcomp.h

index 1664871..2ac1be1 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -18,6 +18,24 @@ typedef OP OP_4tree;                 /* Will be redefined later. */
 /* Be really agressive about optimising patterns with trie sequences? */
 #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1
 
+/* Use old style unicode mappings for perl and posix character classes
+ *
+ * NOTE: Enabling this essentially breaks character class matching against unicode 
+ * strings, so that POSIX char classes match when they shouldn't, and \d matches 
+ * way more than 10 characters, and sometimes a charclass and its complement either
+ * both match or neither match.
+ * NOTE: Disabling this will cause various backwards compatibility issues to rear 
+ * their head, and tests to fail. However it will make the charclass behaviour 
+ * consistant regardless of internal string type, and make character class inversions
+ * consistant. The tests that fail in the regex engine are basically broken tests.
+ *
+ * Personally I think 5.12 should disable this for sure. Its a bit more debatable for
+ * 5.10, so for now im leaving it enabled.
+ *
+ * -demerphq
+ */
+#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1
+
 /* Should the optimiser take positive assertions into account? */
 #define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0
 
diff --git a/t/op/pat.t b/t/op/pat.t

index a2d49ac..798a3da 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4617,6 +4617,9 @@ sub kt
 }
 
 SKIP: {
+    # XXX: This set of tests is essentially broken, POSIX character classes
+    # should not have differing definitions under unicode. 
+    # There are property names for that.
     unless ($ordA == 65) { skip("Assumes ASCII", 4) }
 
     my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/}
diff --git a/t/op/reg_posixcc.t b/t/op/reg_posixcc.t

new file mode 100644 (file)

index 0000000..7335399
--- /dev/null
+++ b/t/op/reg_posixcc.t
@@ -0,0 +1,127 @@
+#!perl
+
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+}
+
+use strict;
+use warnings;
+use Test::More tests => 1;
+my @pats=(
+            "\\w",
+           "\\W",
+           "\\s",
+           "\\S",
+           "\\d",
+           "\\D",
+           "[:alnum:]",
+           "[:^alnum:]",
+           "[:alpha:]",
+           "[:^alpha:]",
+           "[:ascii:]",
+           "[:^ascii:]",
+           "[:cntrl:]",
+           "[:^cntrl:]",
+           "[:graph:]",
+           "[:^graph:]",
+           "[:lower:]",
+           "[:^lower:]",
+           "[:print:]",
+           "[:^print:]",
+           "[:punct:]",
+           "[:^punct:]",
+           "[:upper:]",
+           "[:^upper:]",
+           "[:xdigit:]",
+           "[:^xdigit:]",
+           "[:space:]",
+           "[:^space:]",
+           "[:blank:]",
+           "[:^blank:]" );
+sub rangify {
+    my $ary= shift;
+    my $fmt= shift || '%d';
+    my $sep= shift || ' ';
+    my $rng= shift || '..';
+    
+    
+    my $first= $ary->[0];
+    my $last= $ary->[0];
+    my $ret= sprintf $fmt, $first;
+    for my $idx (1..$#$ary) {
+        if ( $ary->[$idx] != $last + 1) {
+            if ($last!=$first) {
+                $ret.=sprintf "%s$fmt",$rng, $last;
+            }             
+            $first= $last= $ary->[$idx];
+            $ret.=sprintf "%s$fmt",$sep,$first;
+         } else {
+            $last= $ary->[$idx];
+         }
+    }
+    if ( $last != $first) {
+        $ret.=sprintf "%s$fmt",$rng, $last;
+    }
+    return $ret;
+}
+
+my $description = "";
+while (@pats) {
+    my ($yes,$no)= splice @pats,0,2;
+    
+    my %err_by_type;
+    my %singles;
+    foreach my $b (0..255) {
+        my %got;
+        for my $type ('unicode','not-unicode') {
+            my $str=chr($b).chr($b);
+            if ($type eq 'unicode') {
+                $str.=chr(256);
+                chop $str;
+            }
+            if ($str=~/[$yes][$no]/) {
+                push @{$err_by_type{$type}},$b;
+            }
+            $got{"[$yes]"}{$type} = $str=~/[$yes]/ ? 1 : 0;
+            $got{"[$no]"}{$type} = $str=~/[$no]/ ? 1 : 0;
+            $got{"[^$yes]"}{$type} = $str=~/[^$yes]/ ? 1 : 0;
+            $got{"[^$no]"}{$type} = $str=~/[^$no]/ ? 1 : 0;
+        }
+        foreach my $which ("[$yes]","[$no]","[^$yes]","[^$no]") {
+            if ($got{$which}{'unicode'} != $got{$which}{'not-unicode'}) {
+                push @{$singles{$which}},$b;
+            }
+        }
+    }
+    
+    
+    if (%err_by_type || %singles) {
+        $description||=" Error:\n";
+        $description .= "/[$yes][$no]/\n";
+        if (%err_by_type) {
+            foreach my $type (keys %err_by_type) {
+                $description .= "\tmatches $type codepoints:\t";
+                $description .= rangify($err_by_type{$type});
+                $description .= "\n";
+            }
+            $description .= "\n";
+        }
+        if (%singles) {
+            $description .= "Unicode/Nonunicode mismatches:\n";
+            foreach my $type (keys %singles) {
+                $description .= "\t$type:\t";
+                $description .= rangify($singles{$type});
+                $description .= "\n";
+            }
+            $description .= "\n";
+        }
+     
+    }
+    
+}
+TODO: {
+    local $TODO = "Only works under PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS = 0";
+    is( $description, "", "POSIX and perl charclasses should not depend on string type");
+};
+__DATA__
MANIFEST		patch \| blob \| blame \| history
lib/unicore/mktables		patch \| blob \| blame \| history
regcomp.c		patch \| blob \| blame \| history
regcomp.h		patch \| blob \| blame \| history
t/op/pat.t		patch \| blob \| blame \| history
t/op/reg_posixcc.t	[new file with mode: 0644]	patch \| blob