PATCH: deprecation warnings for unreasonable charnames
Karl Williamson [Sat, 20 Feb 2010 06:53:36 +0000 (23:53 -0700)]
Prior to now just about anything has been legal for a character name in
\N{...}.  This means that legal code was broken by having \N{3,4} for
example mean [^\n]{3,4}.  Such code doesn't come from standard
charnames, but from legal custom translators.

This patch deprecates "unreasonable" names.  handy.h is changed by the
addition of macros that taken together define the names we deem
reasonable,  namely alpha beginning with alphanumerics and some
punctuations as continuations.

toke.c is changed to parse each name and to raise a warning if any
problematic characters are found.

Some tests and diagnostic documentation are also included.

handy.h
pod/perldiag.pod
t/re/pat_advanced.t
toke.c

diff --git a/handy.h b/handy.h
index 911deca..d12972d 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -462,6 +462,18 @@ US-ASCII (Basic Latin) range are viewed as not having any case.
 #define isALNUM(c)     (isALPHA(c) || isDIGIT(c) || (c) == '_')
 #define isIDFIRST(c)   (isALPHA(c) || (c) == '_')
 #define isALPHA(c)     (isUPPER(c) || isLOWER(c))
+/* ALPHAU includes Unicode semantics for latin1 characters.  It has an extra
+ * >= AA test to speed up ASCII-only tests at the expense of the others */
+#define isALPHAU(c)    (isALPHA(c) || (NATIVE_TO_UNI((U8) c) >= 0xAA \
+    && ((NATIVE_TO_UNI((U8) c) >= 0xC0 \
+           && NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \
+       || NATIVE_TO_UNI((U8) c) == 0xAA \
+       || NATIVE_TO_UNI((U8) c) == 0xB5 \
+       || NATIVE_TO_UNI((U8) c) == 0xBA)))
+#define isALNUMU(c)    (isDIGIT(c) || isALPHAU(c) || (c) == '_')
+
+/* continuation character for legal NAME in \N{NAME} */
+#define isCHARNAME_CONT(c) (isALNUMU(c) || (c) == ' ' || (c) == '-' || (c) == '(' || (c) == ')' || (c) == ':' || NATIVE_TO_UNI((U8) c) == 0xA0)
 #define isSPACE(c) \
        ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) =='\r' || (c) == '\f')
 #define isPSXSPC(c)    (isSPACE(c) || (c) == '\v')
index 95b45f7..e1f02db 100644 (file)
@@ -1457,6 +1457,14 @@ there are neither package declarations nor a C<$VERSION>.
 long for Perl to handle.  You have to be seriously twisted to write code
 that triggers this error.
 
+=item Deprecated character(s) in \\N{...} starting at '%s'
+
+(D deprecated) Just about anything is legal for the C<...> in C<\N{...}>.
+But starting in 5.12, non-reasonable ones that don't look like names are
+deprecated.  A reasonable name begins with an alphabetic character and
+continues with any combination of alphanumerics, dashes, spaces, parentheses or
+colons.
+
 =item Deprecated use of my() in false conditional
 
 (D deprecated) You used a declaration similar to C<my $x if 0>.
index 86735ec..881fd9e 100644 (file)
@@ -21,7 +21,7 @@ BEGIN {
 }
 
 
-plan tests => 1155;  # Update this when adding/deleting tests.
+plan tests => 1159;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1065,6 +1065,20 @@ sub run_tests {
         undef $w;
         eval 'q(syntax error) =~ /\N{MALFORMED}/';
         ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error';
+        undef $w;
+        eval 'q() =~ /\N{4F}/';
+        ok $w && $w =~ /Deprecated/, 'Verify that leading digit in name gives warning';
+        undef $w;
+        eval 'q() =~ /\N{COM,MA}/';
+        ok $w && $w =~ /Deprecated/, 'Verify that comma in name gives warning';
+        undef $w;
+        my $name = "A\x{D7}O";
+        eval "q(W) =~ /\\N{$name}/";
+        ok $w && $w =~ /Deprecated/, 'Verify that latin1 symbol in name gives warning';
+        undef $w;
+        $name = "A\x{D1}O";
+        eval "q(W) =~ /\\N{$name}/";
+        ok ! $w, 'Verify that latin1 letter in name doesnt give warning';
 
     }
 
diff --git a/toke.c b/toke.c
index 361d7d2..7167004 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3206,7 +3206,70 @@ S_scan_const(pTHX_ char *start)
                        d += len;
                    }
                    SvREFCNT_dec(res);
-               }
+
+                   /* Deprecate non-approved name syntax */
+                   if (ckWARN_d(WARN_DEPRECATED)) {
+                       bool problematic = FALSE;
+                       char* i = s;
+
+                       /* For non-ut8 input, look to see that the first
+                        * character is an alpha, then loop through the rest
+                        * checking that each is a continuation */
+                       if (! this_utf8) {
+                           if (! isALPHAU(*i)) problematic = TRUE;
+                           else for (i = s + 1; i < e; i++) {
+                               if (isCHARNAME_CONT(*i)) continue;
+                               problematic = TRUE;
+                               break;
+                           }
+                       }
+                       else {
+                           /* Similarly for utf8.  For invariants can check
+                            * directly.  We accept anything above the latin1
+                            * range because it is immaterial to Perl if it is
+                            * correct or not, and is expensive to check.  But
+                            * it is fairly easy in the latin1 range to convert
+                            * the variants into a single character and check
+                            * those */
+                           if (UTF8_IS_INVARIANT(*i)) {
+                               if (! isALPHAU(*i)) problematic = TRUE;
+                           } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
+                               if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i,
+                                                                           *(i+1)))))
+                               {
+                                   problematic = TRUE;
+                               }
+                           }
+                           if (! problematic) for (i = s + UTF8SKIP(s);
+                                                   i < e;
+                                                   i+= UTF8SKIP(i))
+                           {
+                               if (UTF8_IS_INVARIANT(*i)) {
+                                   if (isCHARNAME_CONT(*i)) continue;
+                               } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
+                                   continue;
+                               } else if (isCHARNAME_CONT(
+                                           UNI_TO_NATIVE(
+                                           UTF8_ACCUMULATE(*i, *(i+1)))))
+                               {
+                                   continue;
+                               }
+                               problematic = TRUE;
+                               break;
+                           }
+                       }
+                       if (problematic) {
+                           char *string;
+                           Newx(string, e - i + 1, char);
+                           Copy(i, string, e - i, char);
+                           string[e - i] = '\0';
+                           Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+                               "Deprecated character(s) in \\N{...} starting at '%s'",
+                               string);
+                           Safefree(string);
+                       }
+                   }
+               } /* End \N{NAME} */
 #ifdef EBCDIC
                if (!dorange) 
                    native_range = FALSE; /* \N{} is defined to be Unicode */