improvements for high-bit text literals (from Gisle Aas)

diff --git a/t/pragma/warn/doop b/t/pragma/warn/doop

index c16e24f..5803b44 100644 (file)
--- a/t/pragma/warn/doop
+++ b/t/pragma/warn/doop
@@ -1,29 +1,6 @@
-  doop.c       AOK
-
-  \x%s will produce malformed UTF-8 character; use \x{%s} for that
-
-
-__END__
 # doop.c
 use utf8 ;
 $_ = "\x80  \xff" ;
 chop ;
 EXPECT
 ########
-# doop.c
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Character codes differ on ebcdic machines.";
-        exit 0;
-    }
-}
-use warnings 'utf8'  ;
-use utf8 ;
-$_ = "\x80  \xff" ;
-chop ;
-no warnings 'utf8'  ;
-$_ = "\x80  \xff" ;
-chop ;
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10.
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
diff --git a/t/pragma/warn/pp b/t/pragma/warn/pp

index b392029..8f42ba6 100644 (file)
--- a/t/pragma/warn/pp
+++ b/t/pragma/warn/pp
@@ -28,11 +28,6 @@
   Constant subroutine %s undefined                     <<<TODO
   Constant subroutine (anonymous) undefined            <<<TODO
 
-  Mandatory Warnings
-  ------------------
-  Malformed UTF-8 character (not tested: difficult to produce with
-                             perl now)
-
 __END__
 # pp.c
 use warnings 'substr' ;
@@ -113,20 +108,3 @@ $_ = "\x80  \xff" ;
 reverse ;
 EXPECT
 ########
-# pp.c
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Character codes differ on ebcdic machines.";
-        exit 0;
-    }
-}
-use warnings 'utf8'  ;
-use utf8 ;
-$_ = "\x80  \xff" ;
-reverse ;
-no warnings 'utf8'  ;
-$_ = "\x80  \xff" ;
-reverse ;
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10.
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
diff --git a/t/pragma/warn/sv b/t/pragma/warn/sv

index 9a2428e..758137f 100644 (file)
--- a/t/pragma/warn/sv
+++ b/t/pragma/warn/sv
@@ -269,25 +269,6 @@ EXPECT
 Undefined value assigned to typeglob at - line 3.
 ########
 # sv.c
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# ebcdic \\x characters differ.";
-        exit 0;
-    }
-}
-use utf8 ;
-$^W =0 ;
-{
-  use warnings 'utf8' ;
-  my $a = rindex "a\xff bc ", "bc" ;
-  no warnings 'utf8' ;
-  $a = rindex "a\xff bc ", "bc" ;
-}
-my $a = rindex "a\xff bc ", "bc" ;
-EXPECT
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 12.
-########
-# sv.c
 use warnings 'y2k';
 use Config;
 BEGIN {
diff --git a/t/pragma/warn/toke b/t/pragma/warn/toke

index 271ef63..cfdea78 100644 (file)
--- a/t/pragma/warn/toke
+++ b/t/pragma/warn/toke
@@ -89,10 +89,6 @@ toke.c       AOK
        sub time {} 
        my $a = time()
 
-    \x%.*s will produce malformed UTF-8 character; use \x{%.*s} for that
-        use utf8 ; 
-       $_ = "\xffe"
-
     Unrecognized escape \\%c passed through
         $a = "\m" ;
 
@@ -447,21 +443,6 @@ EXPECT
 
 ########
 # toke.c
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
-        exit 0;
-    }
-}
-use warnings 'utf8' ;
-use utf8 ;
-$_ = " \xffe " ;
-no warnings 'utf8' ;
-$_ = " \xffe " ;
-EXPECT
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
-########
-# toke.c
 my $a = rand + 4 ;
 EXPECT
 Warning: Use of "rand" without parens is ambiguous at - line 2.
diff --git a/t/pragma/warn/utf8 b/t/pragma/warn/utf8

index cb1f202..6a2fe54 100644 (file)
--- a/t/pragma/warn/utf8
+++ b/t/pragma/warn/utf8
@@ -14,48 +14,16 @@
      <<<<<< Add a test when somethig actually calls utf16_to_utf8
 
 __END__
-# utf8.c [utf8_to_uv]
+# utf8.c [utf8_to_uv] -W
 use utf8 ;
-my $a = ord "\x80" ;
-EXPECT
-########
-# utf8.c [utf8_to_uv]
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
-        exit 0;
-    }
-}
-use utf8 ;
-my $a = ord "\x80" ;
+my $a = "snøstorm" ;
 {
-    use warnings 'utf8' ;
-    my $a = ord "\x80" ;
     no warnings 'utf8' ;
-    my $a = ord "\x80" ;
-}
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 12.
-########
-# utf8.c [utf8_to_uv]
-use utf8 ;
-my $a = ord "\xf080" ;
-EXPECT
-########
-# utf8.c [utf8_to_uv]
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
-        exit 0;
-    }
-}
-use utf8 ;
-my $a = ord "\xf080" ;
-{
+    my $a = "snøstorm";
     use warnings 'utf8' ;
-    my $a = ord "\xf080" ;
-    no warnings 'utf8' ;
-    my $a = ord "\xf080" ;
+    my $a = "snøstorm";
 }
 EXPECT
-\xf0 will produce malformed UTF-8 character; use \x{f0} for that at - line 12.
+Malformed UTF-8 character at - line 3.
+Malformed UTF-8 character at - line 8.
+########
diff --git a/toke.c b/toke.c

index 727fc01..bdf8e51 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1172,6 +1172,8 @@ S_scan_const(pTHX_ char *start)
     bool dorange = FALSE;                      /* are we in a translit range? */
     bool has_utf = FALSE;                      /* embedded \x{} */
     I32 len;                                   /* ? */
+    UV uv;
+
     I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
        ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
        : UTF;
@@ -1293,18 +1295,20 @@ S_scan_const(pTHX_ char *start)
        /* (now in tr/// code again) */
 
        if (*s & 0x80 && thisutf) {
-           dTHR;                       /* only for ckWARN */
-           if (ckWARN(WARN_UTF8)) {
-               (void)utf8_to_uv((U8*)s, &len); /* could cvt latin-1 to utf8 here... */
-               if (len) {
-                   has_utf = TRUE;
-                   while (len--)
-                       *d++ = *s++;
-                   continue;
-               }
-           }
-           else
-               has_utf = TRUE;         /* assume valid utf8 */
+          (void)utf8_to_uv((U8*)s, &len);
+          if (len == 1) {
+              /* illegal UTF8, make it valid */
+              /* need to grow with 1 char to be safe */
+              char *old_pvx = SvPVX(sv);
+              d = SvGROW(sv, SvCUR(sv)+2) + (d - old_pvx);
+              d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+          }
+          else {
+              while (len--)
+                  *d++ = *s++;
+          }
+          has_utf = TRUE;
+          continue;
        }
 
        /* backslashes */
@@ -1360,51 +1364,75 @@ S_scan_const(pTHX_ char *start)
            /* \132 indicates an octal constant */
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
-               *d++ = (char)scan_oct(s, 3, &len);
+               uv = (UV)scan_oct(s, 3, &len);
                s += len;
-               continue;
+               goto NUM_ESCAPE_INSERT;
 
            /* \x24 indicates a hex constant */
            case 'x':
                ++s;
                if (*s == '{') {
                    char* e = strchr(s, '}');
-                   UV uv;
-
                    if (!e) {
                        yyerror("Missing right brace on \\x{}");
                        e = s;
                    }
-                   /* note: utf always shorter than hex */
-                   uv = (UV)scan_hex(s + 1, e - s - 1, &len);
-                   if (uv > 127) {
-                       d = (char*)uv_to_utf8((U8*)d, uv);
-                       has_utf = TRUE;
-                   }
-                   else
-                       *d++ = (char)uv;
-                   s = e + 1;
+                    uv = (UV)scan_hex(s + 1, e - s - 1, &len);
+                    s = e + 1;
                }
                else {
-                   /* XXX collapse this branch into the one above */
-                   UV uv = (UV)scan_hex(s, 2, &len);
-                   if (utf && PL_lex_inwhat == OP_TRANS &&
-                       utf != (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
-                   {
-                       d = (char*)uv_to_utf8((U8*)d, uv);      /* doing a CU or UC */
+                   uv = (UV)scan_hex(s, 2, &len);
+                   s += len;
+               }
+
+             NUM_ESCAPE_INSERT:
+               /* Insert oct or hex escaped character.
+                * There will always enough room in sv since such escapes will
+                * be longer than any utf8 sequence they can end up as
+                */
+               if (uv > 127) {
+                   if (!thisutf && !has_utf && uv > 255) {
+                       /* might need to recode whatever we have accumulated so far
+                        * if it contains any hibit chars
+                        */
+                       int hicount = 0;
+                       char *c;
+                       for (c = SvPVX(sv); c < d; c++) {
+                           if (*c & 0x80)
+                               hicount++;
+                       }
+                       if (hicount) {
+                           char *old_pvx = SvPVX(sv);
+                           char *src, *dst;
+                           d = SvGROW(sv, SvCUR(sv) + hicount + 1) + (d - old_pvx);
+
+                           src = d - 1;
+                           d += hicount;
+                           dst = d - 1;
+
+                           while (src < dst) {
+                               if (*src & 0x80) {
+                                   dst--;
+                                   uv_to_utf8((U8*)dst, (U8)*src--);
+                                   dst--;
+                               }
+                               else {
+                                   *dst-- = *src--;
+                               }
+                           }
+                        }
+                    }
+
+                    if (thisutf || uv > 255) {
+                       d = (char*)uv_to_utf8((U8*)d, uv);
                        has_utf = TRUE;
-                   }
+                    }
                    else {
-                       if (uv >= 127 && UTF) {
-                           dTHR;
-                           if (ckWARN(WARN_UTF8))
-                               Perl_warner(aTHX_ WARN_UTF8,
-                                   "\\x%.*s will produce malformed UTF-8 character; use \\x{%.*s} for that",
-                                   (int)len,s,(int)len,s);
-                       }
-                       *d++ = (char)uv;
+                       *d++ = (char)uv;
                    }
-                   s += len;
+               }
+               else {
+                   *d++ = (char)uv;
                }
                continue;
t/pragma/warn/doop		patch \| blob \| blame \| history
t/pragma/warn/pp		patch \| blob \| blame \| history
t/pragma/warn/sv		patch \| blob \| blame \| history
t/pragma/warn/toke		patch \| blob \| blame \| history
t/pragma/warn/utf8		patch \| blob \| blame \| history
toke.c		patch \| blob \| blame \| history