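NI-S's cunning idea for de-UTF8ing the "\C-broken" submatches: only treat a submatch as UTF-8 (for its length in magic_len and its SvUTF8 flag in magic_get) when its bytes pass is_utf8_string(); otherwise treat it as plain bytes.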

diff --git a/mg.c b/mg.c

index eb79dc4..931b1a1 100644 (file)
--- a/mg.c
+++ b/mg.c
@@ -412,7 +412,9 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg)
                    char *s    = rx->subbeg + s1;
                    char *send = rx->subbeg + t1;
 
-                   i = Perl_utf8_length(aTHX_ (U8*)s, (U8*)send);
+                   i = t1 - s1;
+                   if (is_utf8_string((U8*)s, i))
+                       i = Perl_utf8_length(aTHX_ (U8*)s, (U8*)send);
                }
                if (i < 0)
                    Perl_croak(aTHX_ "panic: magic_len: %"IVdf, (IV)i);
@@ -630,7 +632,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
                        PL_tainted = FALSE;
                    }
                    sv_setpvn(sv, s, i);
-                   if (DO_UTF8(PL_reg_sv))
+                   if (DO_UTF8(PL_reg_sv) && is_utf8_string((U8*)s, i))
                        SvUTF8_on(sv);
                    else
                        SvUTF8_off(sv);
diff --git a/t/op/pat.t b/t/op/pat.t

index a66ea45..8575ca8 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -1133,8 +1133,6 @@ $test++;
 $_ = "a\x{100}b";
 if (/(.)(\C)(\C)(.)/) {
   print "ok 232\n";
-  # currently \C are still tagged as UTF-8
-  use bytes;
   if ($1 eq "a") {
     print "ok 233\n";
   } else {
@@ -1164,7 +1162,6 @@ $_ = "\x{100}";
 if (/(\C)/g) {
   print "ok 237\n";
   # currently \C are still tagged as UTF-8
-  use bytes;
   if ($1 eq "\xC4") {
     print "ok 238\n";
   } else {
@@ -1178,7 +1175,6 @@ if (/(\C)/g) {
 if (/(\C)/g) {
   print "ok 239\n";
   # currently \C are still tagged as UTF-8
-  use bytes;
   if ($1 eq "\x80") {
     print "ok 240\n";
   } else {
@@ -1231,7 +1227,7 @@ if (ord('i') == 0x89 && ord('J') == 0xd1) { # EBCDIC
   }
 } else {
   for (244..245) {
-    print "ok $_ # Skip: not EBCDIC\n";
+    print "ok $_ # Skip: only in EBCDIC\n";
   }
 }
mg.c		patch \| blob \| blame \| history
t/op/pat.t		patch \| blob \| blame \| history