make comparisons promote to utf8 as necessary (from Gisle Aas)
Gurusamy Sarathy [Sat, 19 Feb 2000 07:51:39 +0000 (07:51 +0000)]
p4raw-id: //depot/perl@5138

Todo-5.6
embed.h
embed.pl
global.sym
objXSUB.h
perlapi.c
pp_hot.c
proto.h
sv.c
toke.c

index 52fbc50..28b146d 100644 (file)
--- a/Todo-5.6
+++ b/Todo-5.6
@@ -4,7 +4,7 @@ Bugs
 
 Unicode support
     finish byte <-> utf8 and localencoding <-> utf8 conversions 
-    make "$bytestr$charstr" do the right conversion
+    make substr($bytestr,0,0, $charstr) do the right conversion
     add Unicode::Map equivivalent to core
     add support for I/O disciplines
         - open(F, "<!crlf!utf16", $file)
@@ -15,6 +15,7 @@ Unicode support
     support C<print v1.2.3>
     make C<v123> mean C<chr(123)> (if !exists(&v123))
     autoload utf8_heavy.pl's swash routines in swash_init()
+    check uv_to_utf8() calls for buffer overflow
 
 Multi-threading
     support "use Thread;" under useithreads
diff --git a/embed.h b/embed.h
index b16eb3d..be6a685 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define sv_pv                  Perl_sv_pv
 #define sv_pvutf8              Perl_sv_pvutf8
 #define sv_pvbyte              Perl_sv_pvbyte
+#define sv_utf8_upgrade                Perl_sv_utf8_upgrade
+#define sv_utf8_downgrade      Perl_sv_utf8_downgrade
+#define sv_utf8_encode         Perl_sv_utf8_encode
+#define sv_utf8_decode         Perl_sv_utf8_decode
 #define sv_force_normal                Perl_sv_force_normal
 #define tmps_grow              Perl_tmps_grow
 #define sv_rvweaken            Perl_sv_rvweaken
 #define sv_pv(a)               Perl_sv_pv(aTHX_ a)
 #define sv_pvutf8(a)           Perl_sv_pvutf8(aTHX_ a)
 #define sv_pvbyte(a)           Perl_sv_pvbyte(aTHX_ a)
+#define sv_utf8_upgrade(a)     Perl_sv_utf8_upgrade(aTHX_ a)
+#define sv_utf8_downgrade(a,b) Perl_sv_utf8_downgrade(aTHX_ a,b)
+#define sv_utf8_encode(a)      Perl_sv_utf8_encode(aTHX_ a)
+#define sv_utf8_decode(a)      Perl_sv_utf8_decode(aTHX_ a)
 #define sv_force_normal(a)     Perl_sv_force_normal(aTHX_ a)
 #define tmps_grow(a)           Perl_tmps_grow(aTHX_ a)
 #define sv_rvweaken(a)         Perl_sv_rvweaken(aTHX_ a)
 #define sv_pvutf8              Perl_sv_pvutf8
 #define Perl_sv_pvbyte         CPerlObj::Perl_sv_pvbyte
 #define sv_pvbyte              Perl_sv_pvbyte
+#define Perl_sv_utf8_upgrade   CPerlObj::Perl_sv_utf8_upgrade
+#define sv_utf8_upgrade                Perl_sv_utf8_upgrade
+#define Perl_sv_utf8_downgrade CPerlObj::Perl_sv_utf8_downgrade
+#define sv_utf8_downgrade      Perl_sv_utf8_downgrade
+#define Perl_sv_utf8_encode    CPerlObj::Perl_sv_utf8_encode
+#define sv_utf8_encode         Perl_sv_utf8_encode
+#define Perl_sv_utf8_decode    CPerlObj::Perl_sv_utf8_decode
+#define sv_utf8_decode         Perl_sv_utf8_decode
 #define Perl_sv_force_normal   CPerlObj::Perl_sv_force_normal
 #define sv_force_normal                Perl_sv_force_normal
 #define Perl_tmps_grow         CPerlObj::Perl_tmps_grow
index 952e673..3366a24 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -2108,6 +2108,10 @@ Ap       |char*  |sv_2pvbyte_nolen|SV* sv
 Ap     |char*  |sv_pv          |SV *sv
 Ap     |char*  |sv_pvutf8      |SV *sv
 Ap     |char*  |sv_pvbyte      |SV *sv
+Ap      |void   |sv_utf8_upgrade|SV *sv
+Ap      |bool   |sv_utf8_downgrade|SV *sv|bool fail_ok
+Ap      |void   |sv_utf8_encode |SV *sv
+Ap      |bool   |sv_utf8_decode |SV *sv
 Ap     |void   |sv_force_normal|SV *sv
 Ap     |void   |tmps_grow      |I32 n
 Ap     |SV*    |sv_rvweaken    |SV *sv
index 1451d85..fee7614 100644 (file)
@@ -499,6 +499,10 @@ Perl_sv_2pvbyte_nolen
 Perl_sv_pv
 Perl_sv_pvutf8
 Perl_sv_pvbyte
+Perl_sv_utf8_upgrade
+Perl_sv_utf8_downgrade
+Perl_sv_utf8_encode
+Perl_sv_utf8_decode
 Perl_sv_force_normal
 Perl_tmps_grow
 Perl_sv_rvweaken
index 1243e9e..2897a6a 100644 (file)
--- a/objXSUB.h
+++ b/objXSUB.h
 #define Perl_sv_pvbyte         pPerl->Perl_sv_pvbyte
 #undef  sv_pvbyte
 #define sv_pvbyte              Perl_sv_pvbyte
+#undef  Perl_sv_utf8_upgrade
+#define Perl_sv_utf8_upgrade   pPerl->Perl_sv_utf8_upgrade
+#undef  sv_utf8_upgrade
+#define sv_utf8_upgrade                Perl_sv_utf8_upgrade
+#undef  Perl_sv_utf8_downgrade
+#define Perl_sv_utf8_downgrade pPerl->Perl_sv_utf8_downgrade
+#undef  sv_utf8_downgrade
+#define sv_utf8_downgrade      Perl_sv_utf8_downgrade
+#undef  Perl_sv_utf8_encode
+#define Perl_sv_utf8_encode    pPerl->Perl_sv_utf8_encode
+#undef  sv_utf8_encode
+#define sv_utf8_encode         Perl_sv_utf8_encode
+#undef  Perl_sv_utf8_decode
+#define Perl_sv_utf8_decode    pPerl->Perl_sv_utf8_decode
+#undef  sv_utf8_decode
+#define sv_utf8_decode         Perl_sv_utf8_decode
 #undef  Perl_sv_force_normal
 #define Perl_sv_force_normal   pPerl->Perl_sv_force_normal
 #undef  sv_force_normal
index f897146..f082498 100644 (file)
--- a/perlapi.c
+++ b/perlapi.c
@@ -3659,6 +3659,34 @@ Perl_sv_pvbyte(pTHXo_ SV *sv)
     return ((CPerlObj*)pPerl)->Perl_sv_pvbyte(sv);
 }
 
+#undef  Perl_sv_utf8_upgrade
+void
+Perl_sv_utf8_upgrade(pTHXo_ SV *sv)
+{
+    ((CPerlObj*)pPerl)->Perl_sv_utf8_upgrade(sv);
+}
+
+#undef  Perl_sv_utf8_downgrade
+bool
+Perl_sv_utf8_downgrade(pTHXo_ SV *sv, bool fail_ok)
+{
+    return ((CPerlObj*)pPerl)->Perl_sv_utf8_downgrade(sv, fail_ok);
+}
+
+#undef  Perl_sv_utf8_encode
+void
+Perl_sv_utf8_encode(pTHXo_ SV *sv)
+{
+    ((CPerlObj*)pPerl)->Perl_sv_utf8_encode(sv);
+}
+
+#undef  Perl_sv_utf8_decode
+bool
+Perl_sv_utf8_decode(pTHXo_ SV *sv)
+{
+    return ((CPerlObj*)pPerl)->Perl_sv_utf8_decode(sv);
+}
+
 #undef  Perl_sv_force_normal
 void
 Perl_sv_force_normal(pTHXo_ SV *sv)
index 8dab651..ddb900f 100644 (file)
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -184,7 +184,16 @@ PP(pp_concat)
            }
        }
 #endif
+       if (DO_UTF8(right))
+           sv_utf8_upgrade(TARG);
        sv_catpvn(TARG,s,len);
+       if (!IN_BYTE) {
+           if (SvUTF8(right))
+               SvUTF8_on(TARG);
+       }
+       else if (!SvUTF8(right)) {
+           SvUTF8_off(TARG);
+       }
     }
     else
        sv_setpvn(TARG,s,len);  /* suppress warning */
diff --git a/proto.h b/proto.h
index df2ddb4..31b8f45 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -885,6 +885,10 @@ PERL_CALLCONV char*        Perl_sv_2pvbyte_nolen(pTHX_ SV* sv);
 PERL_CALLCONV char*    Perl_sv_pv(pTHX_ SV *sv);
 PERL_CALLCONV char*    Perl_sv_pvutf8(pTHX_ SV *sv);
 PERL_CALLCONV char*    Perl_sv_pvbyte(pTHX_ SV *sv);
+PERL_CALLCONV void     Perl_sv_utf8_upgrade(pTHX_ SV *sv);
+PERL_CALLCONV bool     Perl_sv_utf8_downgrade(pTHX_ SV *sv, bool fail_ok);
+PERL_CALLCONV void     Perl_sv_utf8_encode(pTHX_ SV *sv);
+PERL_CALLCONV bool     Perl_sv_utf8_decode(pTHX_ SV *sv);
 PERL_CALLCONV void     Perl_sv_force_normal(pTHX_ SV *sv);
 PERL_CALLCONV void     Perl_tmps_grow(pTHX_ I32 n);
 PERL_CALLCONV SV*      Perl_sv_rvweaken(pTHX_ SV *sv);
diff --git a/sv.c b/sv.c
index 616344b..e22dbc2 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -2214,7 +2214,8 @@ Perl_sv_2pv(pTHX_ register SV *sv, STRLEN *lp)
 char *
 Perl_sv_2pvbyte_nolen(pTHX_ register SV *sv)
 {
-    return sv_2pv_nolen(sv);
+    STRLEN n_a;
+    return sv_2pvbyte(sv, &n_a);
 }
 
 char *
@@ -2226,12 +2227,14 @@ Perl_sv_2pvbyte(pTHX_ register SV *sv, STRLEN *lp)
 char *
 Perl_sv_2pvutf8_nolen(pTHX_ register SV *sv)
 {
-    return sv_2pv_nolen(sv);
+    STRLEN n_a;
+    return sv_2pvutf8(sv, &n_a);
 }
 
 char *
 Perl_sv_2pvutf8(pTHX_ register SV *sv, STRLEN *lp)
 {
+    sv_utf8_upgrade(sv);
     return sv_2pv(sv,lp);
 }
  
@@ -2273,6 +2276,139 @@ Perl_sv_2bool(pTHX_ register SV *sv)
     }
 }
 
+void
+Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
+{
+    int hicount;
+    char *c;
+
+    if (!sv || !SvPOK(sv) || SvUTF8(sv))
+       return;
+
+    /* This function could be much more efficient if we had a FLAG
+     * to signal if there are any hibit chars in the string
+     */
+    hicount = 0;
+    for (c = SvPVX(sv); c < SvEND(sv); c++) {
+       if (*c & 0x80)
+           hicount++;
+    }
+
+    if (hicount) {
+       char *src, *dst;
+       SvGROW(sv, SvCUR(sv) + hicount + 1);
+
+       src = SvEND(sv) - 1;
+       SvCUR_set(sv, SvCUR(sv) + hicount);
+       dst = SvEND(sv) - 1;
+
+       while (src < dst) {
+           if (*src & 0x80) {
+               dst--;
+               uv_to_utf8((U8*)dst, (U8)*src--);
+               dst--;
+           }
+           else {
+               *dst-- = *src--;
+           }
+       }
+
+       SvUTF8_on(sv);
+    }
+}
+
+bool
+Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok)
+{
+    if (SvPOK(sv) && SvUTF8(sv)) {
+        char *c = SvPVX(sv);
+        char *first_hi = 0;
+        /* need to figure out if this is possible at all first */
+        while (c < SvEND(sv)) {
+            if (*c & 0x80) {
+                I32 len;
+                UV uv = utf8_to_uv(c, &len);
+                if (uv >= 256) {
+                   if (fail_ok)
+                       return FALSE;
+                   else {
+                       /* XXX might want to make a callback here instead */
+                       croak("Big byte");
+                   }
+               }
+                if (!first_hi)
+                    first_hi = c;
+                c += len;
+            }
+            else {
+                c++;
+            }
+        }
+
+        if (first_hi) {
+            char *src = first_hi;
+            char *dst = first_hi;
+            while (src < SvEND(sv)) {
+                if (*src & 0x80) {
+                    I32 len;
+                    U8 u = (U8)utf8_to_uv(src, &len);
+                    *dst++ = u;
+                    src += len;
+                }
+                else {
+                    *dst++ = *src++;
+                }
+            }
+            SvCUR_set(sv, dst - SvPVX(sv));
+        }
+        SvUTF8_off(sv);
+    }
+    return TRUE;
+}
+
+void
+Perl_sv_utf8_encode(pTHX_ register SV *sv)
+{
+    sv_utf8_upgrade(sv);
+    SvUTF8_off(sv);
+}
+
+bool
+Perl_sv_utf8_decode(pTHX_ register SV *sv)
+{
+    if (SvPOK(sv)) {
+        char *c;
+        bool has_utf = FALSE;
+        if (!sv_utf8_downgrade(sv, TRUE))
+           return FALSE;
+
+        /* it is actually just a matter of turning the utf8 flag on, but
+         * we want to make sure everything inside is valid utf8 first.
+         */
+        c = SvPVX(sv);
+        while (c < SvEND(sv)) {
+            if (*c & 0x80) {
+                I32 len;
+                (void)utf8_to_uv((U8*)c, &len);
+                if (len == 1) {
+                    /* bad utf8 */
+                    return FALSE;
+                }
+                c += len;
+                has_utf = TRUE;
+            }
+            else {
+                c++;
+            }
+        }
+
+        if (has_utf)
+            SvUTF8_on(sv);
+    }
+    return TRUE;
+}
+
+
 /* Note: sv_setsv() should not be called with a source string that needs
  * to be reused, since it may destroy the source string if it is marked
  * as temporary.
@@ -2955,10 +3091,13 @@ Perl_sv_catsv(pTHX_ SV *dstr, register SV *sstr)
     STRLEN len;
     if (!sstr)
        return;
-    if (s = SvPV(sstr, len))
+    if (s = SvPV(sstr, len)) {
+       if (SvUTF8(sstr))
+           sv_utf8_upgrade(dstr);
        sv_catpvn(dstr,s,len);
-    if (SvUTF8(sstr))
-       SvUTF8_on(dstr);
+       if (SvUTF8(sstr))
+           SvUTF8_on(dstr);
+    }
 }
 
 /*
@@ -3807,11 +3946,42 @@ C<sv2>.
 I32
 Perl_sv_cmp(pTHX_ register SV *str1, register SV *str2)
 {
-    STRLEN cur1 = 0;
-    char *pv1 = str1 ? SvPV(str1, cur1) : (char *) NULL;
-    STRLEN cur2 = 0;
-    char *pv2 = str2 ? SvPV(str2, cur2) : (char *) NULL;
+    STRLEN cur1, cur2;
+    char *pv1, *pv2;
     I32 retval;
+    bool utf1;
+
+    if (str1) {
+        pv1 = SvPV(str1, cur1);
+    }
+    else {
+       cur1 = 0;
+    }
+
+    if (str2) {
+       if (SvPOK(str2)) {
+           if (SvPOK(str1) && SvUTF8(str1) != SvUTF8(str2) && !IN_BYTE) {
+               /* must upgrade other to UTF8 first */
+               if (SvUTF8(str1)) {
+                   sv_utf8_upgrade(str2);
+               }
+               else {
+                   sv_utf8_upgrade(str1);
+                   /* refresh pointer and length */
+                   pv1  = SvPVX(str1);
+                   cur1 = SvCUR(str1);
+               }
+           }
+           pv2  = SvPVX(str2);
+           cur2 = SvCUR(str2);
+       }
+       else {
+           pv2 = sv_2pv(str2, &cur2);
+       }
+    }
+    else {
+       cur2 = 0;
+    }
 
     if (!cur1)
        return cur2 ? -1 : 0;
@@ -4957,18 +5127,21 @@ Perl_sv_pvbyten_force(pTHX_ SV *sv, STRLEN *lp)
 char *
 Perl_sv_pvutf8(pTHX_ SV *sv)
 {
+    sv_utf8_upgrade(sv);
     return sv_pv(sv);
 }
 
 char *
 Perl_sv_pvutf8n(pTHX_ SV *sv, STRLEN *lp)
 {
+    sv_utf8_upgrade(sv);
     return sv_pvn(sv,lp);
 }
 
 char *
 Perl_sv_pvutf8n_force(pTHX_ SV *sv, STRLEN *lp)
 {
+    sv_utf8_upgrade(sv);
     return sv_pvn_force(sv,lp);
 }
 
diff --git a/toke.c b/toke.c
index b6ffc2b..d978140 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -6904,7 +6904,7 @@ Perl_scan_num(pTHX_ char *start)
                pos++;
            if (*pos == '.' && isDIGIT(pos[1])) {
                UV rev;
-               U8 tmpbuf[10];
+               U8 tmpbuf[UTF8_MAXLEN];
                U8 *tmpend;
                NV nshift = 1.0;
                bool utf8 = FALSE;
@@ -6930,7 +6930,6 @@ Perl_scan_num(pTHX_ char *start)
                        tmpbuf[0] = (U8)rev;
                        tmpend = &tmpbuf[1];
                    }
-                   *tmpend = '\0';
                    sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
                    if (rev > 0)
                        SvNVX(sv) += (NV)rev/nshift;
@@ -6943,7 +6942,6 @@ Perl_scan_num(pTHX_ char *start)
                s = pos;
                tmpend = uv_to_utf8(tmpbuf, rev);
                utf8 = utf8 || rev > 127;
-               *tmpend = '\0';
                sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
                if (rev > 0)
                    SvNVX(sv) += (NV)rev/nshift;
@@ -6951,8 +6949,10 @@ Perl_scan_num(pTHX_ char *start)
                SvPOK_on(sv);
                SvNOK_on(sv);
                SvREADONLY_on(sv);
-               if (utf8)
+               if (utf8) {
                    SvUTF8_on(sv);
+                   sv_utf8_downgrade(sv, TRUE);
+               }
            }
        }
        break;