[ID 20001113.003] utf8_to_uv on malformed UTF-8 returns wrong values
Yitzchak Scott-Thoennes [Mon, 13 Nov 2000 14:49:40 +0000 (06:49 -0800)]
Message-Id: <200011132249.eADMnek09679@garcia.efn.org>

p4raw-id: //depot/perl@7677

doop.c
pp.c
toke.c
utf8.c
utf8.h

diff --git a/doop.c b/doop.c
index 3d22eb4..a2990ce 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -968,10 +968,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
        switch (optype) {
        case OP_BIT_AND:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc & ruc;
@@ -983,10 +983,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            break;
        case OP_BIT_XOR:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc ^ ruc;
@@ -995,10 +995,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            goto mop_up_utf;
        case OP_BIT_OR:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc | ruc;
diff --git a/pp.c b/pp.c
index 2a414b8..6001165 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -1486,7 +1486,7 @@ PP(pp_complement)
 
          send = tmps + len;
          while (tmps < send) {
-           UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
+           UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
            tmps += UTF8SKIP(tmps);
            targlen += UNISKIP(~c);
            nchar++;
@@ -1500,7 +1500,7 @@ PP(pp_complement)
          if (nwide) {
              Newz(0, result, targlen + 1, U8);
              while (tmps < send) {
-                 UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
+                 UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
                  tmps += UTF8SKIP(tmps);
                  result = uv_to_utf8(result, ~c);
              }
diff --git a/toke.c b/toke.c
index 458e258..b48577e 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1332,7 +1332,7 @@ S_scan_const(pTHX_ char *start)
            UV uv;
 
            uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
-           if (len == 1) {
+           if (len == (STRLEN)-1) {
                /* Illegal UTF8 (a high-bit byte), make it valid. */
                char *old_pvx = SvPVX(sv);
                /* need space for one extra char (NOTE: SvCUR() not set here) */
diff --git a/utf8.c b/utf8.c
index 6ddf42b..a54726f 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -312,12 +312,12 @@ malformed:
 
     if (flags & UTF8_CHECK_ONLY) {
        if (retlen)
-           *retlen = len;
+           *retlen = -1;
        return 0;
     }
 
     if (retlen)
-       *retlen = -1;
+       *retlen = expectlen ? expectlen : len;
 
     return UNICODE_REPLACEMENT_CHARACTER;
 }
diff --git a/utf8.h b/utf8.h
index dc93e95..269ad3e 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -41,6 +41,8 @@ END_EXTERN_C
 #define UTF8_ALLOW_BOM                 0x0020
 #define UTF8_ALLOW_FFFF                        0x0040
 #define UTF8_ALLOW_LONG                        0x0080
+#define UTF8_ALLOW_ANYUV               (UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF \
+                                       |UTF8_ALLOW_BOM|UTF8_ALLOW_SURROGATE)
 #define UTF8_ALLOW_ANY                 0x00ff
 #define UTF8_CHECK_ONLY                        0x0100