Make utf8_length() and utf8_distance() (the latter of which is unused
at the moment) less forgiving about bad UTF-8.
Jarkko Hietaniemi [Sun, 26 Nov 2000 19:01:05 +0000 (19:01 +0000)]

p4raw-id: //depot/perl@7869

embed.pl
proto.h
utf8.c

index 1d35bf6..6412ef6 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -2072,7 +2072,7 @@ p |void   |utilize        |int aver|I32 floor|OP* version|OP* id|OP* arg
 Ap     |U8*    |utf16_to_utf8  |U8* p|U8 *d|I32 bytelen|I32 *newlen
 Ap     |U8*    |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
 Ap     |STRLEN |utf8_length    |U8* s|U8 *e
-Ap     |I32    |utf8_distance  |U8 *a|U8 *b
+Ap     |IV     |utf8_distance  |U8 *a|U8 *b
 Ap     |U8*    |utf8_hop       |U8 *s|I32 off
 ApM    |U8*    |utf8_to_bytes  |U8 *s|STRLEN *len
 ApM    |U8*    |bytes_to_utf8  |U8 *s|STRLEN *len
diff --git a/proto.h b/proto.h
index 2a60195..1e34c81 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -807,7 +807,7 @@ PERL_CALLCONV void  Perl_utilize(pTHX_ int aver, I32 floor, OP* version, OP* id,
 PERL_CALLCONV U8*      Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
 PERL_CALLCONV U8*      Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
 PERL_CALLCONV STRLEN   Perl_utf8_length(pTHX_ U8* s, U8 *e);
-PERL_CALLCONV I32      Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
+PERL_CALLCONV IV       Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
 PERL_CALLCONV U8*      Perl_utf8_hop(pTHX_ U8 *s, I32 off);
 PERL_CALLCONV U8*      Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
 PERL_CALLCONV U8*      Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
diff --git a/utf8.c b/utf8.c
index fc625dc..d25b43b 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -357,8 +357,8 @@ Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
 =for apidoc|utf8_length|U8 *s|U8 *e
 
 Return the length of the UTF-8 char encoded string C<s> in characters.
-Stops at string C<e>.  If C<e E<lt> s> or if the scan would end up 
-past C<e>, return -1.
+Stops at C<e> (exclusive).  If C<e E<lt> s> or if the scan would end
+up past C<e>, croaks.
 
 =cut
 */
@@ -369,12 +369,12 @@ Perl_utf8_length(pTHX_ U8* s, U8* e)
     STRLEN len = 0;
 
     if (e < s)
-       return -1;
+       Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
     while (s < e) {
-       STRLEN t = UTF8SKIP(s);
+       U8 t = UTF8SKIP(s);
 
        if (e - s < t)
-           return -1;
+           Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
        s += t;
        len++;
     }
@@ -385,22 +385,32 @@ Perl_utf8_length(pTHX_ U8* s, U8* e)
 /* utf8_distance(a,b) returns the number of UTF8 characters between
    the pointers a and b                                                        */
 
-I32
+IV
 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 {
-    I32 off = 0;
+    IV off = 0;
+
     if (a < b) {
        while (a < b) {
-           a += UTF8SKIP(a);
+           U8 c = UTF8SKIP(a);
+
+           if (b - a < c)
+               Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
+           a += c;
            off--;
        }
     }
     else {
        while (b < a) {
-           b += UTF8SKIP(b);
+           U8 c = UTF8SKIP(b);
+
+           if (a - b < c)
+               Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
+           b += c;
            off++;
        }
     }
+
     return off;
 }