From: Jarkko Hietaniemi Date: Sun, 26 Nov 2000 19:01:05 +0000 (+0000) Subject: Make utf8_length() and utf8_distance() (the latter of which X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=02eb7b47b8a6793752e5b001af6e62c374b2c440;p=p5sagit%2Fp5-mst-13.2.git Make utf8_length() and utf8_distance() (the latter of which is unused at the moment) to be less forgiving about bad UTF-8. p4raw-id: //depot/perl@7869 --- diff --git a/embed.pl b/embed.pl index 1d35bf6..6412ef6 100755 --- a/embed.pl +++ b/embed.pl @@ -2072,7 +2072,7 @@ p |void |utilize |int aver|I32 floor|OP* version|OP* id|OP* arg Ap |U8* |utf16_to_utf8 |U8* p|U8 *d|I32 bytelen|I32 *newlen Ap |U8* |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen Ap |STRLEN |utf8_length |U8* s|U8 *e -Ap |I32 |utf8_distance |U8 *a|U8 *b +Ap |IV |utf8_distance |U8 *a|U8 *b Ap |U8* |utf8_hop |U8 *s|I32 off ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len diff --git a/proto.h b/proto.h index 2a60195..1e34c81 100644 --- a/proto.h +++ b/proto.h @@ -807,7 +807,7 @@ PERL_CALLCONV void Perl_utilize(pTHX_ int aver, I32 floor, OP* version, OP* id, PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen); PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen); PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ U8* s, U8 *e); -PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b); +PERL_CALLCONV IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b); PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len); PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len); diff --git a/utf8.c b/utf8.c index fc625dc..d25b43b 100644 --- a/utf8.c +++ b/utf8.c @@ -357,8 +357,8 @@ Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen) =for apidoc|utf8_length|U8 *s|U8 *e Return the length of the UTF-8 char encoded string C<s> in characters. -Stops at string C<e>. 
If C<e E<lt> s> or if the scan would end up -past C<e>, return -1. +Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end +up past C<e>, croaks. =cut */ @@ -369,12 +369,12 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) STRLEN len = 0; if (e < s) - return -1; + Perl_croak(aTHX_ "panic: utf8_length: unexpected end"); while (s < e) { - STRLEN t = UTF8SKIP(s); + U8 t = UTF8SKIP(s); if (e - s < t) - return -1; + Perl_croak(aTHX_ "panic: utf8_length: unaligned end"); s += t; len++; } @@ -385,22 +385,32 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) /* utf8_distance(a,b) returns the number of UTF8 characters between the pointers a and b */ -I32 +IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b) { - I32 off = 0; + IV off = 0; + if (a < b) { while (a < b) { - a += UTF8SKIP(a); + U8 c = UTF8SKIP(a); + + if (b - a < c) + Perl_croak(aTHX_ "panic: utf8_distance: unaligned end"); + a += c; off--; } } else { while (b < a) { - b += UTF8SKIP(b); + U8 c = UTF8SKIP(b); + + if (a - b < c) + Perl_croak(aTHX_ "panic: utf8_distance: unaligned end"); + b += c; off++; } } + return off; }