From: Tony Cook Date: Tue, 16 Mar 2010 12:46:48 +0000 (+1100) Subject: handle perl extended utf8 start bytes X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=65ab9279784aa811d78b2903b57bc0e7947dec78;p=p5sagit%2Fp5-mst-13.2.git handle perl extended utf8 start bytes perl uses UTF8_IS_START() to test if a byte is a valid start byte, but this didn't take perl's extended UTF-8 range into account. --- diff --git a/t/op/chop.t b/t/op/chop.t index 30f7bff..36f8cad 100644 --- a/t/op/chop.t +++ b/t/op/chop.t @@ -6,7 +6,7 @@ BEGIN { require './test.pl'; } -plan tests => 139; +plan tests => 143; $_ = 'abc'; $c = foo(); @@ -243,3 +243,22 @@ foreach my $start (@chars) { map chomp(+()), ('')x68; ok(1, "extend sp in pp_chomp"); } + +{ + # [perl #73246] chop doesn't support utf8 + # the problem was UTF8_IS_START() didn't handle perl's extended UTF8 + my $utf = "\x{80000001}\x{80000000}"; + my $result = chop($utf); + is($utf, "\x{80000001}", "chopping high 'unicode'- remnant"); + is($result, "\x{80000000}", "chopping high 'unicode' - result"); + + SKIP: { + use Config; + $Config{ivsize} >= 8 + or skip("this build can't handle very large characters", 2); + my $utf = "\x{ffffffffffffffff}\x{fffffffffffffffe}"; + my $result = chop $utf; + is($utf, "\x{ffffffffffffffff}", "chop even higher 'unicode' - remnant"); + is($result, "\x{fffffffffffffffe}", "chop even higher 'unicode' - result"); + } +} diff --git a/utf8.h b/utf8.h index e58dded..b0cfedf 100644 --- a/utf8.h +++ b/utf8.h @@ -104,13 +104,15 @@ As you can see, the continuation bytes all begin with C<10>, and the leading bits of the start byte tell how many bytes there are in the encoded character. +Perl's extended UTF-8 means we can have start bytes up to FF. 
+ */ #define UNI_IS_INVARIANT(c) (((UV)c) < 0x80) /* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the * below might ought to be C2 */ -#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd)) +#define UTF8_IS_START(c) (((U8)c) >= 0xc0) #define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf)) #define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80) #define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)