From: SADAHIRO Tomoyuki Date: Sun, 21 May 2006 14:00:43 +0000 (+0900) Subject: replace S_sv_pos_b2u_forwards with utf8_length (was: sv_pos_b2u dislikes the extended... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=6448472a7a0e4a2f91d781fd6991f5f5126d6fbc;p=p5sagit%2Fp5-mst-13.2.git replace S_sv_pos_b2u_forwards with utf8_length (was: sv_pos_b2u dislikes the extended UTF-8) Message-Id: <20060521140011.358D.BQW10602@nifty.com> p4raw-id: //depot/perl@29201 --- diff --git a/embed.fnc b/embed.fnc index 7511a88..d7b3592 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1426,7 +1426,6 @@ s |STRLEN |sv_pos_u2b_cached|NN SV *sv|NN MAGIC **mgp \ |STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0 s |void |utf8_mg_pos_cache_update|NN SV *sv|NN MAGIC **mgp \ |STRLEN byte|STRLEN utf8|STRLEN blen -s |STRLEN |sv_pos_b2u_forwards|NN const U8 *s|NN const U8 *const target s |STRLEN |sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \ |NN const U8 *end|STRLEN endu s |char * |stringify_regexp|NN SV *sv|NN MAGIC *mg|NULLOK STRLEN *lp diff --git a/embed.h b/embed.h index 713d7da..74adbd2 100644 --- a/embed.h +++ b/embed.h @@ -1427,7 +1427,6 @@ #define sv_pos_u2b_midway S_sv_pos_u2b_midway #define sv_pos_u2b_cached S_sv_pos_u2b_cached #define utf8_mg_pos_cache_update S_utf8_mg_pos_cache_update -#define sv_pos_b2u_forwards S_sv_pos_b2u_forwards #define sv_pos_b2u_midway S_sv_pos_b2u_midway #define stringify_regexp S_stringify_regexp #define F0convert S_F0convert @@ -3630,7 +3629,6 @@ #define sv_pos_u2b_midway S_sv_pos_u2b_midway #define sv_pos_u2b_cached(a,b,c,d,e,f,g) S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g) #define utf8_mg_pos_cache_update(a,b,c,d,e) S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e) -#define sv_pos_b2u_forwards(a,b) S_sv_pos_b2u_forwards(aTHX_ a,b) #define sv_pos_b2u_midway(a,b,c,d) S_sv_pos_b2u_midway(aTHX_ a,b,c,d) #define stringify_regexp(a,b,c) S_stringify_regexp(aTHX_ a,b,c) #define F0convert S_F0convert diff --git a/proto.h b/proto.h index 47f302e..b751dba 100644 --- a/proto.h +++ b/proto.h @@ -3856,10 +3856,6 @@ STATIC void S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, S __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); -STATIC STRLEN S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target) - __attribute__nonnull__(pTHX_1) - __attribute__nonnull__(pTHX_2); - STATIC STRLEN S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end, STRLEN endu) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) diff --git a/sv.c b/sv.c index d6039e8..a959d66 100644 --- a/sv.c +++ b/sv.c @@ -5687,18 +5687,7 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8, if (PL_utf8cache < 0) { const U8 *start = (const U8 *) SvPVX_const(sv); - const U8 *const end = start + byte; - STRLEN realutf8 = 0; - - while (start < end) { - start += UTF8SKIP(start); - realutf8++; - } - - /* Can't use S_sv_pos_b2u_forwards as it will scream warnings on - surrogates. FIXME - is it inconsistent that b2u warns, but u2b - doesn't? I don't know whether this difference was introduced with - the caching code in 5.8.1. */ + const STRLEN realutf8 = utf8_length(start, start + byte); if (realutf8 != utf8) { /* Need to turn the assertions off otherwise we may recurse @@ -5809,29 +5798,6 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8, ASSERT_UTF8_CACHE(cache); } -/* If we don't know the character offset of the end of a region, our only - option is to walk forwards to the target byte offset. */ -static STRLEN -S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target) -{ - STRLEN len = 0; - while (s < target) { - STRLEN n = 1; - - /* Call utf8n_to_uvchr() to validate the sequence - * (unless a simple non-UTF character) */ - if (!UTF8_IS_INVARIANT(*s)) - utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0); - if (n > 0) { - s += n; - len++; - } - else - break; - } - return len; -} - /* We already know all of the way, now we may be able to walk back. The same assumption is made as in S_sv_pos_u2b_midway(), namely that walking backward is half the speed of walking forward. */ @@ -5843,7 +5809,7 @@ S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end, STRLEN backw = end - target; if (forw < 2 * backw) { - return S_sv_pos_b2u_forwards(aTHX_ s, target); + return utf8_length(s, target); } while (end > target) { @@ -5916,8 +5882,7 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp) + S_sv_pos_b2u_midway(aTHX_ s + cache[1], send, s + blen, mg->mg_len - cache[0]); } else { - len = cache[0] - + S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send); + len = cache[0] + utf8_length(s + cache[1], send); } } else if (cache[3] < byte) { @@ -5943,7 +5908,7 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp) } } if (!found || PL_utf8cache < 0) { - const STRLEN real_len = S_sv_pos_b2u_forwards(aTHX_ s, send); + const STRLEN real_len = utf8_length(s, send); if (found && PL_utf8cache < 0) { if (len != real_len) { diff --git a/t/op/index.t b/t/op/index.t index b7cdfb0..b384bef 100755 --- a/t/op/index.t +++ b/t/op/index.t @@ -3,11 +3,11 @@ BEGIN { chdir 't' if -d 't'; @INC = '../lib'; + require './test.pl'; } use strict; -require './test.pl'; -plan( tests => 66 ); +plan( tests => 69 ); my $foo = 'Now is the time for all good men to come to the aid of their country.'; @@ -140,3 +140,18 @@ foreach my $utf8 ('', ', utf-8') { fresh_perl_is($prog, $expect_pos, {}, "\$[ = $arraybase$utf8"); } } + +SKIP: { + skip "UTF-EBCDIC is limited to 0x7fffffff", 3 if ord("A") == 193; + + my $a = "\x{80000000}"; + my $s = $a.'defxyz'; + is(index($s, 'def'), 1, "0x80000000 is a single character"); + + my $b = "\x{fffffffd}"; + my $t = $b.'pqrxyz'; + is(index($t, 'pqr'), 1, "0xfffffffd is a single character"); + + local ${^UTF8CACHE} = -1; + is(index($t, 'xyz'), 4, "0xfffffffd and utf8cache"); +}