|STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0
s |void |utf8_mg_pos_cache_update|NN SV *sv|NN MAGIC **mgp \
|STRLEN byte|STRLEN utf8|STRLEN blen
-s |STRLEN |sv_pos_b2u_forwards|NN const U8 *s|NN const U8 *const target
s |STRLEN |sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \
|NN const U8 *end|STRLEN endu
s |char * |stringify_regexp|NN SV *sv|NN MAGIC *mg|NULLOK STRLEN *lp
#define sv_pos_u2b_midway S_sv_pos_u2b_midway
#define sv_pos_u2b_cached S_sv_pos_u2b_cached
#define utf8_mg_pos_cache_update S_utf8_mg_pos_cache_update
-#define sv_pos_b2u_forwards S_sv_pos_b2u_forwards
#define sv_pos_b2u_midway S_sv_pos_b2u_midway
#define stringify_regexp S_stringify_regexp
#define F0convert S_F0convert
#define sv_pos_u2b_midway S_sv_pos_u2b_midway
#define sv_pos_u2b_cached(a,b,c,d,e,f,g) S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g)
#define utf8_mg_pos_cache_update(a,b,c,d,e) S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e)
-#define sv_pos_b2u_forwards(a,b) S_sv_pos_b2u_forwards(aTHX_ a,b)
#define sv_pos_b2u_midway(a,b,c,d) S_sv_pos_b2u_midway(aTHX_ a,b,c,d)
#define stringify_regexp(a,b,c) S_stringify_regexp(aTHX_ a,b,c)
#define F0convert S_F0convert
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);
-STATIC STRLEN S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target)
- __attribute__nonnull__(pTHX_1)
- __attribute__nonnull__(pTHX_2);
-
STATIC STRLEN S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end, STRLEN endu)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
if (PL_utf8cache < 0) {
const U8 *start = (const U8 *) SvPVX_const(sv);
- const U8 *const end = start + byte;
- STRLEN realutf8 = 0;
-
- while (start < end) {
- start += UTF8SKIP(start);
- realutf8++;
- }
-
- /* Can't use S_sv_pos_b2u_forwards as it will scream warnings on
- surrogates. FIXME - is it inconsistent that b2u warns, but u2b
- doesn't? I don't know whether this difference was introduced with
- the caching code in 5.8.1. */
+ const STRLEN realutf8 = utf8_length(start, start + byte);
if (realutf8 != utf8) {
/* Need to turn the assertions off otherwise we may recurse
ASSERT_UTF8_CACHE(cache);
}
-/* If we don't know the character offset of the end of a region, our only
- option is to walk forwards to the target byte offset. */
-static STRLEN
-S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target)
-{
- STRLEN len = 0;
- while (s < target) {
- STRLEN n = 1;
-
- /* Call utf8n_to_uvchr() to validate the sequence
- * (unless a simple non-UTF character) */
- if (!UTF8_IS_INVARIANT(*s))
- utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0);
- if (n > 0) {
- s += n;
- len++;
- }
- else
- break;
- }
- return len;
-}
-
/* We already know all of the way, now we may be able to walk back. The same
assumption is made as in S_sv_pos_u2b_midway(), namely that walking
backward is half the speed of walking forward. */
STRLEN backw = end - target;
if (forw < 2 * backw) {
- return S_sv_pos_b2u_forwards(aTHX_ s, target);
+ return utf8_length(s, target);
}
while (end > target) {
+ S_sv_pos_b2u_midway(aTHX_ s + cache[1], send,
s + blen, mg->mg_len - cache[0]);
} else {
- len = cache[0]
- + S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send);
+ len = cache[0] + utf8_length(s + cache[1], send);
}
}
else if (cache[3] < byte) {
}
}
if (!found || PL_utf8cache < 0) {
- const STRLEN real_len = S_sv_pos_b2u_forwards(aTHX_ s, send);
+ const STRLEN real_len = utf8_length(s, send);
if (found && PL_utf8cache < 0) {
if (len != real_len) {
BEGIN {
chdir 't' if -d 't';
@INC = '../lib';
+ require './test.pl';
}
use strict;
-require './test.pl';
-plan( tests => 66 );
+plan( tests => 69 );
my $foo = 'Now is the time for all good men to come to the aid of their country.';
fresh_perl_is($prog, $expect_pos, {}, "\$[ = $arraybase$utf8");
}
}
+
+SKIP: {
+ skip "UTF-EBCDIC is limited to 0x7fffffff", 3 if ord("A") == 193;
+
+ my $a = "\x{80000000}";
+ my $s = $a.'defxyz';
+ is(index($s, 'def'), 1, "0x80000000 is a single character");
+
+ my $b = "\x{fffffffd}";
+ my $t = $b.'pqrxyz';
+ is(index($t, 'pqr'), 1, "0xfffffffd is a single character");
+
+ local ${^UTF8CACHE} = -1;
+ is(index($t, 'xyz'), 4, "0xfffffffd and utf8cache");
+}