replace S_sv_pos_b2u_forwards with utf8_length (was: sv_pos_b2u dislikes the extended...

diff --git a/embed.fnc b/embed.fnc

index 7511a88..d7b3592 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1426,7 +1426,6 @@ s |STRLEN |sv_pos_u2b_cached|NN SV *sv|NN MAGIC **mgp \
                |STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0
 s      |void   |utf8_mg_pos_cache_update|NN SV *sv|NN MAGIC **mgp \
                |STRLEN byte|STRLEN utf8|STRLEN blen
-s      |STRLEN |sv_pos_b2u_forwards|NN const U8 *s|NN const U8 *const target
 s      |STRLEN |sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \
                |NN const U8 *end|STRLEN endu
 s      |char * |stringify_regexp|NN SV *sv|NN MAGIC *mg|NULLOK STRLEN *lp
diff --git a/embed.h b/embed.h

index 713d7da..74adbd2 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -1427,7 +1427,6 @@
 #define sv_pos_u2b_midway      S_sv_pos_u2b_midway
 #define sv_pos_u2b_cached      S_sv_pos_u2b_cached
 #define utf8_mg_pos_cache_update       S_utf8_mg_pos_cache_update
-#define sv_pos_b2u_forwards    S_sv_pos_b2u_forwards
 #define sv_pos_b2u_midway      S_sv_pos_b2u_midway
 #define stringify_regexp       S_stringify_regexp
 #define F0convert              S_F0convert
@@ -3630,7 +3629,6 @@
 #define sv_pos_u2b_midway      S_sv_pos_u2b_midway
 #define sv_pos_u2b_cached(a,b,c,d,e,f,g)       S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g)
 #define utf8_mg_pos_cache_update(a,b,c,d,e)    S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e)
-#define sv_pos_b2u_forwards(a,b)       S_sv_pos_b2u_forwards(aTHX_ a,b)
 #define sv_pos_b2u_midway(a,b,c,d)     S_sv_pos_b2u_midway(aTHX_ a,b,c,d)
 #define stringify_regexp(a,b,c)        S_stringify_regexp(aTHX_ a,b,c)
 #define F0convert              S_F0convert
diff --git a/proto.h b/proto.h

index 47f302e..b751dba 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -3856,10 +3856,6 @@ STATIC void      S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, S
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2);
 
-STATIC STRLEN  S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target)
-                       __attribute__nonnull__(pTHX_1)
-                       __attribute__nonnull__(pTHX_2);
-
 STATIC STRLEN  S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end, STRLEN endu)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2)
diff --git a/sv.c b/sv.c

index d6039e8..a959d66 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -5687,18 +5687,7 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8,
 
     if (PL_utf8cache < 0) {
        const U8 *start = (const U8 *) SvPVX_const(sv);
-       const U8 *const end = start + byte;
-       STRLEN realutf8 = 0;
-
-       while (start < end) {
-           start += UTF8SKIP(start);
-           realutf8++;
-       }
-
-       /* Can't use S_sv_pos_b2u_forwards as it will scream warnings on
-          surrogates.  FIXME - is it inconsistent that b2u warns, but u2b
-          doesn't?  I don't know whether this difference was introduced with
-          the caching code in 5.8.1.  */
+       const STRLEN realutf8 = utf8_length(start, start + byte);
 
        if (realutf8 != utf8) {
            /* Need to turn the assertions off otherwise we may recurse
@@ -5809,29 +5798,6 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8,
     ASSERT_UTF8_CACHE(cache);
 }
 
-/* If we don't know the character offset of the end of a region, our only
-   option is to walk forwards to the target byte offset.  */
-static STRLEN
-S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target)
-{
-    STRLEN len = 0;
-    while (s < target) {
-       STRLEN n = 1;
-
-       /* Call utf8n_to_uvchr() to validate the sequence
-        * (unless a simple non-UTF character) */
-       if (!UTF8_IS_INVARIANT(*s))
-           utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0);
-       if (n > 0) {
-           s += n;
-           len++;
-       }
-       else
-           break;
-    }
-    return len;
-}
-
 /* We already know all of the way, now we may be able to walk back.  The same
    assumption is made as in S_sv_pos_u2b_midway(), namely that walking
    backward is half the speed of walking forward. */
@@ -5843,7 +5809,7 @@ S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end,
     STRLEN backw = end - target;
 
     if (forw < 2 * backw) {
-       return S_sv_pos_b2u_forwards(aTHX_ s, target);
+       return utf8_length(s, target);
     }
 
     while (end > target) {
@@ -5916,8 +5882,7 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
                        + S_sv_pos_b2u_midway(aTHX_ s + cache[1], send,
                                              s + blen, mg->mg_len - cache[0]);
                } else {
-                   len = cache[0]
-                       + S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send);
+                   len = cache[0] + utf8_length(s + cache[1], send);
                }
            }
            else if (cache[3] < byte) {
@@ -5943,7 +5908,7 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
        }
     }
     if (!found || PL_utf8cache < 0) {
-       const STRLEN real_len = S_sv_pos_b2u_forwards(aTHX_ s, send);
+       const STRLEN real_len = utf8_length(s, send);
 
        if (found && PL_utf8cache < 0) {
            if (len != real_len) {
diff --git a/t/op/index.t b/t/op/index.t

index b7cdfb0..b384bef 100755 (executable)
--- a/t/op/index.t
+++ b/t/op/index.t
@@ -3,11 +3,11 @@
 BEGIN {
     chdir 't' if -d 't';
     @INC = '../lib';
+    require './test.pl';
 }
 
 use strict;
-require './test.pl';
-plan( tests => 66 );
+plan( tests => 69 );
 
 my $foo = 'Now is the time for all good men to come to the aid of their country.';
 
@@ -140,3 +140,18 @@ foreach my $utf8 ('', ', utf-8') {
        fresh_perl_is($prog, $expect_pos, {}, "\$[ = $arraybase$utf8");
     }
 }
+
+SKIP: {
+    skip "UTF-EBCDIC is limited to 0x7fffffff", 3 if ord("A") == 193;
+
+    my $a = "\x{80000000}";
+    my $s = $a.'defxyz';
+    is(index($s, 'def'), 1, "0x80000000 is a single character");
+
+    my $b = "\x{fffffffd}";
+    my $t = $b.'pqrxyz';
+    is(index($t, 'pqr'), 1, "0xfffffffd is a single character");
+
+    local ${^UTF8CACHE} = -1;
+    is(index($t, 'xyz'), 4, "0xfffffffd and utf8cache");
+}
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
sv.c		patch \| blob \| blame \| history
t/op/index.t		patch \| blob \| blame \| history