Faster sv_utf8_upgrade()

diff --git a/embed.fnc b/embed.fnc

index 1a0e5d3..749b975 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1871,7 +1871,8 @@ Apd       |void   |sv_setsv_flags |NN SV *dstr|NULLOK SV *sstr|const I32 flags
 Apd    |void   |sv_catpvn_flags|NN SV *const dstr|NN const char *sstr|const STRLEN len \
                                |const I32 flags
 Apd    |void   |sv_catsv_flags |NN SV *const dsv|NULLOK SV *const ssv|const I32 flags
-Apd    |STRLEN |sv_utf8_upgrade_flags|NN SV *const sv|const I32 flags
+Apmd   |STRLEN |sv_utf8_upgrade_flags|NN SV *const sv|const I32 flags
+Apd    |STRLEN |sv_utf8_upgrade_flags_grow|NN SV *const sv|const I32 flags|STRLEN extra
 Apd    |char*  |sv_pvn_force_flags|NN SV *const sv|NULLOK STRLEN *const lp|const I32 flags
 Apd    |void   |sv_copypv      |NN SV *const dsv|NN SV *const ssv
 Ap     |char*  |my_atof2       |NN const char *s|NN NV* value
diff --git a/embed.h b/embed.h

index b1f9741..2d43283 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -1649,7 +1649,7 @@
 #define sv_setsv_flags         Perl_sv_setsv_flags
 #define sv_catpvn_flags                Perl_sv_catpvn_flags
 #define sv_catsv_flags         Perl_sv_catsv_flags
-#define sv_utf8_upgrade_flags  Perl_sv_utf8_upgrade_flags
+#define sv_utf8_upgrade_flags_grow     Perl_sv_utf8_upgrade_flags_grow
 #define sv_pvn_force_flags     Perl_sv_pvn_force_flags
 #define sv_copypv              Perl_sv_copypv
 #define my_atof2               Perl_my_atof2
@@ -4007,7 +4007,7 @@
 #define sv_setsv_flags(a,b,c)  Perl_sv_setsv_flags(aTHX_ a,b,c)
 #define sv_catpvn_flags(a,b,c,d)       Perl_sv_catpvn_flags(aTHX_ a,b,c,d)
 #define sv_catsv_flags(a,b,c)  Perl_sv_catsv_flags(aTHX_ a,b,c)
-#define sv_utf8_upgrade_flags(a,b)     Perl_sv_utf8_upgrade_flags(aTHX_ a,b)
+#define sv_utf8_upgrade_flags_grow(a,b,c)      Perl_sv_utf8_upgrade_flags_grow(aTHX_ a,b,c)
 #define sv_pvn_force_flags(a,b,c)      Perl_sv_pvn_force_flags(aTHX_ a,b,c)
 #define sv_copypv(a,b)         Perl_sv_copypv(aTHX_ a,b)
 #define my_atof2(a,b)          Perl_my_atof2(aTHX_ a,b)
diff --git a/global.sym b/global.sym

index 0b21dcd..1bc756a 100644 (file)
--- a/global.sym
+++ b/global.sym
@@ -709,7 +709,7 @@ Perl_Slab_Free
 Perl_sv_setsv_flags
 Perl_sv_catpvn_flags
 Perl_sv_catsv_flags
-Perl_sv_utf8_upgrade_flags
+Perl_sv_utf8_upgrade_flags_grow
 Perl_sv_pvn_force_flags
 Perl_sv_copypv
 Perl_my_atof2
diff --git a/pod/perlintern.pod b/pod/perlintern.pod

index e622841..4107d5e 100644 (file)
--- a/pod/perlintern.pod
+++ b/pod/perlintern.pod
@@ -470,25 +470,6 @@ Found in file mg.c
 
 =over 8
 
-=item mro_get_linear_isa_c3
-X<mro_get_linear_isa_c3>
-
-Returns the C3 linearization of @ISA
-the given stash.  The return value is a read-only AV*.
-C<level> should be 0 (it is used internally in this
-function's recursion).
-
-You are responsible for C<SvREFCNT_inc()> on the
-return value if you plan to store it anywhere
-semi-permanently (otherwise it might be deleted
-out from under you the next time the cache is
-invalidated).
-
-       AV*     mro_get_linear_isa_c3(HV* stash, U32 level)
-
-=for hackers
-Found in file mro.c
-
 =item mro_get_linear_isa_dfs
 X<mro_get_linear_isa_dfs>
 
diff --git a/proto.h b/proto.h

index ffbc9fb..4207599 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -5951,11 +5951,16 @@ PERL_CALLCONV void      Perl_sv_catsv_flags(pTHX_ SV *const dsv, SV *const ssv, const
 #define PERL_ARGS_ASSERT_SV_CATSV_FLAGS        \
        assert(dsv)
 
-PERL_CALLCONV STRLEN   Perl_sv_utf8_upgrade_flags(pTHX_ SV *const sv, const I32 flags)
-                       __attribute__nonnull__(pTHX_1);
+/* PERL_CALLCONV STRLEN        Perl_sv_utf8_upgrade_flags(pTHX_ SV *const sv, const I32 flags)
+                       __attribute__nonnull__(pTHX_1); */
 #define PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_FLAGS \
        assert(sv)
 
+PERL_CALLCONV STRLEN   Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, const I32 flags, STRLEN extra)
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_FLAGS_GROW    \
+       assert(sv)
+
 PERL_CALLCONV char*    Perl_sv_pvn_force_flags(pTHX_ SV *const sv, STRLEN *const lp, const I32 flags)
                        __attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_SV_PVN_FORCE_FLAGS    \
diff --git a/sv.c b/sv.c

index 9a5d0e3..470d5b1 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -3173,14 +3173,44 @@ This is not as a general purpose byte encoding to Unicode interface:
 use the Encode extension for that.
 
 =cut
+
+The grow version is currently not externally documented.  It adds a parameter,
+extra, which is the number of unused bytes the string of 'sv' is guaranteed to
+have free after it upon return.  This allows the caller to reserve extra space
+that it intends to fill, to avoid extra grows.
+
+Also externally undocumented for the moment is the flag SV_FORCE_UTF8_UPGRADE,
+which can be used to tell this function to not first check to see if there are
+any characters that are different in UTF-8 (variant characters) which would
+force it to allocate a new string to sv, but to assume there are.  Typically
+this flag is used by a routine that has already parsed the string to find that
+there are such characters, and passes this information on so that the work
+doesn't have to be repeated.
+
+(One might think that the calling routine could pass in the position of the
+first such variant, so it wouldn't have to be found again.  But that is not the
+case, because typically when the caller is likely to use this flag, it won't be
+calling this routine unless it finds something that won't fit into a byte.
+Otherwise it tries to not upgrade and just use bytes.  But some things that
+do fit into a byte are variants in utf8, and the caller may not have been
+keeping track of these.)
+
+If the routine itself changes the string, it adds a trailing NUL.  Such a NUL
+isn't guaranteed due to having other routines do the work in some input cases,
+or if the input is already flagged as being in utf8.
+
+The speed of this could perhaps be improved for many cases if someone wanted to
+write a fast function that counts the number of variant characters in a string,
+especially if it could return the position of the first one.
+
 */
 
 STRLEN
-Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags)
+Perl_sv_utf8_upgrade_flags_grow(pTHX_ register SV *const sv, const I32 flags, STRLEN extra)
 {
     dVAR;
 
-    PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_FLAGS;
+    PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_FLAGS_GROW;
 
     if (sv == &PL_sv_undef)
        return 0;
@@ -3188,14 +3218,17 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags)
        STRLEN len = 0;
        if (SvREADONLY(sv) && (SvPOKp(sv) || SvIOKp(sv) || SvNOKp(sv))) {
            (void) sv_2pv_flags(sv,&len, flags);
-           if (SvUTF8(sv))
+           if (SvUTF8(sv)) {
+               if (extra) SvGROW(sv, SvCUR(sv) + extra);
                return len;
+           }
        } else {
            (void) SvPV_force(sv,len);
        }
     }
 
     if (SvUTF8(sv)) {
+       if (extra) SvGROW(sv, SvCUR(sv) + extra);
        return SvCUR(sv);
     }
 
@@ -3203,42 +3236,204 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags)
         sv_force_normal_flags(sv, 0);
     }
 
-    if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING))
+    if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING)) {
         sv_recode_to_utf8(sv, PL_encoding);
-    else { /* Assume Latin-1/EBCDIC */
+       if (extra) SvGROW(sv, SvCUR(sv) + extra);
+       return SvCUR(sv);
+    }
+
+    if (SvCUR(sv) > 0) { /* Assume Latin-1/EBCDIC */
        /* This function could be much more efficient if we
         * had a FLAG in SVs to signal if there are any variant
         * chars in the PV.  Given that there isn't such a flag
-        * make the loop as fast as possible. */
-       const U8 * const s = (U8 *) SvPVX_const(sv);
-       const U8 * const e = (U8 *) SvEND(sv);
-       const U8 *t = s;
+        * make the loop as fast as possible (although there are certainly ways
+        * to speed this up, eg. through vectorization) */
+       U8 * s = (U8 *) SvPVX_const(sv);
+       U8 * e = (U8 *) SvEND(sv);
+       U8 *t = s;
+       STRLEN two_byte_count = 0;
        
+       if (flags & SV_FORCE_UTF8_UPGRADE) goto must_be_utf8;
+
+       /* See if really will need to convert to utf8.  We mustn't rely on our
+        * incoming SV being well formed and having a trailing '\0', as certain
+        * code in pp_formline can send us partially built SVs. */
+
        while (t < e) {
            const U8 ch = *t++;
-           /* Check for variant */
-           if (!NATIVE_IS_INVARIANT(ch)) {
-               STRLEN len = SvCUR(sv);
-               /* *Currently* bytes_to_utf8() adds a '\0' after every string
-                  it converts. This isn't documented. It's not clear if it's
-                  a bad thing to be doing, and should be changed to do exactly
-                  what the documentation says. If so, this code will have to
-                  be changed.
-                  As is, we mustn't rely on our incoming SV being well formed
-                  and having a trailing '\0', as certain code in pp_formline
-                  can send us partially built SVs. */
-               U8 * const recoded = bytes_to_utf8((U8*)s, &len);
-
-               SvPV_free(sv); /* No longer using what was there before. */
-               SvPV_set(sv, (char*)recoded);
-               SvCUR_set(sv, len);
-               SvLEN_set(sv, len + 1); /* No longer know the real size. */
-               break;
-           }
+           if (NATIVE_IS_INVARIANT(ch)) continue;
+
+           t--;    /* t already incremented; re-point to first variant */
+           two_byte_count = 1;
+           goto must_be_utf8;
        }
-       /* Mark as UTF-8 even if no variant - saves scanning loop */
+
+       /* utf8 conversion not needed because all are invariants.  Mark as
+        * UTF-8 even if no variant - saves scanning loop */
        SvUTF8_on(sv);
+       return SvCUR(sv);
+
+must_be_utf8:
+
+       /* Here, the string should be converted to utf8, either because of an
+        * input flag (two_byte_count = 0), or because a character that
+        * requires 2 bytes was found (two_byte_count = 1).  t points either to
+        * the beginning of the string (if we didn't examine anything), or to
+        * the first variant.  In either case, everything from s to t - 1 will
+        * occupy only 1 byte each on output.
+        *
+        * There are two main ways to convert.  One is to create a new string
+        * and go through the input starting from the beginning, appending each
+        * converted value onto the new string as we go along.  It's probably
+        * best to allocate enough space in the string for the worst possible
+        * case rather than possibly running out of space and having to
+        * reallocate and then copy what we've done so far.  Since everything
+        * from s to t - 1 is invariant, the destination can be initialized
+        * with these using a fast memory copy
+        *
+        * The other way is to figure out exactly how big the string should be
+        * by parsing the entire input.  Then you don't have to make it big
+        * enough to handle the worst possible case, and more importantly, if
+        * the string you already have is large enough, you don't have to
+        * allocate a new string, you can copy the last character in the input
+        * string to the final position(s) that will be occupied by the
+        * converted string and go backwards, stopping at t, since everything
+        * before that is invariant.
+        *
+        * There are advantages and disadvantages to each method.
+        *
+        * In the first method, we can allocate a new string, do the memory
+        * copy from the s to t - 1, and then proceed through the rest of the
+        * string byte-by-byte.
+        *
+        * In the second method, we proceed through the rest of the input
+        * string just calculating how big the converted string will be.  Then
+        * there are two cases:
+        *  1)  if the string has enough extra space to handle the converted
+        *      value.  We go backwards through the string, converting until we
+        *      get to the position we are at now, and then stop.  If this
+        *      position is far enough along in the string, this method is
+        *      faster than the other method.  If the memory copy were the same
+        *      speed as the byte-by-byte loop, that position would be about
+        *      half-way, as at the half-way mark, parsing to the end and back
+        *      is one complete string's parse, the same amount as starting
+        *      over and going all the way through.  Actually, it would be
+        *      somewhat less than half-way, as it's faster to just count bytes
+        *      than to also copy, and we don't have the overhead of allocating
+        *      a new string, changing the scalar to use it, and freeing the
+        *      existing one.  But if the memory copy is fast, the break-even
+        *      point is somewhere after half way.  The counting loop could be
+        *      sped up by vectorization, etc, to move the break-even point
+        *      further towards the beginning.
+        *  2)  if the string doesn't have enough space to handle the converted
+        *      value.  A new string will have to be allocated, and one might
+        *      as well, given that, start from the beginning doing the first
+        *      method.  We've spent extra time parsing the string and in
+        *      exchange all we've gotten is that we know precisely how big to
+        *      make the new one.  Perl is more optimized for time than space,
+        *      so this case is a loser.
+        * So what I've decided to do is not use the 2nd method unless it is
+        * guaranteed that a new string won't have to be allocated, assuming
+        * the worst case.  I also decided not to put any more conditions on it
+        * than this, for now.  It seems likely that, since the worst case is
+        * twice as big as the unknown portion of the string (plus 1), we won't
+        * be guaranteed enough space, causing us to go to the first method,
+        * unless the string is short, or the first variant character is near
+        * the end of it.  In either of these cases, it seems best to use the
+        * 2nd method.  The only circumstance I can think of where this would
+        * be really slower is if the string had once had much more data in it
+        * than it does now, but there is still a substantial amount in it  */
+
+       {
+           STRLEN invariant_head = t - s;
+           STRLEN size = invariant_head + (e - t) * 2 + 1 + extra;
+           if (SvLEN(sv) < size) {
+
+               /* Here, have decided to allocate a new string */
+
+               U8 *dst;
+               U8 *d;
+
+               Newx(dst, size, U8);
+
+               /* If no known invariants at the beginning of the input string,
+                * set so starts from there.  Otherwise, can use memory copy to
+                * get up to where we are now, and then start from here */
+
+               if (invariant_head <= 0) {
+                   d = dst;
+               } else {
+                   Copy(s, dst, invariant_head, char);
+                   d = dst + invariant_head;
+               }
+
+               while (t < e) {
+                   const UV uv = NATIVE8_TO_UNI(*t++);
+                   if (UNI_IS_INVARIANT(uv))
+                       *d++ = (U8)UNI_TO_NATIVE(uv);
+                   else {
+                       *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
+                       *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
+                   }
+               }
+               *d = '\0';
+               SvPV_free(sv); /* No longer using pre-existing string */
+               SvPV_set(sv, (char*)dst);
+               SvCUR_set(sv, d - dst);
+               SvLEN_set(sv, size);
+           } else {
+
+               /* Here, have decided to get the exact size of the string.
+                * Currently this happens only when we know that there is
+                * guaranteed enough space to fit the converted string, so
+                * don't have to worry about growing.  If two_byte_count is 0,
+                * then t points to the first byte of the string which hasn't
+                * been examined yet.  Otherwise two_byte_count is 1, and t
+                * points to the first byte in the string that will expand to
+                * two.  Depending on this, start examining at t or 1 after t.
+                * */
+
+               U8 *d = t + two_byte_count;
+
+
+               /* Count up the remaining bytes that expand to two */
+
+               while (d < e) {
+                   const U8 chr = *d++;
+                   if (! NATIVE_IS_INVARIANT(chr)) two_byte_count++;
+               }
+
+               /* The string will expand by just the number of bytes that
+                * occupy two positions.  But we are one afterwards because of
+                * the increment just above.  This is the place to put the
+                * trailing NUL, and to set the length before we decrement */
+
+               d += two_byte_count;
+               SvCUR_set(sv, d - s);
+               *d-- = '\0';
+
+
+               /* Having decremented d, it points to the position to put the
+                * very last byte of the expanded string.  Go backwards through
+                * the string, copying and expanding as we go, stopping when we
+                * get to the part that is invariant the rest of the way down */
+
+               e--;
+               while (e >= t) {
+                   const U8 ch = NATIVE8_TO_UNI(*e--);
+                   if (UNI_IS_INVARIANT(ch)) {
+                       *d-- = UNI_TO_NATIVE(ch);
+                   } else {
+                       *d-- = (U8)UTF8_EIGHT_BIT_LO(ch);
+                       *d-- = (U8)UTF8_EIGHT_BIT_HI(ch);
+                   }
+               }
+           }
+       }
     }
+
+    /* Mark as UTF-8 even if no variant - saves scanning loop */
+    SvUTF8_on(sv);
     return SvCUR(sv);
 }
 
diff --git a/sv.h b/sv.h

index 43bc541..9acb84e 100644 (file)
--- a/sv.h
+++ b/sv.h
@@ -1716,6 +1716,10 @@ Like sv_utf8_upgrade, but doesn't do magic on C<sv>
 #define SV_COW_OTHER_PVS       1024
 /* Make sv_2pv_flags return NULL if something is undefined.  */
 #define SV_UNDEF_RETURNS_NULL  2048
+/* Tell sv_utf8_upgrade() to not check to see if an upgrade is really needed.
+ * This is used when the caller has already determined it is, and avoids
+ * redundant work */
+#define SV_FORCE_UTF8_UPGRADE  4096
 
 /* The core is safe for this COW optimisation. XS code on CPAN may not be.
    So only default to doing the COW setup if we're in the core.
@@ -1775,6 +1779,7 @@ mg.c:1024: warning: left-hand operand of comma expression has no effect
 #define sv_pvbyte(sv) SvPVbyte_nolen(sv)
 
 #define sv_pvn_force_nomg(sv, lp) sv_pvn_force_flags(sv, lp, 0)
+#define sv_utf8_upgrade_flags(sv, flags) sv_utf8_upgrade_flags_grow(sv, flags, 0)
 #define sv_utf8_upgrade_nomg(sv) sv_utf8_upgrade_flags(sv, 0)
 #define sv_catpvn_nomg(dsv, sstr, slen) sv_catpvn_flags(dsv, sstr, slen, 0)
 #define sv_setsv(dsv, ssv) \
diff --git a/utf8.h b/utf8.h

index e8efd14..f9f1bec 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -51,6 +51,7 @@ END_EXTERN_C
 
 /* Native character to iso-8859-1 */
 #define NATIVE_TO_ASCII(ch)      (ch)
+#define NATIVE8_TO_UNI(ch)        (ch)
 #define ASCII_TO_NATIVE(ch)      (ch)
 /* Transform after encoding */
 #define NATIVE_TO_UTF(ch)        (ch)
@@ -213,7 +214,7 @@ encoded character.
 #define UNICODE_ILLEGAL                        0xFFFF
 
 /* Though our UTF-8 encoding can go beyond this,
- * let's be conservative and do as Unicode 3.2 says. */
+ * let's be conservative and do as Unicode 5.1 says. */
 #define PERL_UNICODE_MAX       0x10FFFF
 
 #define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
diff --git a/utfebcdic.h b/utfebcdic.h

index bb88571..7ed9375 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -29,7 +29,7 @@
  *                         in I8, far beyond the current Unicode standard's
  *                         max, as shown in the comment later in this file.)
  *  3) Use the table published in tr16 to convert each byte from step 2 into
- *     final UTF-EBCDIC.  The table in this file is PL_utf2e, and its invverse
+ *     final UTF-EBCDIC.  The table in this file is PL_utf2e, and its inverse
  *     is PL_e2utf.  They are constructed so that all EBCDIC invariants remain
  *     invariant, but no others do.  For example, the ordinal value of 'A' is
  *     193 in EBCDIC, and also is 193 in UTF-EBCDIC.  Step 1) converts it to
@@ -406,6 +406,7 @@ END_EXTERN_C
 
 /* Native to iso-8859-1 */
 #define NATIVE_TO_ASCII(ch)      PL_e2a[(U8)(ch)]
+#define NATIVE8_TO_UNI(ch)     NATIVE_TO_ASCII(ch)     /* synonym */
 #define ASCII_TO_NATIVE(ch)      PL_a2e[(U8)(ch)]
 /* Transform after encoding */
 #define NATIVE_TO_UTF(ch)        PL_e2utf[(U8)(ch)]
@@ -461,7 +462,7 @@ END_EXTERN_C
 
 #define UNI_IS_INVARIANT(c)            ((c) <  0xA0)
 /* UTF-EBCDIC sematic macros - transform back into UTF-8-Mod and then compare */
-#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
+#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
 #define UTF8_IS_INVARIANT(c)           UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
 #define UTF8_IS_START(c)               (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0)
 #define UTF8_IS_CONTINUATION(c)                ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0)
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
global.sym		patch \| blob \| blame \| history
pod/perlintern.pod		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
sv.c		patch \| blob \| blame \| history
sv.h		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history
utfebcdic.h		patch \| blob \| blame \| history