-Dt padsv($var)

[p5sagit/p5-mst-13.2.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 156e63f..f00659a 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -121,12 +121,12 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
 /*
 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 
-Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
-The actual number of bytes in the UTF-8 character will be returned if it
-is valid, otherwise 0. 
+Tests if some arbitrary number of bytes begins in a valid UTF-8
+character.  Note that an ASCII character is a valid UTF-8 character.
+The actual number of bytes in the UTF-8 character will be returned if
+it is valid, otherwise 0.
  
-=cut
-*/
+=cut */
 STRLEN
 Perl_is_utf8_char(pTHX_ U8 *s)
 {
@@ -168,8 +168,10 @@ Perl_is_utf8_char(pTHX_ U8 *s)
 /*
 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 
-Returns true if first C<len> bytes of the given string form valid a UTF8
-string, false otherwise.
+Returns true if first C<len> bytes of the given string form a valid UTF8
+string, false otherwise.  Note that 'a valid UTF8 string' does not mean
+'a string that contains UTF8' because a valid ASCII string is a valid
+UTF8 string.
 
 =cut
 */
@@ -198,7 +200,7 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 }
 
 /*
-=for apidoc A|U8* s|utf8_to_uv|STRLEN curlen|STRLEN *retlen|U32 flags
+=for apidoc A|UV|utf8_to_uv|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 
 Returns the character value of the first character in the string C<s>
 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
@@ -583,6 +585,60 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 }
 
 /*
+=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
+
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
+the newly-created string, and updates C<len> to contain the new
+length.  Returns the original string if no conversion occurs, C<len>
+is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
+0 if C<s> is converted or contains all 7bit characters.
+
+=cut */
+
+U8 *
+Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
+{
+    U8 *send;
+    U8 *d;
+    U8 *start = s;
+    I32 count = 0;
+
+    if (!*is_utf8)
+       return start;
+
+    /* ensure valid UTF8 and chars < 256 before converting string */
+    for (send = s + *len; s < send;) {
+       U8 c = *s++;
+        if (!UTF8_IS_ASCII(c)) {
+           if (UTF8_IS_CONTINUATION(c) || s >= send ||
+               !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
+               return start;
+           s++, count++;
+        }
+    }
+
+    *is_utf8 = 0;              
+
+    if (!count)
+       return start;
+
+    Newz(801, d, (*len) - count + 1, U8);
+    s = start; start = d;
+    while (s < send) {
+       U8 c = *s++;
+
+       if (UTF8_IS_ASCII(c))
+           *d++ = c;
+       else
+           *d++ = UTF8_ACCUMULATE(c, *s++);
+    }
+    *d = '\0';
+    *len = d - start;
+    return start;
+}
+
+/*
 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 
 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
@@ -604,12 +660,13 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
     dst = d;
 
     while (s < send) {
-        if (*s < 0x80)
+        if (UTF8_IS_ASCII(*s))
             *d++ = *s++;
         else {
             UV uv = *s++;
-            *d++ = (( uv >>  6)         | 0xc0);
-            *d++ = (( uv        & 0x3f) | 0x80);
+
+            *d++ = UTF8_EIGHT_BIT_HI(uv);
+            *d++ = UTF8_EIGHT_BIT_LO(uv);
         }
     }
     *d = '\0';