sigaction test condition tweakage.

[p5sagit/p5-mst-13.2.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 156e63f..7302bb7 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -27,14 +27,14 @@
 /* Unicode support */
 
 /*
-=for apidoc A|U8*|uv_to_utf8|U8 *d|UV uv
+=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
 
 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words, 
+end of the new character. In other words,
 
-    d = uv_to_utf8(d, uv);
+    d = uvuni_to_utf8(d, uv);
 
 is the recommended Unicode-aware way of saying
 
@@ -44,7 +44,7 @@ is the recommended Unicode-aware way of saying
 */
 
 U8 *
-Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 {
     if (uv < 0x80) {
        *d++ = uv;
@@ -119,14 +119,40 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
 }
 
 /*
-=for apidoc A|STRLEN|is_utf8_char|U8 *s
+=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
+
+Adds the UTF8 representation of the Native codepoint C<uv> to the end
+of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+    d = uvchr_to_utf8(d, uv);
+
+is the recommended wide native character-aware way of saying
+
+    *(d++) = uv;
 
-Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
-The actual number of bytes in the UTF-8 character will be returned if it
-is valid, otherwise 0. 
- 
 =cut
 */
+
+U8 *
+Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    if (uv < 0x100)
+       uv = NATIVE_TO_ASCII(uv);
+    return Perl_uvuni_to_utf8(aTHX_ d, uv);
+}
+
+
+/*
+=for apidoc A|STRLEN|is_utf8_char|U8 *s
+
+Tests if some arbitrary number of bytes begins in a valid UTF-8
+character.  Note that an ASCII character is a valid UTF-8 character.
+The actual number of bytes in the UTF-8 character will be returned if
+it is valid, otherwise 0.
+
+=cut */
 STRLEN
 Perl_is_utf8_char(pTHX_ U8 *s)
 {
@@ -168,8 +194,10 @@ Perl_is_utf8_char(pTHX_ U8 *s)
 /*
 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 
-Returns true if first C<len> bytes of the given string form valid a UTF8
-string, false otherwise.
+Returns true if first C<len> bytes of the given string form a valid UTF8
+string, false otherwise.  Note that 'a valid UTF8 string' does not mean
+'a string that contains UTF8' because a valid ASCII string is a valid
+UTF8 string.
 
 =cut
 */
@@ -198,9 +226,10 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 }
 
 /*
-=for apidoc A|U8* s|utf8_to_uv|STRLEN curlen|STRLEN *retlen|U32 flags
+=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 
-Returns the character value of the first character in the string C<s>
+Bottom level UTF-8 decode routine.
+Returns the unicode code point value of the first character in the string C<s>
 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 C<retlen> will be set to the length, in bytes, of that character.
 
@@ -215,18 +244,16 @@ length of the UTF-8 character in bytes, and zero will be returned.
 The C<flags> can also contain various flags to allow deviations from
 the strict UTF-8 encoding (see F<utf8.h>).
 
+Most code should use utf8_to_uvchr() rather than call this directly.
+
 =cut */
 
 UV
-Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 {
     UV uv = *s, ouv;
     STRLEN len = 1;
-#ifdef EBCDIC
-    bool dowarn = 0;
-#else
     bool dowarn = ckWARN_d(WARN_UTF8);
-#endif
     STRLEN expectlen = 0;
     U32 warning = 0;
 
@@ -252,7 +279,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
     if (UTF8_IS_ASCII(uv)) {
        if (retlen)
            *retlen = 1;
-       return *s;
+       return (UV) (*s);
     }
 
     if (UTF8_IS_CONTINUATION(uv) &&
@@ -266,7 +293,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
        warning = UTF8_WARN_NON_CONTINUATION;
        goto malformed;
     }
-    
+
     if ((uv == 0xfe || uv == 0xff) &&
        !(flags & UTF8_ALLOW_FE_FF)) {
        warning = UTF8_WARN_FE_FF;
@@ -283,7 +310,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
        
     if (retlen)
        *retlen = len;
-    
+
     expectlen = len;
 
     if ((curlen < expectlen) &&
@@ -413,12 +440,55 @@ malformed:
 }
 
 /*
-=for apidoc A|U8* s|utf8_to_uv_simple|STRLEN *retlen
+=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
+
+Returns the native character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Allows length and flags to be passed to low level routine.
+
+=cut
+*/
+
+UV
+Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+{
+    UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
+    if (uv < 0x100)
+        return (UV) ASCII_TO_NATIVE(uv);
+    return uv;
+}
+
+/*
+=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
+
+Returns the native character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+If C<s> does not point to a well-formed UTF8 character, zero is
+returned and retlen is set, if possible, to -1.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
+{
+    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+}
+
+/*
+=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
 
-Returns the character value of the first character in the string C<s>
+Returns the Unicode code point of the first character in the string C<s>
 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 length, in bytes, of that character.
 
+This function should only be used when returned UV is considered
+an index into the Unicode semantic tables (e.g. swashes).
+
 If C<s> does not point to a well-formed UTF8 character, zero is
 returned and retlen is set, if possible, to -1.
 
@@ -426,9 +496,10 @@ returned and retlen is set, if possible, to -1.
 */
 
 UV
-Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
+Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
 {
-    return Perl_utf8_to_uv(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+    /* Call the low level routine asking for checks */
+    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 }
 
 /*
@@ -574,7 +645,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
     d = s = save;
     while (s < send) {
         STRLEN ulen;
-        *d++ = (U8)utf8_to_uv_simple(s, &ulen);
+        *d++ = (U8)utf8_to_uvchr(s, &ulen);
         s += ulen;
     }
     *d = '\0';
@@ -583,6 +654,60 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 }
 
 /*
+=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
+
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
+the newly-created string, and updates C<len> to contain the new
+length.  Returns the original string if no conversion occurs, C<len>
+is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
+0 if C<s> is converted or contains all 7bit characters.
+
+=cut */
+
+U8 *
+Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
+{
+    U8 *send;
+    U8 *d;
+    U8 *start = s;
+    I32 count = 0;
+
+    if (!*is_utf8)
+       return start;
+
+    /* ensure valid UTF8 and chars < 256 before converting string */
+    for (send = s + *len; s < send;) {
+       U8 c = *s++;
+        if (!UTF8_IS_ASCII(c)) {
+           if (UTF8_IS_CONTINUATION(c) || s >= send ||
+               !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
+               return start;
+           s++, count++;
+        }
+    }
+
+    *is_utf8 = 0;              
+
+    if (!count)
+       return start;
+
+    Newz(801, d, (*len) - count + 1, U8);
+    s = start; start = d;
+    while (s < send) {
+       U8 c = *s++;
+
+       if (UTF8_IS_ASCII(c))
+           *d++ = c;
+       else
+           *d++ = UTF8_ACCUMULATE(c, *s++);
+    }
+    *d = '\0';
+    *len = d - start;
+    return start;
+}
+
+/*
 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 
 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
@@ -604,12 +729,13 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
     dst = d;
 
     while (s < send) {
-        if (*s < 0x80)
+        if (UTF8_IS_ASCII(*s))
             *d++ = *s++;
         else {
             UV uv = *s++;
-            *d++ = (( uv >>  6)         | 0xc0);
-            *d++ = (( uv        & 0x3f) | 0x80);
+
+            *d++ = UTF8_EIGHT_BIT_HI(uv);
+            *d++ = UTF8_EIGHT_BIT_LO(uv);
         }
     }
     *d = '\0';
@@ -692,7 +818,7 @@ bool
 Perl_is_uni_alnum(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alnum(tmpbuf);
 }
 
@@ -700,7 +826,7 @@ bool
 Perl_is_uni_alnumc(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alnumc(tmpbuf);
 }
 
@@ -708,7 +834,7 @@ bool
 Perl_is_uni_idfirst(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_idfirst(tmpbuf);
 }
 
@@ -716,7 +842,7 @@ bool
 Perl_is_uni_alpha(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alpha(tmpbuf);
 }
 
@@ -724,7 +850,7 @@ bool
 Perl_is_uni_ascii(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_ascii(tmpbuf);
 }
 
@@ -732,7 +858,7 @@ bool
 Perl_is_uni_space(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_space(tmpbuf);
 }
 
@@ -740,7 +866,7 @@ bool
 Perl_is_uni_digit(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_digit(tmpbuf);
 }
 
@@ -748,7 +874,7 @@ bool
 Perl_is_uni_upper(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_upper(tmpbuf);
 }
 
@@ -756,7 +882,7 @@ bool
 Perl_is_uni_lower(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_lower(tmpbuf);
 }
 
@@ -764,7 +890,7 @@ bool
 Perl_is_uni_cntrl(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_cntrl(tmpbuf);
 }
 
@@ -772,7 +898,7 @@ bool
 Perl_is_uni_graph(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_graph(tmpbuf);
 }
 
@@ -780,7 +906,7 @@ bool
 Perl_is_uni_print(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_print(tmpbuf);
 }
 
@@ -788,7 +914,7 @@ bool
 Perl_is_uni_punct(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_punct(tmpbuf);
 }
 
@@ -796,7 +922,7 @@ bool
 Perl_is_uni_xdigit(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return is_utf8_xdigit(tmpbuf);
 }
 
@@ -804,7 +930,7 @@ U32
 Perl_to_uni_upper(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return to_utf8_upper(tmpbuf);
 }
 
@@ -812,7 +938,7 @@ U32
 Perl_to_uni_title(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return to_utf8_title(tmpbuf);
 }
 
@@ -820,7 +946,7 @@ U32
 Perl_to_uni_lower(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uv_to_utf8(tmpbuf, (UV)c);
+    uvuni_to_utf8(tmpbuf, (UV)c);
     return to_utf8_lower(tmpbuf);
 }
 
@@ -1099,7 +1225,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
     if (!PL_utf8_toupper)
        PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_toupper, p);
-    return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
+    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
 }
 
 UV
@@ -1110,7 +1236,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
     if (!PL_utf8_totitle)
        PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_totitle, p);
-    return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
+    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
 }
 
 UV
@@ -1121,7 +1247,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
     if (!PL_utf8_tolower)
        PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_tolower, p);
-    return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
+    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
 }
 
 /* a "swash" is a swatch hash */
@@ -1215,7 +1341,10 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
            PUSHMARK(SP);
            EXTEND(SP,3);
            PUSHs((SV*)sv);
-           PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, UTF8_MAXLEN, 0, 0) & ~(needents - 1))));
+           /* We call utf8_to_uni as we want and index into Unicode tables,
+              not a native character number.
+            */
+           PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
            PUSHs(sv_2mortal(newSViv(needents)));
            PUTBACK;
            if (call_method("SWASHGET", G_SCALAR))