Make the toupper/lower/title API for Unicode not right
Jarkko Hietaniemi [Tue, 9 Oct 2001 01:48:17 +0000 (01:48 +0000)]
but at least less wrong: prepare for the mapping being
more than just one-character-to-one-character.

p4raw-id: //depot/perl@12371

embed.h
embed.pl
global.sym
handy.h
pp.c
proto.h
regcomp.c
regexec.c
utf8.c

diff --git a/embed.h b/embed.h
index 29ee843..a3f43d0 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define is_uni_print_lc                Perl_is_uni_print_lc
 #define is_uni_punct_lc                Perl_is_uni_punct_lc
 #define is_uni_xdigit_lc       Perl_is_uni_xdigit_lc
-#define to_uni_upper_lc                Perl_to_uni_upper_lc
-#define to_uni_title_lc                Perl_to_uni_title_lc
-#define to_uni_lower_lc                Perl_to_uni_lower_lc
 #define is_utf8_char           Perl_is_utf8_char
 #define is_utf8_string         Perl_is_utf8_string
 #define is_utf8_alnum          Perl_is_utf8_alnum
 #define is_uni_print(a)                Perl_is_uni_print(aTHX_ a)
 #define is_uni_punct(a)                Perl_is_uni_punct(aTHX_ a)
 #define is_uni_xdigit(a)       Perl_is_uni_xdigit(aTHX_ a)
-#define to_uni_upper(a)                Perl_to_uni_upper(aTHX_ a)
-#define to_uni_title(a)                Perl_to_uni_title(aTHX_ a)
-#define to_uni_lower(a)                Perl_to_uni_lower(aTHX_ a)
+#define to_uni_upper(a,b,c)    Perl_to_uni_upper(aTHX_ a,b,c)
+#define to_uni_title(a,b,c)    Perl_to_uni_title(aTHX_ a,b,c)
+#define to_uni_lower(a,b,c)    Perl_to_uni_lower(aTHX_ a,b,c)
 #define is_uni_alnum_lc(a)     Perl_is_uni_alnum_lc(aTHX_ a)
 #define is_uni_alnumc_lc(a)    Perl_is_uni_alnumc_lc(aTHX_ a)
 #define is_uni_idfirst_lc(a)   Perl_is_uni_idfirst_lc(aTHX_ a)
 #define is_uni_print_lc(a)     Perl_is_uni_print_lc(aTHX_ a)
 #define is_uni_punct_lc(a)     Perl_is_uni_punct_lc(aTHX_ a)
 #define is_uni_xdigit_lc(a)    Perl_is_uni_xdigit_lc(aTHX_ a)
-#define to_uni_upper_lc(a)     Perl_to_uni_upper_lc(aTHX_ a)
-#define to_uni_title_lc(a)     Perl_to_uni_title_lc(aTHX_ a)
-#define to_uni_lower_lc(a)     Perl_to_uni_lower_lc(aTHX_ a)
 #define is_utf8_char(a)                Perl_is_utf8_char(aTHX_ a)
 #define is_utf8_string(a,b)    Perl_is_utf8_string(aTHX_ a,b)
 #define is_utf8_alnum(a)       Perl_is_utf8_alnum(aTHX_ a)
 #define swash_fetch(a,b,c)     Perl_swash_fetch(aTHX_ a,b,c)
 #define taint_env()            Perl_taint_env(aTHX)
 #define taint_proper(a,b)      Perl_taint_proper(aTHX_ a,b)
-#define to_utf8_lower(a)       Perl_to_utf8_lower(aTHX_ a)
-#define to_utf8_upper(a)       Perl_to_utf8_upper(aTHX_ a)
-#define to_utf8_title(a)       Perl_to_utf8_title(aTHX_ a)
+#define to_utf8_lower(a,b,c)   Perl_to_utf8_lower(aTHX_ a,b,c)
+#define to_utf8_upper(a,b,c)   Perl_to_utf8_upper(aTHX_ a,b,c)
+#define to_utf8_title(a,b,c)   Perl_to_utf8_title(aTHX_ a,b,c)
 #if defined(UNLINK_ALL_VERSIONS)
 #define unlnk(a)               Perl_unlnk(aTHX_ a)
 #endif
diff --git a/embed.pl b/embed.pl
index 73e72d2..cec8d7e 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1353,9 +1353,9 @@ Ap        |bool   |is_uni_lower   |U32 c
 Ap     |bool   |is_uni_print   |U32 c
 Ap     |bool   |is_uni_punct   |U32 c
 Ap     |bool   |is_uni_xdigit  |U32 c
-Ap     |U32    |to_uni_upper   |U32 c
-Ap     |U32    |to_uni_title   |U32 c
-Ap     |U32    |to_uni_lower   |U32 c
+Ap     |U32    |to_uni_upper   |U32 c|U8 *p|STRLEN *lenp
+Ap     |U32    |to_uni_title   |U32 c|U8 *p|STRLEN *lenp
+Ap     |U32    |to_uni_lower   |U32 c|U8 *p|STRLEN *lenp
 Ap     |bool   |is_uni_alnum_lc|U32 c
 Ap     |bool   |is_uni_alnumc_lc|U32 c
 Ap     |bool   |is_uni_idfirst_lc|U32 c
@@ -1370,9 +1370,6 @@ Ap        |bool   |is_uni_lower_lc|U32 c
 Ap     |bool   |is_uni_print_lc|U32 c
 Ap     |bool   |is_uni_punct_lc|U32 c
 Ap     |bool   |is_uni_xdigit_lc|U32 c
-Ap     |U32    |to_uni_upper_lc|U32 c
-Ap     |U32    |to_uni_title_lc|U32 c
-Ap     |U32    |to_uni_lower_lc|U32 c
 Apd    |STRLEN |is_utf8_char   |U8 *p
 Apd    |bool   |is_utf8_string |U8 *s|STRLEN len
 Ap     |bool   |is_utf8_alnum  |U8 *p
@@ -1817,9 +1814,9 @@ Ap        |SV*    |swash_init     |char* pkg|char* name|SV* listsv \
 Ap     |UV     |swash_fetch    |SV *sv|U8 *ptr|bool do_utf8
 Ap     |void   |taint_env
 Ap     |void   |taint_proper   |const char* f|const char* s
-Ap     |UV     |to_utf8_lower  |U8 *p
-Ap     |UV     |to_utf8_upper  |U8 *p
-Ap     |UV     |to_utf8_title  |U8 *p
+Ap     |UV     |to_utf8_lower  |U8 *p|U8* ustrp|STRLEN *lenp
+Ap     |UV     |to_utf8_upper  |U8 *p|U8* ustrp|STRLEN *lenp
+Ap     |UV     |to_utf8_title  |U8 *p|U8* ustrp|STRLEN *lenp
 #if defined(UNLINK_ALL_VERSIONS)
 Ap     |I32    |unlnk          |char* f
 #endif
diff --git a/global.sym b/global.sym
index 28d86a5..b5c912b 100644 (file)
--- a/global.sym
+++ b/global.sym
@@ -186,9 +186,6 @@ Perl_is_uni_lower_lc
 Perl_is_uni_print_lc
 Perl_is_uni_punct_lc
 Perl_is_uni_xdigit_lc
-Perl_to_uni_upper_lc
-Perl_to_uni_title_lc
-Perl_to_uni_lower_lc
 Perl_is_utf8_char
 Perl_is_utf8_string
 Perl_is_utf8_alnum
diff --git a/handy.h b/handy.h
index d912735..35373f4 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -425,9 +425,9 @@ Converts the specified character to lowercase.
 #define isPRINT_uni(c)         is_uni_print(c)
 #define isPUNCT_uni(c)         is_uni_punct(c)
 #define isXDIGIT_uni(c)                is_uni_xdigit(c)
-#define toUPPER_uni(c)         to_uni_upper(c)
-#define toTITLE_uni(c)         to_uni_title(c)
-#define toLOWER_uni(c)         to_uni_lower(c)
+#define toUPPER_uni(c,s,l)     to_uni_upper(c,s,l)
+#define toTITLE_uni(c,s,l)     to_uni_title(c,s,l)
+#define toLOWER_uni(c,s,l)     to_uni_lower(c,s,l)
 
 #define isPSXSPC_uni(c)                (isSPACE_uni(c) ||(c) == '\f')
 #define isBLANK_uni(c)         isBLANK(c) /* could be wrong */
@@ -444,9 +444,6 @@ Converts the specified character to lowercase.
 #define isGRAPH_LC_uvchr(c)    (c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c))
 #define isPRINT_LC_uvchr(c)    (c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c))
 #define isPUNCT_LC_uvchr(c)    (c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c))
-#define toUPPER_LC_uvchr(c)    (c < 256 ? toUPPER_LC(c) : to_uni_upper_lc(c))
-#define toTITLE_LC_uvchr(c)    (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c))
-#define toLOWER_LC_uvchr(c)    (c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c))
 
 #define isPSXSPC_LC_uni(c)     (isSPACE_LC_uni(c) ||(c) == '\f')
 #define isBLANK_LC_uni(c)      isBLANK(c) /* could be wrong */
@@ -465,9 +462,9 @@ Converts the specified character to lowercase.
 #define isPRINT_utf8(p)                is_utf8_print(p)
 #define isPUNCT_utf8(p)                is_utf8_punct(p)
 #define isXDIGIT_utf8(p)       is_utf8_xdigit(p)
-#define toUPPER_utf8(p)                to_utf8_upper(p)
-#define toTITLE_utf8(p)                to_utf8_title(p)
-#define toLOWER_utf8(p)                to_utf8_lower(p)
+#define toUPPER_utf8(p,s,l)    to_utf8_upper(p,s,l)
+#define toTITLE_utf8(p,s,l)    to_utf8_title(p,s,l)
+#define toLOWER_utf8(p,s,l)    to_utf8_lower(p,s,l)
 
 #define isPSXSPC_utf8(c)       (isSPACE_utf8(c) ||(c) == '\f')
 #define isBLANK_utf8(c)                isBLANK(c) /* could be wrong */
@@ -484,9 +481,6 @@ Converts the specified character to lowercase.
 #define isGRAPH_LC_utf8(p)     isGRAPH_LC_uvchr(utf8_to_uvchr(p,  0))
 #define isPRINT_LC_utf8(p)     isPRINT_LC_uvchr(utf8_to_uvchr(p,  0))
 #define isPUNCT_LC_utf8(p)     isPUNCT_LC_uvchr(utf8_to_uvchr(p,  0))
-#define toUPPER_LC_utf8(p)     toUPPER_LC_uvchr(utf8_to_uvchr(p,  0))
-#define toTITLE_LC_utf8(p)     toTITLE_LC_uvchr(utf8_to_uvchr(p,  0))
-#define toLOWER_LC_utf8(p)     toLOWER_LC_uvchr(utf8_to_uvchr(p,  0))
 
 #define isPSXSPC_LC_utf8(c)    (isSPACE_LC_utf8(c) ||(c) == '\f')
 #define isBLANK_LC_utf8(c)     isBLANK(c) /* could be wrong */
diff --git a/pp.c b/pp.c
index eca00c8..134f243 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -3151,19 +3151,12 @@ PP(pp_ucfirst)
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && UTF8_IS_START(*s)) {
        STRLEN ulen;
-       U8 tmpbuf[UTF8_MAXLEN+1];
+       U8 tmpbuf[UTF8_MAXLEN*2+1];
        U8 *tend;
        UV uv;
 
-       if (IN_LOCALE_RUNTIME) {
-           TAINT;
-           SvTAINTED_on(sv);
-           uv = toTITLE_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0));
-       }
-       else {
-           uv   = toTITLE_utf8(s);
-           ulen = UNISKIP(uv);
-       }
+       toTITLE_utf8(s, tmpbuf, &ulen); /* XXX --jhi */
+       uv = utf8_to_uvchr(tmpbuf, 0);
        
        tend = uvchr_to_utf8(tmpbuf, uv);
 
@@ -3212,19 +3205,12 @@ PP(pp_lcfirst)
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && UTF8_IS_START(*s)) {
        STRLEN ulen;
-       U8 tmpbuf[UTF8_MAXLEN+1];
+       U8 tmpbuf[UTF8_MAXLEN*2+1];
        U8 *tend;
        UV uv;
 
-       if (IN_LOCALE_RUNTIME) {
-           TAINT;
-           SvTAINTED_on(sv);
-           uv = toLOWER_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0));
-       }
-       else {
-           uv   = toLOWER_utf8(s);
-           ulen = UNISKIP(uv);
-       }
+       toLOWER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */
+       uv = utf8_to_uvchr(tmpbuf, 0);
        
        tend = uvchr_to_utf8(tmpbuf, uv);
 
@@ -3276,6 +3262,7 @@ PP(pp_uc)
        STRLEN ulen;
        register U8 *d;
        U8 *send;
+       U8 tmpbuf[UTF8_MAXLEN*2+1];
 
        s = (U8*)SvPV(sv,len);
        if (!len) {
@@ -3289,19 +3276,11 @@ PP(pp_uc)
            (void)SvPOK_only(TARG);
            d = (U8*)SvPVX(TARG);
            send = s + len;
-           if (IN_LOCALE_RUNTIME) {
-               TAINT;
-               SvTAINTED_on(TARG);
-               while (s < send) {
-                   d = uvchr_to_utf8(d, toUPPER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0)));
-                   s += ulen;
-               }
-           }
-           else {
-               while (s < send) {
-                   d = uvchr_to_utf8(d, toUPPER_utf8( s ));
-                   s += UTF8SKIP(s);
-               }
+           while (s < send) {
+               toUPPER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */
+               Copy(tmpbuf, d, ulen, U8);
+               d += ulen;
+               s += UTF8SKIP(s);
            }
            *d = '\0';
            SvUTF8_on(TARG);
@@ -3350,6 +3329,7 @@ PP(pp_lc)
        STRLEN ulen;
        register U8 *d;
        U8 *send;
+       U8 tmpbuf[UTF8_MAXLEN*2+1];
 
        s = (U8*)SvPV(sv,len);
        if (!len) {
@@ -3363,19 +3343,11 @@ PP(pp_lc)
            (void)SvPOK_only(TARG);
            d = (U8*)SvPVX(TARG);
            send = s + len;
-           if (IN_LOCALE_RUNTIME) {
-               TAINT;
-               SvTAINTED_on(TARG);
-               while (s < send) {
-                   d = uvchr_to_utf8(d, toLOWER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0)));
-                   s += ulen;
-               }
-           }
-           else {
-               while (s < send) {
-                   d = uvchr_to_utf8(d, toLOWER_utf8(s));
-                   s += UTF8SKIP(s);
-               }
+           while (s < send) {
+               toLOWER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */
+               Copy(tmpbuf, d, ulen, U8);
+               d += ulen;
+               s += UTF8SKIP(s);
            }
            *d = '\0';
            SvUTF8_on(TARG);
diff --git a/proto.h b/proto.h
index 44e0a03..2e2427a 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -339,9 +339,9 @@ PERL_CALLCONV bool  Perl_is_uni_lower(pTHX_ U32 c);
 PERL_CALLCONV bool     Perl_is_uni_print(pTHX_ U32 c);
 PERL_CALLCONV bool     Perl_is_uni_punct(pTHX_ U32 c);
 PERL_CALLCONV bool     Perl_is_uni_xdigit(pTHX_ U32 c);
-PERL_CALLCONV U32      Perl_to_uni_upper(pTHX_ U32 c);
-PERL_CALLCONV U32      Perl_to_uni_title(pTHX_ U32 c);
-PERL_CALLCONV U32      Perl_to_uni_lower(pTHX_ U32 c);
+PERL_CALLCONV U32      Perl_to_uni_upper(pTHX_ U32 c, U8 *p, STRLEN *lenp);
+PERL_CALLCONV U32      Perl_to_uni_title(pTHX_ U32 c, U8 *p, STRLEN *lenp);
+PERL_CALLCONV U32      Perl_to_uni_lower(pTHX_ U32 c, U8 *p, STRLEN *lenp);
 PERL_CALLCONV bool     Perl_is_uni_alnum_lc(pTHX_ U32 c);
 PERL_CALLCONV bool     Perl_is_uni_alnumc_lc(pTHX_ U32 c);
 PERL_CALLCONV bool     Perl_is_uni_idfirst_lc(pTHX_ U32 c);
@@ -356,9 +356,6 @@ PERL_CALLCONV bool  Perl_is_uni_lower_lc(pTHX_ U32 c);
 PERL_CALLCONV bool     Perl_is_uni_print_lc(pTHX_ U32 c);
 PERL_CALLCONV bool     Perl_is_uni_punct_lc(pTHX_ U32 c);
 PERL_CALLCONV bool     Perl_is_uni_xdigit_lc(pTHX_ U32 c);
-PERL_CALLCONV U32      Perl_to_uni_upper_lc(pTHX_ U32 c);
-PERL_CALLCONV U32      Perl_to_uni_title_lc(pTHX_ U32 c);
-PERL_CALLCONV U32      Perl_to_uni_lower_lc(pTHX_ U32 c);
 PERL_CALLCONV STRLEN   Perl_is_utf8_char(pTHX_ U8 *p);
 PERL_CALLCONV bool     Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len);
 PERL_CALLCONV bool     Perl_is_utf8_alnum(pTHX_ U8 *p);
@@ -800,9 +797,9 @@ PERL_CALLCONV SV*   Perl_swash_init(pTHX_ char* pkg, char* name, SV* listsv, I32 m
 PERL_CALLCONV UV       Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8);
 PERL_CALLCONV void     Perl_taint_env(pTHX);
 PERL_CALLCONV void     Perl_taint_proper(pTHX_ const char* f, const char* s);
-PERL_CALLCONV UV       Perl_to_utf8_lower(pTHX_ U8 *p);
-PERL_CALLCONV UV       Perl_to_utf8_upper(pTHX_ U8 *p);
-PERL_CALLCONV UV       Perl_to_utf8_title(pTHX_ U8 *p);
+PERL_CALLCONV UV       Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
+PERL_CALLCONV UV       Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
+PERL_CALLCONV UV       Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
 #if defined(UNLINK_ALL_VERSIONS)
 PERL_CALLCONV I32      Perl_unlnk(pTHX_ char* f);
 #endif
diff --git a/regcomp.c b/regcomp.c
index 69fe024..a223533 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2962,6 +2962,8 @@ tryagain:
            register char *p;
            char *oldp, *s;
            STRLEN numlen;
+           STRLEN ulen;
+           U8 tmpbuf[UTF8_MAXLEN*2+1];
 
             parse_start = RExC_parse - 1;
 
@@ -3104,10 +3106,8 @@ tryagain:
                if (RExC_flags16 & PMf_EXTENDED)
                    p = regwhite(p, RExC_end);
                if (UTF && FOLD) {
-                   if (LOC)
-                       ender = toLOWER_LC_uvchr(ender);
-                   else
-                       ender = toLOWER_uni(ender);
+                   toLOWER_uni(ender, tmpbuf, &ulen);
+                   ender = utf8_to_uvchr(tmpbuf, 0);
                }
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
diff --git a/regexec.c b/regexec.c
index b691162..58a7808 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -917,8 +917,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            m = STRING(c);
            ln = STR_LEN(c);
            if (UTF) {
-               c1 = to_utf8_lower((U8*)m);
-               c2 = to_utf8_upper((U8*)m);
+               STRLEN ulen1, ulen2;
+               U8 tmpbuf1[UTF8_MAXLEN*2+1];
+               U8 tmpbuf2[UTF8_MAXLEN*2+1];
+
+               to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
+               to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
+
+               c1 = utf8_to_uvuni(tmpbuf1, 0);
+               c2 = utf8_to_uvuni(tmpbuf2, 0);
            }
            else {
                c1 = *(U8*)m;
@@ -2199,17 +2206,17 @@ S_regmatch(pTHX_ regnode *prog)
            if (do_utf8) {
                char *l = locinput;
                char *e;
+               STRLEN ulen;
+               U8 tmpbuf[UTF8_MAXLEN*2+1];
                e = s + ln;
-               c1 = OP(scan) == EXACTF;
                while (s < e) {
-                   if (l >= PL_regeol) {
+                   if (l >= PL_regeol)
                        sayNO;
-                   }
-                   if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
-                       (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
-                           sayNO;
-                   s += UTF ? UTF8SKIP(s) : 1;
-                   l += UTF8SKIP(l);
+                   toLOWER_utf8((U8*)l, tmpbuf, &ulen);
+                   if (memNE(s, tmpbuf, ulen))
+                       sayNO;
+                   s += UTF8SKIP(s);
+                   l += ulen;
                }
                locinput = l;
                nextchr = UCHARAT(locinput);
@@ -2472,23 +2479,18 @@ S_regmatch(pTHX_ regnode *prog)
                 * have to map both upper and title case to lower case.
                 */
                if (OP(scan) == REFF) {
+                   STRLEN ulen1, ulen2;
+                   U8 tmpbuf1[UTF8_MAXLEN*2+1];
+                   U8 tmpbuf2[UTF8_MAXLEN*2+1];
                    while (s < e) {
                        if (l >= PL_regeol)
                            sayNO;
-                       if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
-                           sayNO;
-                       s += UTF8SKIP(s);
-                       l += UTF8SKIP(l);
-                   }
-               }
-               else {
-                   while (s < e) {
-                       if (l >= PL_regeol)
-                           sayNO;
-                       if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
+                       toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
+                       toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
+                       if (ulen1 != ulen2 || memNE(tmpbuf1, tmpbuf2, ulen1))
                            sayNO;
-                       s += UTF8SKIP(s);
-                       l += UTF8SKIP(l);
+                       s += ulen1;
+                       l += ulen2;
                    }
                }
                locinput = l;
@@ -3237,8 +3239,15 @@ S_regmatch(pTHX_ regnode *prog)
                    }
                    else { /* UTF */
                        if (OP(text_node) == EXACTF) {
-                           c1 = to_utf8_lower(s);
-                           c2 = to_utf8_upper(s);
+                            STRLEN ulen1, ulen2;
+                            U8 tmpbuf1[UTF8_MAXLEN*2+1];
+                            U8 tmpbuf2[UTF8_MAXLEN*2+1];
+
+                            to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
+                            to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
+
+                            c1 = utf8_to_uvuni(tmpbuf1, 0);
+                            c2 = utf8_to_uvuni(tmpbuf2, 0);
                        }
                        else {
                            c2 = c1 = utf8_to_uvchr(s, NULL);
@@ -3975,14 +3984,10 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
                if (swash_fetch(sw, p, do_utf8))
                    match = TRUE;
                else if (flags & ANYOF_FOLD) {
-                   U8 tmpbuf[UTF8_MAXLEN+1];
-               
-                   if (flags & ANYOF_LOCALE) {
-                       PL_reg_flags |= RF_tainted;
-                       uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
-                   }
-                   else
-                       uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
+                   STRLEN ulen;
+                   U8 tmpbuf[UTF8_MAXLEN*2+1];
+
+                   toLOWER_utf8(p, tmpbuf, &ulen);
                    if (swash_fetch(sw, tmpbuf, do_utf8))
                        match = TRUE;
                }
diff --git a/utf8.c b/utf8.c
index 5a5f56c..e1a7e63 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -902,33 +902,33 @@ Perl_is_uni_punct(pTHX_ U32 c)
 bool
 Perl_is_uni_xdigit(pTHX_ U32 c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXLEN*2+1];
     uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_xdigit(tmpbuf);
 }
 
 U32
-Perl_to_uni_upper(pTHX_ U32 c)
+Perl_to_uni_upper(pTHX_ U32 c, U8* p, STRLEN *lenp)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXLEN*2+1];
     uvchr_to_utf8(tmpbuf, (UV)c);
-    return to_utf8_upper(tmpbuf);
+    return to_utf8_upper(tmpbuf, p, lenp);
 }
 
 U32
-Perl_to_uni_title(pTHX_ U32 c)
+Perl_to_uni_title(pTHX_ U32 c, U8* p, STRLEN *lenp)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXLEN*2+1];
     uvchr_to_utf8(tmpbuf, (UV)c);
-    return to_utf8_title(tmpbuf);
+    return to_utf8_title(tmpbuf, p, lenp);
 }
 
 U32
-Perl_to_uni_lower(pTHX_ U32 c)
+Perl_to_uni_lower(pTHX_ U32 c, U8* p, STRLEN *lenp)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
     uvchr_to_utf8(tmpbuf, (UV)c);
-    return to_utf8_lower(tmpbuf);
+    return to_utf8_lower(tmpbuf, p, lenp);
 }
 
 /* for now these all assume no locale info available for Unicode > 255 */
@@ -1017,24 +1017,6 @@ Perl_is_uni_xdigit_lc(pTHX_ U32 c)
     return is_uni_xdigit(c);   /* XXX no locale support yet */
 }
 
-U32
-Perl_to_uni_upper_lc(pTHX_ U32 c)
-{
-    return to_uni_upper(c);    /* XXX no locale support yet */
-}
-
-U32
-Perl_to_uni_title_lc(pTHX_ U32 c)
-{
-    return to_uni_title(c);    /* XXX no locale support yet */
-}
-
-U32
-Perl_to_uni_lower_lc(pTHX_ U32 c)
-{
-    return to_uni_lower(c);    /* XXX no locale support yet */
-}
-
 bool
 Perl_is_utf8_alnum(pTHX_ U8 *p)
 {
@@ -1199,36 +1181,45 @@ Perl_is_utf8_mark(pTHX_ U8 *p)
 }
 
 UV
-Perl_to_utf8_upper(pTHX_ U8 *p)
+Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     UV uv;
 
     if (!PL_utf8_toupper)
        PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_toupper, p, TRUE);
-    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
+    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
+    *lenp = UNISKIP(uv);
+    uvuni_to_utf8(ustrp, uv);
+    return uv;
 }
 
 UV
-Perl_to_utf8_title(pTHX_ U8 *p)
+Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     UV uv;
 
     if (!PL_utf8_totitle)
        PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_totitle, p, TRUE);
-    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
+    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
+    *lenp = UNISKIP(uv);
+    uvuni_to_utf8(ustrp, uv);
+    return uv;
 }
 
 UV
-Perl_to_utf8_lower(pTHX_ U8 *p)
+Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     UV uv;
 
     if (!PL_utf8_tolower)
        PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_tolower, p, TRUE);
-    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
+    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
+    *lenp = UNISKIP(uv);
+    uvuni_to_utf8(ustrp, uv);
+    return uv;
 }
 
 /* a "swash" is a swatch hash */