From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Tue, 9 Oct 2001 01:48:17 +0000 (+0000)
Subject: Make the toupper/lower/title API for Unicode not right
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a2a2844f59a5c91f404052ef98a588c171fc29f8;p=p5sagit%2Fp5-mst-13.2.git

Make the toupper/lower/title API for Unicode not right
but at least less wrong: prepare for the mapping being
more than just one-character-to-one-character.

p4raw-id: //depot/perl@12371
---

diff --git a/embed.h b/embed.h
index 29ee843..a3f43d0 100644
--- a/embed.h
+++ b/embed.h
@@ -302,9 +302,6 @@
 #define is_uni_print_lc		Perl_is_uni_print_lc
 #define is_uni_punct_lc		Perl_is_uni_punct_lc
 #define is_uni_xdigit_lc	Perl_is_uni_xdigit_lc
-#define to_uni_upper_lc		Perl_to_uni_upper_lc
-#define to_uni_title_lc		Perl_to_uni_title_lc
-#define to_uni_lower_lc		Perl_to_uni_lower_lc
 #define is_utf8_char		Perl_is_utf8_char
 #define is_utf8_string		Perl_is_utf8_string
 #define is_utf8_alnum		Perl_is_utf8_alnum
@@ -1808,9 +1805,9 @@
 #define is_uni_print(a)		Perl_is_uni_print(aTHX_ a)
 #define is_uni_punct(a)		Perl_is_uni_punct(aTHX_ a)
 #define is_uni_xdigit(a)	Perl_is_uni_xdigit(aTHX_ a)
-#define to_uni_upper(a)		Perl_to_uni_upper(aTHX_ a)
-#define to_uni_title(a)		Perl_to_uni_title(aTHX_ a)
-#define to_uni_lower(a)		Perl_to_uni_lower(aTHX_ a)
+#define to_uni_upper(a,b,c)	Perl_to_uni_upper(aTHX_ a,b,c)
+#define to_uni_title(a,b,c)	Perl_to_uni_title(aTHX_ a,b,c)
+#define to_uni_lower(a,b,c)	Perl_to_uni_lower(aTHX_ a,b,c)
 #define is_uni_alnum_lc(a)	Perl_is_uni_alnum_lc(aTHX_ a)
 #define is_uni_alnumc_lc(a)	Perl_is_uni_alnumc_lc(aTHX_ a)
 #define is_uni_idfirst_lc(a)	Perl_is_uni_idfirst_lc(aTHX_ a)
@@ -1825,9 +1822,6 @@
 #define is_uni_print_lc(a)	Perl_is_uni_print_lc(aTHX_ a)
 #define is_uni_punct_lc(a)	Perl_is_uni_punct_lc(aTHX_ a)
 #define is_uni_xdigit_lc(a)	Perl_is_uni_xdigit_lc(aTHX_ a)
-#define to_uni_upper_lc(a)	Perl_to_uni_upper_lc(aTHX_ a)
-#define to_uni_title_lc(a)	Perl_to_uni_title_lc(aTHX_ a)
-#define to_uni_lower_lc(a)	Perl_to_uni_lower_lc(aTHX_ a)
 #define is_utf8_char(a)		Perl_is_utf8_char(aTHX_ a)
 #define is_utf8_string(a,b)	Perl_is_utf8_string(aTHX_ a,b)
 #define is_utf8_alnum(a)	Perl_is_utf8_alnum(aTHX_ a)
@@ -2246,9 +2240,9 @@
 #define swash_fetch(a,b,c)	Perl_swash_fetch(aTHX_ a,b,c)
 #define taint_env()		Perl_taint_env(aTHX)
 #define taint_proper(a,b)	Perl_taint_proper(aTHX_ a,b)
-#define to_utf8_lower(a)	Perl_to_utf8_lower(aTHX_ a)
-#define to_utf8_upper(a)	Perl_to_utf8_upper(aTHX_ a)
-#define to_utf8_title(a)	Perl_to_utf8_title(aTHX_ a)
+#define to_utf8_lower(a,b,c)	Perl_to_utf8_lower(aTHX_ a,b,c)
+#define to_utf8_upper(a,b,c)	Perl_to_utf8_upper(aTHX_ a,b,c)
+#define to_utf8_title(a,b,c)	Perl_to_utf8_title(aTHX_ a,b,c)
 #if defined(UNLINK_ALL_VERSIONS)
 #define unlnk(a)		Perl_unlnk(aTHX_ a)
 #endif
diff --git a/embed.pl b/embed.pl
index 73e72d2..cec8d7e 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1353,9 +1353,9 @@ Ap	|bool	|is_uni_lower	|U32 c
 Ap	|bool	|is_uni_print	|U32 c
 Ap	|bool	|is_uni_punct	|U32 c
 Ap	|bool	|is_uni_xdigit	|U32 c
-Ap	|U32	|to_uni_upper	|U32 c
-Ap	|U32	|to_uni_title	|U32 c
-Ap	|U32	|to_uni_lower	|U32 c
+Ap	|U32	|to_uni_upper	|U32 c|U8 *p|STRLEN *lenp
+Ap	|U32	|to_uni_title	|U32 c|U8 *p|STRLEN *lenp
+Ap	|U32	|to_uni_lower	|U32 c|U8 *p|STRLEN *lenp
 Ap	|bool	|is_uni_alnum_lc|U32 c
 Ap	|bool	|is_uni_alnumc_lc|U32 c
 Ap	|bool	|is_uni_idfirst_lc|U32 c
@@ -1370,9 +1370,6 @@ Ap	|bool	|is_uni_lower_lc|U32 c
 Ap	|bool	|is_uni_print_lc|U32 c
 Ap	|bool	|is_uni_punct_lc|U32 c
 Ap	|bool	|is_uni_xdigit_lc|U32 c
-Ap	|U32	|to_uni_upper_lc|U32 c
-Ap	|U32	|to_uni_title_lc|U32 c
-Ap	|U32	|to_uni_lower_lc|U32 c
 Apd	|STRLEN	|is_utf8_char	|U8 *p
 Apd	|bool	|is_utf8_string	|U8 *s|STRLEN len
 Ap	|bool	|is_utf8_alnum	|U8 *p
@@ -1817,9 +1814,9 @@ Ap	|SV*	|swash_init	|char* pkg|char* name|SV* listsv \
 Ap	|UV	|swash_fetch	|SV *sv|U8 *ptr|bool do_utf8
 Ap	|void	|taint_env
 Ap	|void	|taint_proper	|const char* f|const char* s
-Ap	|UV	|to_utf8_lower	|U8 *p
-Ap	|UV	|to_utf8_upper	|U8 *p
-Ap	|UV	|to_utf8_title	|U8 *p
+Ap	|UV	|to_utf8_lower	|U8 *p|U8* ustrp|STRLEN *lenp
+Ap	|UV	|to_utf8_upper	|U8 *p|U8* ustrp|STRLEN *lenp
+Ap	|UV	|to_utf8_title	|U8 *p|U8* ustrp|STRLEN *lenp
 #if defined(UNLINK_ALL_VERSIONS)
 Ap	|I32	|unlnk		|char* f
 #endif
diff --git a/global.sym b/global.sym
index 28d86a5..b5c912b 100644
--- a/global.sym
+++ b/global.sym
@@ -186,9 +186,6 @@ Perl_is_uni_lower_lc
 Perl_is_uni_print_lc
 Perl_is_uni_punct_lc
 Perl_is_uni_xdigit_lc
-Perl_to_uni_upper_lc
-Perl_to_uni_title_lc
-Perl_to_uni_lower_lc
 Perl_is_utf8_char
 Perl_is_utf8_string
 Perl_is_utf8_alnum
diff --git a/handy.h b/handy.h
index d912735..35373f4 100644
--- a/handy.h
+++ b/handy.h
@@ -425,9 +425,9 @@ Converts the specified character to lowercase.
 #define isPRINT_uni(c)		is_uni_print(c)
 #define isPUNCT_uni(c)		is_uni_punct(c)
 #define isXDIGIT_uni(c)		is_uni_xdigit(c)
-#define toUPPER_uni(c)		to_uni_upper(c)
-#define toTITLE_uni(c)		to_uni_title(c)
-#define toLOWER_uni(c)		to_uni_lower(c)
+#define toUPPER_uni(c,s,l)	to_uni_upper(c,s,l)
+#define toTITLE_uni(c,s,l)	to_uni_title(c,s,l)
+#define toLOWER_uni(c,s,l)	to_uni_lower(c,s,l)
 
 #define isPSXSPC_uni(c)		(isSPACE_uni(c) ||(c) == '\f')
 #define isBLANK_uni(c)		isBLANK(c) /* could be wrong */
@@ -444,9 +444,6 @@ Converts the specified character to lowercase.
 #define isGRAPH_LC_uvchr(c)	(c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c))
 #define isPRINT_LC_uvchr(c)	(c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c))
 #define isPUNCT_LC_uvchr(c)	(c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c))
-#define toUPPER_LC_uvchr(c)	(c < 256 ? toUPPER_LC(c) : to_uni_upper_lc(c))
-#define toTITLE_LC_uvchr(c)	(c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c))
-#define toLOWER_LC_uvchr(c)	(c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c))
 
 #define isPSXSPC_LC_uni(c)	(isSPACE_LC_uni(c) ||(c) == '\f')
 #define isBLANK_LC_uni(c)	isBLANK(c) /* could be wrong */
@@ -465,9 +462,9 @@ Converts the specified character to lowercase.
 #define isPRINT_utf8(p)		is_utf8_print(p)
 #define isPUNCT_utf8(p)		is_utf8_punct(p)
 #define isXDIGIT_utf8(p)	is_utf8_xdigit(p)
-#define toUPPER_utf8(p)		to_utf8_upper(p)
-#define toTITLE_utf8(p)		to_utf8_title(p)
-#define toLOWER_utf8(p)		to_utf8_lower(p)
+#define toUPPER_utf8(p,s,l)	to_utf8_upper(p,s,l)
+#define toTITLE_utf8(p,s,l)	to_utf8_title(p,s,l)
+#define toLOWER_utf8(p,s,l)	to_utf8_lower(p,s,l)
 
 #define isPSXSPC_utf8(c)	(isSPACE_utf8(c) ||(c) == '\f')
 #define isBLANK_utf8(c)		isBLANK(c) /* could be wrong */
@@ -484,9 +481,6 @@ Converts the specified character to lowercase.
 #define isGRAPH_LC_utf8(p)	isGRAPH_LC_uvchr(utf8_to_uvchr(p,  0))
 #define isPRINT_LC_utf8(p)	isPRINT_LC_uvchr(utf8_to_uvchr(p,  0))
 #define isPUNCT_LC_utf8(p)	isPUNCT_LC_uvchr(utf8_to_uvchr(p,  0))
-#define toUPPER_LC_utf8(p)	toUPPER_LC_uvchr(utf8_to_uvchr(p,  0))
-#define toTITLE_LC_utf8(p)	toTITLE_LC_uvchr(utf8_to_uvchr(p,  0))
-#define toLOWER_LC_utf8(p)	toLOWER_LC_uvchr(utf8_to_uvchr(p,  0))
 
 #define isPSXSPC_LC_utf8(c)	(isSPACE_LC_utf8(c) ||(c) == '\f')
 #define isBLANK_LC_utf8(c)	isBLANK(c) /* could be wrong */
diff --git a/pp.c b/pp.c
index eca00c8..134f243 100644
--- a/pp.c
+++ b/pp.c
@@ -3151,19 +3151,12 @@ PP(pp_ucfirst)
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && UTF8_IS_START(*s)) {
 	STRLEN ulen;
-	U8 tmpbuf[UTF8_MAXLEN+1];
+	U8 tmpbuf[UTF8_MAXLEN*2+1];
 	U8 *tend;
 	UV uv;
 
-	if (IN_LOCALE_RUNTIME) {
-	    TAINT;
-	    SvTAINTED_on(sv);
-	    uv = toTITLE_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0));
-	}
-	else {
-	    uv   = toTITLE_utf8(s);
-	    ulen = UNISKIP(uv);
-	}
+	toTITLE_utf8(s, tmpbuf, &ulen); /* XXX --jhi */
+	uv = utf8_to_uvchr(tmpbuf, 0);
 	
 	tend = uvchr_to_utf8(tmpbuf, uv);
 
@@ -3212,19 +3205,12 @@ PP(pp_lcfirst)
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && UTF8_IS_START(*s)) {
 	STRLEN ulen;
-	U8 tmpbuf[UTF8_MAXLEN+1];
+	U8 tmpbuf[UTF8_MAXLEN*2+1];
 	U8 *tend;
 	UV uv;
 
-	if (IN_LOCALE_RUNTIME) {
-	    TAINT;
-	    SvTAINTED_on(sv);
-	    uv = toLOWER_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0));
-	}
-	else {
-	    uv   = toLOWER_utf8(s);
-	    ulen = UNISKIP(uv);
-	}
+	toLOWER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */
+	uv = utf8_to_uvchr(tmpbuf, 0);
 	
 	tend = uvchr_to_utf8(tmpbuf, uv);
 
@@ -3276,6 +3262,7 @@ PP(pp_uc)
 	STRLEN ulen;
 	register U8 *d;
 	U8 *send;
+	U8 tmpbuf[UTF8_MAXLEN*2+1];
 
 	s = (U8*)SvPV(sv,len);
 	if (!len) {
@@ -3289,19 +3276,11 @@ PP(pp_uc)
 	    (void)SvPOK_only(TARG);
 	    d = (U8*)SvPVX(TARG);
 	    send = s + len;
-	    if (IN_LOCALE_RUNTIME) {
-		TAINT;
-		SvTAINTED_on(TARG);
-		while (s < send) {
-		    d = uvchr_to_utf8(d, toUPPER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0)));
-		    s += ulen;
-		}
-	    }
-	    else {
-		while (s < send) {
-		    d = uvchr_to_utf8(d, toUPPER_utf8( s ));
-		    s += UTF8SKIP(s);
-		}
+	    while (s < send) {
+	        toUPPER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */
+		Copy(tmpbuf, d, ulen, U8);
+		d += ulen;
+		s += UTF8SKIP(s);
 	    }
 	    *d = '\0';
 	    SvUTF8_on(TARG);
@@ -3350,6 +3329,7 @@ PP(pp_lc)
 	STRLEN ulen;
 	register U8 *d;
 	U8 *send;
+	U8 tmpbuf[UTF8_MAXLEN*2+1];
 
 	s = (U8*)SvPV(sv,len);
 	if (!len) {
@@ -3363,19 +3343,11 @@ PP(pp_lc)
 	    (void)SvPOK_only(TARG);
 	    d = (U8*)SvPVX(TARG);
 	    send = s + len;
-	    if (IN_LOCALE_RUNTIME) {
-		TAINT;
-		SvTAINTED_on(TARG);
-		while (s < send) {
-		    d = uvchr_to_utf8(d, toLOWER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0)));
-		    s += ulen;
-		}
-	    }
-	    else {
-		while (s < send) {
-		    d = uvchr_to_utf8(d, toLOWER_utf8(s));
-		    s += UTF8SKIP(s);
-		}
+	    while (s < send) {
+	        toLOWER_utf8(s, tmpbuf, &ulen); /* XXX --jhi */
+		Copy(tmpbuf, d, ulen, U8);
+		d += ulen;
+		s += UTF8SKIP(s);
 	    }
 	    *d = '\0';
 	    SvUTF8_on(TARG);
diff --git a/proto.h b/proto.h
index 44e0a03..2e2427a 100644
--- a/proto.h
+++ b/proto.h
@@ -339,9 +339,9 @@ PERL_CALLCONV bool	Perl_is_uni_lower(pTHX_ U32 c);
 PERL_CALLCONV bool	Perl_is_uni_print(pTHX_ U32 c);
 PERL_CALLCONV bool	Perl_is_uni_punct(pTHX_ U32 c);
 PERL_CALLCONV bool	Perl_is_uni_xdigit(pTHX_ U32 c);
-PERL_CALLCONV U32	Perl_to_uni_upper(pTHX_ U32 c);
-PERL_CALLCONV U32	Perl_to_uni_title(pTHX_ U32 c);
-PERL_CALLCONV U32	Perl_to_uni_lower(pTHX_ U32 c);
+PERL_CALLCONV U32	Perl_to_uni_upper(pTHX_ U32 c, U8 *p, STRLEN *lenp);
+PERL_CALLCONV U32	Perl_to_uni_title(pTHX_ U32 c, U8 *p, STRLEN *lenp);
+PERL_CALLCONV U32	Perl_to_uni_lower(pTHX_ U32 c, U8 *p, STRLEN *lenp);
 PERL_CALLCONV bool	Perl_is_uni_alnum_lc(pTHX_ U32 c);
 PERL_CALLCONV bool	Perl_is_uni_alnumc_lc(pTHX_ U32 c);
 PERL_CALLCONV bool	Perl_is_uni_idfirst_lc(pTHX_ U32 c);
@@ -356,9 +356,6 @@ PERL_CALLCONV bool	Perl_is_uni_lower_lc(pTHX_ U32 c);
 PERL_CALLCONV bool	Perl_is_uni_print_lc(pTHX_ U32 c);
 PERL_CALLCONV bool	Perl_is_uni_punct_lc(pTHX_ U32 c);
 PERL_CALLCONV bool	Perl_is_uni_xdigit_lc(pTHX_ U32 c);
-PERL_CALLCONV U32	Perl_to_uni_upper_lc(pTHX_ U32 c);
-PERL_CALLCONV U32	Perl_to_uni_title_lc(pTHX_ U32 c);
-PERL_CALLCONV U32	Perl_to_uni_lower_lc(pTHX_ U32 c);
 PERL_CALLCONV STRLEN	Perl_is_utf8_char(pTHX_ U8 *p);
 PERL_CALLCONV bool	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len);
 PERL_CALLCONV bool	Perl_is_utf8_alnum(pTHX_ U8 *p);
@@ -800,9 +797,9 @@ PERL_CALLCONV SV*	Perl_swash_init(pTHX_ char* pkg, char* name, SV* listsv, I32 m
 PERL_CALLCONV UV	Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8);
 PERL_CALLCONV void	Perl_taint_env(pTHX);
 PERL_CALLCONV void	Perl_taint_proper(pTHX_ const char* f, const char* s);
-PERL_CALLCONV UV	Perl_to_utf8_lower(pTHX_ U8 *p);
-PERL_CALLCONV UV	Perl_to_utf8_upper(pTHX_ U8 *p);
-PERL_CALLCONV UV	Perl_to_utf8_title(pTHX_ U8 *p);
+PERL_CALLCONV UV	Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
+PERL_CALLCONV UV	Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
+PERL_CALLCONV UV	Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp);
 #if defined(UNLINK_ALL_VERSIONS)
 PERL_CALLCONV I32	Perl_unlnk(pTHX_ char* f);
 #endif
diff --git a/regcomp.c b/regcomp.c
index 69fe024..a223533 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2962,6 +2962,8 @@ tryagain:
 	    register char *p;
 	    char *oldp, *s;
 	    STRLEN numlen;
+	    STRLEN ulen;
+	    U8 tmpbuf[UTF8_MAXLEN*2+1];
 
             parse_start = RExC_parse - 1;
 
@@ -3104,10 +3106,8 @@ tryagain:
 		if (RExC_flags16 & PMf_EXTENDED)
 		    p = regwhite(p, RExC_end);
 		if (UTF && FOLD) {
-		    if (LOC)
-			ender = toLOWER_LC_uvchr(ender);
-		    else
-			ender = toLOWER_uni(ender);
+		    toLOWER_uni(ender, tmpbuf, &ulen);
+		    ender = utf8_to_uvchr(tmpbuf, 0);
 		}
 		if (ISMULT2(p)) { /* Back off on ?+*. */
 		    if (len)
diff --git a/regexec.c b/regexec.c
index b691162..58a7808 100644
--- a/regexec.c
+++ b/regexec.c
@@ -917,8 +917,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    m = STRING(c);
 	    ln = STR_LEN(c);
 	    if (UTF) {
-		c1 = to_utf8_lower((U8*)m);
-		c2 = to_utf8_upper((U8*)m);
+	        STRLEN ulen1, ulen2;
+		U8 tmpbuf1[UTF8_MAXLEN*2+1];
+		U8 tmpbuf2[UTF8_MAXLEN*2+1];
+
+		to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
+		to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
+
+		c1 = utf8_to_uvuni(tmpbuf1, 0);
+		c2 = utf8_to_uvuni(tmpbuf2, 0);
 	    }
 	    else {
 		c1 = *(U8*)m;
@@ -2199,17 +2206,17 @@ S_regmatch(pTHX_ regnode *prog)
 	    if (do_utf8) {
 		char *l = locinput;
 		char *e;
+		STRLEN ulen;
+		U8 tmpbuf[UTF8_MAXLEN*2+1];
 		e = s + ln;
-		c1 = OP(scan) == EXACTF;
 		while (s < e) {
-		    if (l >= PL_regeol) {
+		    if (l >= PL_regeol)
 			sayNO;
-		    }
-		    if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
-			(c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
-			    sayNO;
-		    s += UTF ? UTF8SKIP(s) : 1;
-		    l += UTF8SKIP(l);
+		    toLOWER_utf8((U8*)l, tmpbuf, &ulen);
+		    if (memNE(s, tmpbuf, ulen))
+		        sayNO;
+		    s += UTF8SKIP(s);
+		    l += ulen;
 		}
 		locinput = l;
 		nextchr = UCHARAT(locinput);
@@ -2472,23 +2479,18 @@ S_regmatch(pTHX_ regnode *prog)
 		 * have to map both upper and title case to lower case.
 		 */
 		if (OP(scan) == REFF) {
+		    STRLEN ulen1, ulen2;
+		    U8 tmpbuf1[UTF8_MAXLEN*2+1];
+		    U8 tmpbuf2[UTF8_MAXLEN*2+1];
 		    while (s < e) {
 			if (l >= PL_regeol)
 			    sayNO;
-			if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
-			    sayNO;
-			s += UTF8SKIP(s);
-			l += UTF8SKIP(l);
-		    }
-		}
-		else {
-		    while (s < e) {
-			if (l >= PL_regeol)
-			    sayNO;
-			if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
+			toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
+			toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
+			if (ulen1 != ulen2 || memNE(tmpbuf1, tmpbuf2, ulen1))
 			    sayNO;
-			s += UTF8SKIP(s);
-			l += UTF8SKIP(l);
+			s += ulen1;
+			l += ulen2;
 		    }
 		}
 		locinput = l;
@@ -3237,8 +3239,15 @@ S_regmatch(pTHX_ regnode *prog)
 		    }
 		    else { /* UTF */
 			if (OP(text_node) == EXACTF) {
-			    c1 = to_utf8_lower(s);
-			    c2 = to_utf8_upper(s);
+			     STRLEN ulen1, ulen2;
+			     U8 tmpbuf1[UTF8_MAXLEN*2+1];
+			     U8 tmpbuf2[UTF8_MAXLEN*2+1];
+
+			     to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
+			     to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
+
+			     c1 = utf8_to_uvuni(tmpbuf1, 0);
+			     c2 = utf8_to_uvuni(tmpbuf2, 0);
 			}
 			else {
 			    c2 = c1 = utf8_to_uvchr(s, NULL);
@@ -3975,14 +3984,10 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
 		if (swash_fetch(sw, p, do_utf8))
 		    match = TRUE;
 		else if (flags & ANYOF_FOLD) {
-		    U8 tmpbuf[UTF8_MAXLEN+1];
-		
-		    if (flags & ANYOF_LOCALE) {
-			PL_reg_flags |= RF_tainted;
-			uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
-		    }
-		    else
-		        uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
+		    STRLEN ulen;
+		    U8 tmpbuf[UTF8_MAXLEN*2+1];
+
+		    toLOWER_utf8(p, tmpbuf, &ulen);
 		    if (swash_fetch(sw, tmpbuf, do_utf8))
 			match = TRUE;
 		}
diff --git a/utf8.c b/utf8.c
index 5a5f56c..e1a7e63 100644
--- a/utf8.c
+++ b/utf8.c
@@ -902,33 +902,33 @@ Perl_is_uni_punct(pTHX_ U32 c)
 bool
 Perl_is_uni_xdigit(pTHX_ U32 c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXLEN*2+1];
     uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_xdigit(tmpbuf);
 }
 
 U32
-Perl_to_uni_upper(pTHX_ U32 c)
+Perl_to_uni_upper(pTHX_ U32 c, U8* p, STRLEN *lenp)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXLEN*2+1];
     uvchr_to_utf8(tmpbuf, (UV)c);
-    return to_utf8_upper(tmpbuf);
+    return to_utf8_upper(tmpbuf, p, lenp);
 }
 
 U32
-Perl_to_uni_title(pTHX_ U32 c)
+Perl_to_uni_title(pTHX_ U32 c, U8* p, STRLEN *lenp)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXLEN*2+1];
     uvchr_to_utf8(tmpbuf, (UV)c);
-    return to_utf8_title(tmpbuf);
+    return to_utf8_title(tmpbuf, p, lenp);
 }
 
 U32
-Perl_to_uni_lower(pTHX_ U32 c)
+Perl_to_uni_lower(pTHX_ U32 c, U8* p, STRLEN *lenp)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
     uvchr_to_utf8(tmpbuf, (UV)c);
-    return to_utf8_lower(tmpbuf);
+    return to_utf8_lower(tmpbuf, p, lenp);
 }
 
 /* for now these all assume no locale info available for Unicode > 255 */
@@ -1017,24 +1017,6 @@ Perl_is_uni_xdigit_lc(pTHX_ U32 c)
     return is_uni_xdigit(c);	/* XXX no locale support yet */
 }
 
-U32
-Perl_to_uni_upper_lc(pTHX_ U32 c)
-{
-    return to_uni_upper(c);	/* XXX no locale support yet */
-}
-
-U32
-Perl_to_uni_title_lc(pTHX_ U32 c)
-{
-    return to_uni_title(c);	/* XXX no locale support yet */
-}
-
-U32
-Perl_to_uni_lower_lc(pTHX_ U32 c)
-{
-    return to_uni_lower(c);	/* XXX no locale support yet */
-}
-
 bool
 Perl_is_utf8_alnum(pTHX_ U8 *p)
 {
@@ -1199,36 +1181,45 @@ Perl_is_utf8_mark(pTHX_ U8 *p)
 }
 
 UV
-Perl_to_utf8_upper(pTHX_ U8 *p)
+Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     UV uv;
 
     if (!PL_utf8_toupper)
 	PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_toupper, p, TRUE);
-    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
+    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
+    *lenp = UNISKIP(uv);
+    uvuni_to_utf8(ustrp, uv);
+    return uv;
 }
 
 UV
-Perl_to_utf8_title(pTHX_ U8 *p)
+Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     UV uv;
 
     if (!PL_utf8_totitle)
 	PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_totitle, p, TRUE);
-    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
+    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
+    *lenp = UNISKIP(uv);
+    uvuni_to_utf8(ustrp, uv);
+    return uv;
 }
 
 UV
-Perl_to_utf8_lower(pTHX_ U8 *p)
+Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 {
     UV uv;
 
     if (!PL_utf8_tolower)
 	PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_tolower, p, TRUE);
-    return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
+    uv = uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p, 0);
+    *lenp = UNISKIP(uv);
+    uvuni_to_utf8(ustrp, uv);
+    return uv;
 }
 
 /* a "swash" is a swatch hash */