X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=4a3fe1d0acc71056105d579cf758cd8c2f508eb0;hb=895fe8448c39ec9ce61fb5a2b7f671d3d15dcb46;hp=5a5f56c4222ac26de6dd70a0f7c1ad93b4656c90;hpb=53e06cf030da5eb71c0b61c0690494f3c70e0555;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 5a5f56c..4a3fe1d 100644 --- a/utf8.c +++ b/utf8.c @@ -902,33 +902,33 @@ Perl_is_uni_punct(pTHX_ U32 c) bool Perl_is_uni_xdigit(pTHX_ U32 c) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXLEN*2+1]; uvchr_to_utf8(tmpbuf, (UV)c); return is_utf8_xdigit(tmpbuf); } U32 -Perl_to_uni_upper(pTHX_ U32 c) +Perl_to_uni_upper(pTHX_ U32 c, U8* p, STRLEN *lenp) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXLEN*2+1]; uvchr_to_utf8(tmpbuf, (UV)c); - return to_utf8_upper(tmpbuf); + return to_utf8_upper(tmpbuf, p, lenp); } U32 -Perl_to_uni_title(pTHX_ U32 c) +Perl_to_uni_title(pTHX_ U32 c, U8* p, STRLEN *lenp) { - U8 tmpbuf[UTF8_MAXLEN+1]; + U8 tmpbuf[UTF8_MAXLEN*2+1]; uvchr_to_utf8(tmpbuf, (UV)c); - return to_utf8_title(tmpbuf); + return to_utf8_title(tmpbuf, p, lenp); } U32 -Perl_to_uni_lower(pTHX_ U32 c) +Perl_to_uni_lower(pTHX_ U32 c, U8* p, STRLEN *lenp) { U8 tmpbuf[UTF8_MAXLEN+1]; uvchr_to_utf8(tmpbuf, (UV)c); - return to_utf8_lower(tmpbuf); + return to_utf8_lower(tmpbuf, p, lenp); } /* for now these all assume no locale info available for Unicode > 255 */ @@ -1017,24 +1017,6 @@ Perl_is_uni_xdigit_lc(pTHX_ U32 c) return is_uni_xdigit(c); /* XXX no locale support yet */ } -U32 -Perl_to_uni_upper_lc(pTHX_ U32 c) -{ - return to_uni_upper(c); /* XXX no locale support yet */ -} - -U32 -Perl_to_uni_title_lc(pTHX_ U32 c) -{ - return to_uni_title(c); /* XXX no locale support yet */ -} - -U32 -Perl_to_uni_lower_lc(pTHX_ U32 c) -{ - return to_uni_lower(c); /* XXX no locale support yet */ -} - bool Perl_is_utf8_alnum(pTHX_ U8 *p) { @@ -1199,36 +1181,63 @@ Perl_is_utf8_mark(pTHX_ U8 *p) } UV -Perl_to_utf8_upper(pTHX_ U8 *p) +Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special) { UV uv; - if (!PL_utf8_toupper) - PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_toupper, p, TRUE); - return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); + if (!*swashp) + *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); + uv = swash_fetch(*swashp, p, TRUE); + if (uv) + uv = UNI_TO_NATIVE(uv); + else { + HV *hv; + SV *keysv; + HE *he; + + uv = utf8_to_uvchr(p, 0); + + if ((hv = get_hv(special, FALSE)) && + (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%"UVuf, uv))) && + (he = hv_fetch_ent(hv, keysv, FALSE, 0))) { + SV *val = HeVAL(he); + char *s = SvPV(val, *lenp); + U8 c = *(U8*)s; + if (*lenp > 1 || UNI_IS_INVARIANT(c)) + Copy(s, ustrp, *lenp, U8); + else { + /* something in the 0x80..0xFF range */ + ustrp[0] = UTF8_EIGHT_BIT_HI(c); + ustrp[1] = UTF8_EIGHT_BIT_LO(c); + *lenp = 2; + } + return 0; + } + } + *lenp = UNISKIP(uv); + uvuni_to_utf8(ustrp, uv); + return uv; } UV -Perl_to_utf8_title(pTHX_ U8 *p) +Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) { - UV uv; - - if (!PL_utf8_totitle) - PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_totitle, p, TRUE); - return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, + &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper"); } UV -Perl_to_utf8_lower(pTHX_ U8 *p) +Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) { - UV uv; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, + &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle"); +} - if (!PL_utf8_tolower) - PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0); - uv = swash_fetch(PL_utf8_tolower, p, TRUE); - return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0); +UV +Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) +{ + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, + &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower"); } /* a "swash" is a swatch hash */ @@ -1290,6 +1299,12 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none) return retval; } + +/* This API is wrong for special case conversions since we may need to + * return several Unicode characters for a single Unicode character + * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is + * the lower-level routine, and it is similarly broken for returning + * multiple values. --jhi */ UV Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) {