From: Gurusamy Sarathy Date: Sat, 19 Feb 2000 07:51:39 +0000 (+0000) Subject: make comparisons promote to utf8 as necessary (from Gisle Aas) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=560a288e13336a11c08649232e4f81decff70a5d;p=p5sagit%2Fp5-mst-13.2.git make comparisons promote to utf8 as necessary (from Gisle Aas) p4raw-id: //depot/perl@5138 --- diff --git a/Todo-5.6 b/Todo-5.6 index 52fbc50..28b146d 100644 --- a/Todo-5.6 +++ b/Todo-5.6 @@ -4,7 +4,7 @@ Bugs Unicode support finish byte <-> utf8 and localencoding <-> utf8 conversions - make "$bytestr$charstr" do the right conversion + make substr($bytestr,0,0, $charstr) do the right conversion add Unicode::Map equivivalent to core add support for I/O disciplines - open(F, " make C mean C (if !exists(&v123)) autoload utf8_heavy.pl's swash routines in swash_init() + check uv_to_utf8() calls for buffer overflow Multi-threading support "use Thread;" under useithreads diff --git a/embed.h b/embed.h index b16eb3d..be6a685 100644 --- a/embed.h +++ b/embed.h @@ -783,6 +783,10 @@ #define sv_pv Perl_sv_pv #define sv_pvutf8 Perl_sv_pvutf8 #define sv_pvbyte Perl_sv_pvbyte +#define sv_utf8_upgrade Perl_sv_utf8_upgrade +#define sv_utf8_downgrade Perl_sv_utf8_downgrade +#define sv_utf8_encode Perl_sv_utf8_encode +#define sv_utf8_decode Perl_sv_utf8_decode #define sv_force_normal Perl_sv_force_normal #define tmps_grow Perl_tmps_grow #define sv_rvweaken Perl_sv_rvweaken @@ -2191,6 +2195,10 @@ #define sv_pv(a) Perl_sv_pv(aTHX_ a) #define sv_pvutf8(a) Perl_sv_pvutf8(aTHX_ a) #define sv_pvbyte(a) Perl_sv_pvbyte(aTHX_ a) +#define sv_utf8_upgrade(a) Perl_sv_utf8_upgrade(aTHX_ a) +#define sv_utf8_downgrade(a,b) Perl_sv_utf8_downgrade(aTHX_ a,b) +#define sv_utf8_encode(a) Perl_sv_utf8_encode(aTHX_ a) +#define sv_utf8_decode(a) Perl_sv_utf8_decode(aTHX_ a) #define sv_force_normal(a) Perl_sv_force_normal(aTHX_ a) #define tmps_grow(a) Perl_tmps_grow(aTHX_ a) #define sv_rvweaken(a) Perl_sv_rvweaken(aTHX_ a) @@ -4299,6 +4307,14 @@ #define sv_pvutf8 Perl_sv_pvutf8 #define Perl_sv_pvbyte CPerlObj::Perl_sv_pvbyte #define sv_pvbyte Perl_sv_pvbyte +#define Perl_sv_utf8_upgrade CPerlObj::Perl_sv_utf8_upgrade +#define sv_utf8_upgrade Perl_sv_utf8_upgrade +#define Perl_sv_utf8_downgrade CPerlObj::Perl_sv_utf8_downgrade +#define sv_utf8_downgrade Perl_sv_utf8_downgrade +#define Perl_sv_utf8_encode CPerlObj::Perl_sv_utf8_encode +#define sv_utf8_encode Perl_sv_utf8_encode +#define Perl_sv_utf8_decode CPerlObj::Perl_sv_utf8_decode +#define sv_utf8_decode Perl_sv_utf8_decode #define Perl_sv_force_normal CPerlObj::Perl_sv_force_normal #define sv_force_normal Perl_sv_force_normal #define Perl_tmps_grow CPerlObj::Perl_tmps_grow diff --git a/embed.pl b/embed.pl index 952e673..3366a24 100755 --- a/embed.pl +++ b/embed.pl @@ -2108,6 +2108,10 @@ Ap |char* |sv_2pvbyte_nolen|SV* sv Ap |char* |sv_pv |SV *sv Ap |char* |sv_pvutf8 |SV *sv Ap |char* |sv_pvbyte |SV *sv +Ap |void |sv_utf8_upgrade|SV *sv +Ap |bool |sv_utf8_downgrade|SV *sv|bool fail_ok +Ap |void |sv_utf8_encode |SV *sv +Ap |bool |sv_utf8_decode |SV *sv Ap |void |sv_force_normal|SV *sv Ap |void |tmps_grow |I32 n Ap |SV* |sv_rvweaken |SV *sv diff --git a/global.sym b/global.sym index 1451d85..fee7614 100644 --- a/global.sym +++ b/global.sym @@ -499,6 +499,10 @@ Perl_sv_2pvbyte_nolen Perl_sv_pv Perl_sv_pvutf8 Perl_sv_pvbyte +Perl_sv_utf8_upgrade +Perl_sv_utf8_downgrade +Perl_sv_utf8_encode +Perl_sv_utf8_decode Perl_sv_force_normal Perl_tmps_grow Perl_sv_rvweaken diff --git a/objXSUB.h b/objXSUB.h index 1243e9e..2897a6a 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -2023,6 +2023,22 @@ #define Perl_sv_pvbyte pPerl->Perl_sv_pvbyte #undef sv_pvbyte #define sv_pvbyte Perl_sv_pvbyte +#undef Perl_sv_utf8_upgrade +#define Perl_sv_utf8_upgrade pPerl->Perl_sv_utf8_upgrade +#undef sv_utf8_upgrade +#define sv_utf8_upgrade Perl_sv_utf8_upgrade +#undef Perl_sv_utf8_downgrade +#define Perl_sv_utf8_downgrade pPerl->Perl_sv_utf8_downgrade +#undef sv_utf8_downgrade +#define sv_utf8_downgrade Perl_sv_utf8_downgrade +#undef Perl_sv_utf8_encode +#define Perl_sv_utf8_encode pPerl->Perl_sv_utf8_encode +#undef sv_utf8_encode +#define sv_utf8_encode Perl_sv_utf8_encode +#undef Perl_sv_utf8_decode +#define Perl_sv_utf8_decode pPerl->Perl_sv_utf8_decode +#undef sv_utf8_decode +#define sv_utf8_decode Perl_sv_utf8_decode #undef Perl_sv_force_normal #define Perl_sv_force_normal pPerl->Perl_sv_force_normal #undef sv_force_normal diff --git a/perlapi.c b/perlapi.c index f897146..f082498 100644 --- a/perlapi.c +++ b/perlapi.c @@ -3659,6 +3659,34 @@ Perl_sv_pvbyte(pTHXo_ SV *sv) return ((CPerlObj*)pPerl)->Perl_sv_pvbyte(sv); } +#undef Perl_sv_utf8_upgrade +void +Perl_sv_utf8_upgrade(pTHXo_ SV *sv) +{ + ((CPerlObj*)pPerl)->Perl_sv_utf8_upgrade(sv); +} + +#undef Perl_sv_utf8_downgrade +bool +Perl_sv_utf8_downgrade(pTHXo_ SV *sv, bool fail_ok) +{ + return ((CPerlObj*)pPerl)->Perl_sv_utf8_downgrade(sv, fail_ok); +} + +#undef Perl_sv_utf8_encode +void +Perl_sv_utf8_encode(pTHXo_ SV *sv) +{ + ((CPerlObj*)pPerl)->Perl_sv_utf8_encode(sv); +} + +#undef Perl_sv_utf8_decode +bool +Perl_sv_utf8_decode(pTHXo_ SV *sv) +{ + return ((CPerlObj*)pPerl)->Perl_sv_utf8_decode(sv); +} + #undef Perl_sv_force_normal void Perl_sv_force_normal(pTHXo_ SV *sv) diff --git a/pp_hot.c b/pp_hot.c index 8dab651..ddb900f 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -184,7 +184,16 @@ PP(pp_concat) } } #endif + if (DO_UTF8(right)) + sv_utf8_upgrade(TARG); sv_catpvn(TARG,s,len); + if (!IN_BYTE) { + if (SvUTF8(right)) + SvUTF8_on(TARG); + } + else if (!SvUTF8(right)) { + SvUTF8_off(TARG); + } } else sv_setpvn(TARG,s,len); /* suppress warning */ diff --git a/proto.h b/proto.h index df2ddb4..31b8f45 100644 --- a/proto.h +++ b/proto.h @@ -885,6 +885,10 @@ PERL_CALLCONV char* Perl_sv_2pvbyte_nolen(pTHX_ SV* sv); PERL_CALLCONV char* Perl_sv_pv(pTHX_ SV *sv); PERL_CALLCONV char* Perl_sv_pvutf8(pTHX_ SV *sv); PERL_CALLCONV char* Perl_sv_pvbyte(pTHX_ SV *sv); +PERL_CALLCONV void Perl_sv_utf8_upgrade(pTHX_ SV *sv); +PERL_CALLCONV bool Perl_sv_utf8_downgrade(pTHX_ SV *sv, bool fail_ok); +PERL_CALLCONV void Perl_sv_utf8_encode(pTHX_ SV *sv); +PERL_CALLCONV bool Perl_sv_utf8_decode(pTHX_ SV *sv); PERL_CALLCONV void Perl_sv_force_normal(pTHX_ SV *sv); PERL_CALLCONV void Perl_tmps_grow(pTHX_ I32 n); PERL_CALLCONV SV* Perl_sv_rvweaken(pTHX_ SV *sv); diff --git a/sv.c b/sv.c index 616344b..e22dbc2 100644 --- a/sv.c +++ b/sv.c @@ -2214,7 +2214,8 @@ Perl_sv_2pv(pTHX_ register SV *sv, STRLEN *lp) char * Perl_sv_2pvbyte_nolen(pTHX_ register SV *sv) { - return sv_2pv_nolen(sv); + STRLEN n_a; + return sv_2pvbyte(sv, &n_a); } char * @@ -2226,12 +2227,14 @@ Perl_sv_2pvbyte(pTHX_ register SV *sv, STRLEN *lp) char * Perl_sv_2pvutf8_nolen(pTHX_ register SV *sv) { - return sv_2pv_nolen(sv); + STRLEN n_a; + return sv_2pvutf8(sv, &n_a); } char * Perl_sv_2pvutf8(pTHX_ register SV *sv, STRLEN *lp) { + sv_utf8_upgrade(sv); return sv_2pv(sv,lp); } @@ -2273,6 +2276,139 @@ Perl_sv_2bool(pTHX_ register SV *sv) } } +void +Perl_sv_utf8_upgrade(pTHX_ register SV *sv) +{ + int hicount; + char *c; + + if (!sv || !SvPOK(sv) || SvUTF8(sv)) + return; + + /* This function could be much more efficient if we had a FLAG + * to signal if there are any hibit chars in the string + */ + hicount = 0; + for (c = SvPVX(sv); c < SvEND(sv); c++) { + if (*c & 0x80) + hicount++; + } + + if (hicount) { + char *src, *dst; + SvGROW(sv, SvCUR(sv) + hicount + 1); + + src = SvEND(sv) - 1; + SvCUR_set(sv, SvCUR(sv) + hicount); + dst = SvEND(sv) - 1; + + while (src < dst) { + if (*src & 0x80) { + dst--; + uv_to_utf8((U8*)dst, (U8)*src--); + dst--; + } + else { + *dst-- = *src--; + } + } + + SvUTF8_on(sv); + } +} + +bool +Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok) +{ + if (SvPOK(sv) && SvUTF8(sv)) { + char *c = SvPVX(sv); + char *first_hi = 0; + /* need to figure out if this is possible at all first */ + while (c < SvEND(sv)) { + if (*c & 0x80) { + I32 len; + UV uv = utf8_to_uv(c, &len); + if (uv >= 256) { + if (fail_ok) + return FALSE; + else { + /* XXX might want to make a callback here instead */ + croak("Big byte"); + } + } + if (!first_hi) + first_hi = c; + c += len; + } + else { + c++; + } + } + + if (first_hi) { + char *src = first_hi; + char *dst = first_hi; + while (src < SvEND(sv)) { + if (*src & 0x80) { + I32 len; + U8 u = (U8)utf8_to_uv(src, &len); + *dst++ = u; + src += len; + } + else { + *dst++ = *src++; + } + } + SvCUR_set(sv, dst - SvPVX(sv)); + } + SvUTF8_off(sv); + } + return TRUE; +} + +void +Perl_sv_utf8_encode(pTHX_ register SV *sv) +{ + sv_utf8_upgrade(sv); + SvUTF8_off(sv); +} + +bool +Perl_sv_utf8_decode(pTHX_ register SV *sv) +{ + if (SvPOK(sv)) { + char *c; + bool has_utf = FALSE; + if (!sv_utf8_downgrade(sv, TRUE)) + return FALSE; + + /* it is actually just a matter of turning the utf8 flag on, but + * we want to make sure everything inside is valid utf8 first. + */ + c = SvPVX(sv); + while (c < SvEND(sv)) { + if (*c & 0x80) { + I32 len; + (void)utf8_to_uv((U8*)c, &len); + if (len == 1) { + /* bad utf8 */ + return FALSE; + } + c += len; + has_utf = TRUE; + } + else { + c++; + } + } + + if (has_utf) + SvUTF8_on(sv); + } + return TRUE; +} + + /* Note: sv_setsv() should not be called with a source string that needs * to be reused, since it may destroy the source string if it is marked * as temporary. @@ -2955,10 +3091,13 @@ Perl_sv_catsv(pTHX_ SV *dstr, register SV *sstr) STRLEN len; if (!sstr) return; - if (s = SvPV(sstr, len)) + if (s = SvPV(sstr, len)) { + if (SvUTF8(sstr)) + sv_utf8_upgrade(dstr); sv_catpvn(dstr,s,len); - if (SvUTF8(sstr)) - SvUTF8_on(dstr); + if (SvUTF8(sstr)) + SvUTF8_on(dstr); + } } /* @@ -3807,11 +3946,42 @@ C. I32 Perl_sv_cmp(pTHX_ register SV *str1, register SV *str2) { - STRLEN cur1 = 0; - char *pv1 = str1 ? SvPV(str1, cur1) : (char *) NULL; - STRLEN cur2 = 0; - char *pv2 = str2 ? SvPV(str2, cur2) : (char *) NULL; + STRLEN cur1, cur2; + char *pv1, *pv2; I32 retval; + bool utf1; + + if (str1) { + pv1 = SvPV(str1, cur1); + } + else { + cur1 = 0; + } + + if (str2) { + if (SvPOK(str2)) { + if (SvPOK(str1) && SvUTF8(str1) != SvUTF8(str2) && !IN_BYTE) { + /* must upgrade other to UTF8 first */ + if (SvUTF8(str1)) { + sv_utf8_upgrade(str2); + } + else { + sv_utf8_upgrade(str1); + /* refresh pointer and length */ + pv1 = SvPVX(str1); + cur1 = SvCUR(str1); + } + } + pv2 = SvPVX(str2); + cur2 = SvCUR(str2); + } + else { + pv2 = sv_2pv(str2, &cur2); + } + } + else { + cur2 = 0; + } if (!cur1) return cur2 ? -1 : 0; @@ -4957,18 +5127,21 @@ Perl_sv_pvbyten_force(pTHX_ SV *sv, STRLEN *lp) char * Perl_sv_pvutf8(pTHX_ SV *sv) { + sv_utf8_upgrade(sv); return sv_pv(sv); } char * Perl_sv_pvutf8n(pTHX_ SV *sv, STRLEN *lp) { + sv_utf8_upgrade(sv); return sv_pvn(sv,lp); } char * Perl_sv_pvutf8n_force(pTHX_ SV *sv, STRLEN *lp) { + sv_utf8_upgrade(sv); return sv_pvn_force(sv,lp); } diff --git a/toke.c b/toke.c index b6ffc2b..d978140 100644 --- a/toke.c +++ b/toke.c @@ -6904,7 +6904,7 @@ Perl_scan_num(pTHX_ char *start) pos++; if (*pos == '.' && isDIGIT(pos[1])) { UV rev; - U8 tmpbuf[10]; + U8 tmpbuf[UTF8_MAXLEN]; U8 *tmpend; NV nshift = 1.0; bool utf8 = FALSE; @@ -6930,7 +6930,6 @@ Perl_scan_num(pTHX_ char *start) tmpbuf[0] = (U8)rev; tmpend = &tmpbuf[1]; } - *tmpend = '\0'; sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); if (rev > 0) SvNVX(sv) += (NV)rev/nshift; @@ -6943,7 +6942,6 @@ Perl_scan_num(pTHX_ char *start) s = pos; tmpend = uv_to_utf8(tmpbuf, rev); utf8 = utf8 || rev > 127; - *tmpend = '\0'; sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); if (rev > 0) SvNVX(sv) += (NV)rev/nshift; @@ -6951,8 +6949,10 @@ Perl_scan_num(pTHX_ char *start) SvPOK_on(sv); SvNOK_on(sv); SvREADONLY_on(sv); - if (utf8) + if (utf8) { SvUTF8_on(sv); + sv_utf8_downgrade(sv, TRUE); + } } } break;