From: Jarkko Hietaniemi Date: Sun, 28 Jan 2001 19:28:40 +0000 (+0000) Subject: Patch from Inaba Hiroto: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f9a6324217cffea75ff769ddd313748c0613a128;p=p5sagit%2Fp5-mst-13.2.git Patch from Inaba Hiroto: - canonical UTF-8 hash keys: if a key string for a hash is UTF8-on, try downgrade the string and use it if unicode::distinct is not in effect. For the task, I added a function bytes_from_utf8() to utf8.c. It might resemble utf8_to_bytes() but it is not convenient to the task. Made a test for it and added to t/op/each.t - Changed do_print in doio.c to apply sv_utf8_(downgrade|upgrade) to the mortal copy of the argument SV. And changed t/io/utf8.t test 18 which expects print() to upgrade its argument. - re-implement sv_eq with bytes_from_utf8() - some bug fixes - tr/// does not handle UTF8 range (\x{}-\x{}) - \ before raw UTF8 character produced "Malformed UTF-8 character" warning. - "\x{100}\N{CENT SIGN}" is Malformed. Added tests for these 3. - and one silly bug (by me) with qu operator. p4raw-id: //depot/perl@8583 --- diff --git a/doio.c b/doio.c index 6056ea7..a1d0e46 100644 --- a/doio.c +++ b/doio.c @@ -1169,13 +1169,12 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp) /* FALL THROUGH */ default: if (PerlIO_isutf8(fp)) { - tmps = SvPVutf8(sv, len); - } - else { - if (DO_UTF8(sv)) - sv_utf8_downgrade(sv, FALSE); - tmps = SvPV(sv, len); + if (!SvUTF8(sv)) + sv_utf8_upgrade(sv = sv_mortalcopy(sv)); } + else if (DO_UTF8(sv)) + sv_utf8_downgrade((sv = sv_mortalcopy(sv)), FALSE); + tmps = SvPV(sv, len); break; } /* To detect whether the process is about to overstep its diff --git a/embed.h b/embed.h index f2a05c7..fe0b6b3 100644 --- a/embed.h +++ b/embed.h @@ -737,6 +737,7 @@ #define utf8_distance Perl_utf8_distance #define utf8_hop Perl_utf8_hop #define utf8_to_bytes Perl_utf8_to_bytes +#define bytes_from_utf8 Perl_bytes_from_utf8 #define bytes_to_utf8 Perl_bytes_to_utf8 #define utf8_to_uv_simple Perl_utf8_to_uv_simple #define utf8_to_uv Perl_utf8_to_uv @@ -2214,6 +2215,7 @@ #define utf8_distance(a,b) Perl_utf8_distance(aTHX_ a,b) #define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b) #define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) +#define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) #define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) #define utf8_to_uv_simple(a,b) Perl_utf8_to_uv_simple(aTHX_ a,b) #define utf8_to_uv(a,b,c,d) Perl_utf8_to_uv(aTHX_ a,b,c,d) @@ -4342,6 +4344,8 @@ #define utf8_hop Perl_utf8_hop #define Perl_utf8_to_bytes CPerlObj::Perl_utf8_to_bytes #define utf8_to_bytes Perl_utf8_to_bytes +#define Perl_bytes_from_utf8 CPerlObj::Perl_bytes_from_utf8 +#define bytes_from_utf8 Perl_bytes_from_utf8 #define Perl_bytes_to_utf8 CPerlObj::Perl_bytes_to_utf8 #define bytes_to_utf8 Perl_bytes_to_utf8 #define Perl_utf8_to_uv_simple CPerlObj::Perl_utf8_to_uv_simple diff --git a/embed.pl b/embed.pl index e71337b..9732773 100755 --- a/embed.pl +++ b/embed.pl @@ -2085,6 +2085,7 @@ Adp |STRLEN |utf8_length |U8* s|U8 *e Apd |IV |utf8_distance |U8 *a|U8 *b Apd |U8* |utf8_hop |U8 *s|I32 off ApMd |U8* |utf8_to_bytes |U8 *s|STRLEN *len +ApMd |U8* |bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8 ApMd |U8* |bytes_to_utf8 |U8 *s|STRLEN *len Apd |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen Adp |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags diff --git a/global.sym b/global.sym index 48128c9..dab2a7c 100644 --- a/global.sym +++ b/global.sym @@ -470,6 +470,7 @@ Perl_utf8_length Perl_utf8_distance Perl_utf8_hop Perl_utf8_to_bytes +Perl_bytes_from_utf8 Perl_bytes_to_utf8 Perl_utf8_to_uv_simple Perl_utf8_to_uv diff --git a/hv.c b/hv.c index 0e50523..c999488 100644 --- a/hv.c +++ b/hv.c @@ -152,6 +152,7 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) register HE *entry; SV *sv; bool is_utf8 = FALSE; + const char *keysave = key; if (!hv) return 0; @@ -196,6 +197,9 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) return 0; } + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8); + PERL_HASH(hash, key, klen); entry = ((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max]; @@ -208,6 +212,8 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) continue; if (HeKUTF8(entry) != (char)is_utf8) continue; + if (key != keysave) + Safefree(key); return &HeVAL(entry); } #ifdef DYNAMIC_ENV_FETCH /* %ENV lookup? If so, try to fetch the value now */ @@ -217,14 +223,24 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) if (env) { sv = newSVpvn(env,len); SvTAINTED_on(sv); + if (key != keysave) + Safefree(key); return hv_store(hv,key,klen,sv,hash); } } #endif if (lval) { /* gonna assign to this, so it better be there */ sv = NEWSV(61,0); - return hv_store(hv,key,is_utf8?-klen:klen,sv,hash); + if (key != keysave) { /* must be is_utf8 == 0 */ + SV **ret = hv_store(hv,key,klen,sv,hash); + Safefree(key); + return ret; + } + else + return hv_store(hv,key,is_utf8?-klen:klen,sv,hash); } + if (key != keysave) + Safefree(key); return 0; } @@ -256,6 +272,7 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash) register HE *entry; SV *sv; bool is_utf8; + char *keysave; if (!hv) return 0; @@ -304,9 +321,12 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash) return 0; } - key = SvPV(keysv, klen); + keysave = key = SvPV(keysv, klen); is_utf8 = (SvUTF8(keysv)!=0); + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); + if (!hash) PERL_HASH(hash, key, klen); @@ -320,6 +340,8 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash) continue; if (HeKUTF8(entry) != (char)is_utf8) continue; + if (key != keysave) + Safefree(key); return entry; } #ifdef DYNAMIC_ENV_FETCH /* %ENV lookup? If so, try to fetch the value now */ @@ -333,6 +355,8 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash) } } #endif + if (key != keysave) + Safefree(key); if (lval) { /* gonna assign to this, so it better be there */ sv = NEWSV(61,0); return hv_store_ent(hv,keysv,sv,hash); @@ -385,6 +409,7 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has register HE *entry; register HE **oentry; bool is_utf8 = FALSE; + const char *keysave = key; if (!hv) return 0; @@ -412,6 +437,9 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has #endif } } + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8); + if (!hash) PERL_HASH(hash, key, klen); @@ -433,6 +461,8 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has continue; SvREFCNT_dec(HeVAL(entry)); HeVAL(entry) = val; + if (key != keysave) + Safefree(key); return &HeVAL(entry); } @@ -441,6 +471,8 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has HeKEY_hek(entry) = share_hek(key, is_utf8?-klen:klen, hash); else /* gotta do the real thing */ HeKEY_hek(entry) = save_hek(key, is_utf8?-klen:klen, hash); + if (key != keysave) + Safefree(key); HeVAL(entry) = val; HeNEXT(entry) = *oentry; *oentry = entry; @@ -484,6 +516,7 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash) register HE *entry; register HE **oentry; bool is_utf8; + char *keysave; if (!hv) return 0; @@ -513,9 +546,12 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash) } } - key = SvPV(keysv, klen); + keysave = key = SvPV(keysv, klen); is_utf8 = (SvUTF8(keysv) != 0); + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); + if (!hash) PERL_HASH(hash, key, klen); @@ -537,6 +573,8 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash) continue; SvREFCNT_dec(HeVAL(entry)); HeVAL(entry) = val; + if (key != keysave) + Safefree(key); return entry; } @@ -545,6 +583,8 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash) HeKEY_hek(entry) = share_hek(key, is_utf8?-klen:klen, hash); else /* gotta do the real thing */ HeKEY_hek(entry) = save_hek(key, is_utf8?-klen:klen, hash); + if (key != keysave) + Safefree(key); HeVAL(entry) = val; HeNEXT(entry) = *oentry; *oentry = entry; @@ -581,6 +621,7 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) SV **svp; SV *sv; bool is_utf8 = FALSE; + const char *keysave = key; if (!hv) return Nullsv; @@ -615,6 +656,9 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) if (!xhv->xhv_array) return Nullsv; + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8); + PERL_HASH(hash, key, klen); oentry = &((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max]; @@ -629,6 +673,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) continue; if (HeKUTF8(entry) != (char)is_utf8) continue; + if (key != keysave) + Safefree(key); *oentry = HeNEXT(entry); if (i && !*oentry) xhv->xhv_fill--; @@ -645,6 +691,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) --xhv->xhv_keys; return sv; } + if (key != keysave) + Safefree(key); return Nullsv; } @@ -670,6 +718,7 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) register HE **oentry; SV *sv; bool is_utf8; + char *keysave; if (!hv) return Nullsv; @@ -702,9 +751,12 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) if (!xhv->xhv_array) return Nullsv; - key = SvPV(keysv, klen); + keysave = key = SvPV(keysv, klen); is_utf8 = (SvUTF8(keysv) != 0); + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); + if (!hash) PERL_HASH(hash, key, klen); @@ -720,6 +772,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) continue; if (HeKUTF8(entry) != (char)is_utf8) continue; + if (key != keysave) + Safefree(key); *oentry = HeNEXT(entry); if (i && !*oentry) xhv->xhv_fill--; @@ -736,6 +790,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) --xhv->xhv_keys; return sv; } + if (key != keysave) + Safefree(key); return Nullsv; } @@ -756,6 +812,7 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen) register HE *entry; SV *sv; bool is_utf8 = FALSE; + const char *keysave = key; if (!hv) return 0; @@ -786,6 +843,9 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen) return 0; #endif + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8); + PERL_HASH(hash, key, klen); #ifdef DYNAMIC_ENV_FETCH @@ -802,6 +862,8 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen) continue; if (HeKUTF8(entry) != (char)is_utf8) continue; + if (key != keysave) + Safefree(key); return TRUE; } #ifdef DYNAMIC_ENV_FETCH /* is it out there? */ @@ -816,6 +878,8 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen) } } #endif + if (key != keysave) + Safefree(key); return FALSE; } @@ -839,6 +903,7 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash) register HE *entry; SV *sv; bool is_utf8; + char *keysave; if (!hv) return 0; @@ -867,8 +932,10 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash) return 0; #endif - key = SvPV(keysv, klen); + keysave = key = SvPV(keysv, klen); is_utf8 = (SvUTF8(keysv) != 0); + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); if (!hash) PERL_HASH(hash, key, klen); @@ -886,6 +953,8 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash) continue; if (HeKUTF8(entry) != (char)is_utf8) continue; + if (key != keysave) + Safefree(key); return TRUE; } #ifdef DYNAMIC_ENV_FETCH /* is it out there? */ @@ -900,6 +969,8 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash) } } #endif + if (key != keysave) + Safefree(key); return FALSE; } @@ -1471,10 +1542,13 @@ Perl_unsharepvn(pTHX_ const char *str, I32 len, U32 hash) register I32 i = 1; I32 found = 0; bool is_utf8 = FALSE; + const char *save = str; if (len < 0) { len = -len; is_utf8 = TRUE; + if (!(PL_hints & HINT_UTF8_DISTINCT)) + str = (char*)bytes_from_utf8((U8*)str, (STRLEN*)&len, &is_utf8); } /* what follows is the moral equivalent of: @@ -1507,7 +1581,8 @@ Perl_unsharepvn(pTHX_ const char *str, I32 len, U32 hash) break; } UNLOCK_STRTAB_MUTEX; - + if (str != save) + Safefree(str); if (!found && ckWARN_d(WARN_INTERNAL)) Perl_warner(aTHX_ WARN_INTERNAL, "Attempt to free non-existent shared string '%s'",str); } @@ -1525,10 +1600,13 @@ Perl_share_hek(pTHX_ const char *str, I32 len, register U32 hash) register I32 i = 1; I32 found = 0; bool is_utf8 = FALSE; + const char *save = str; if (len < 0) { len = -len; is_utf8 = TRUE; + if (!(PL_hints & HINT_UTF8_DISTINCT)) + str = (char*)bytes_from_utf8((U8*)str, (STRLEN*)&len, &is_utf8); } /* what follows is the moral equivalent of: @@ -1568,8 +1646,7 @@ Perl_share_hek(pTHX_ const char *str, I32 len, register U32 hash) ++HeVAL(entry); /* use value slot as REFCNT */ UNLOCK_STRTAB_MUTEX; + if (str != save) + Safefree(str); return HeKEY_hek(entry); } - - - diff --git a/objXSUB.h b/objXSUB.h index 8cdfec0..51e4eb2 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -1889,6 +1889,10 @@ #define Perl_utf8_to_bytes pPerl->Perl_utf8_to_bytes #undef utf8_to_bytes #define utf8_to_bytes Perl_utf8_to_bytes +#undef Perl_bytes_from_utf8 +#define Perl_bytes_from_utf8 pPerl->Perl_bytes_from_utf8 +#undef bytes_from_utf8 +#define bytes_from_utf8 Perl_bytes_from_utf8 #undef Perl_bytes_to_utf8 #define Perl_bytes_to_utf8 pPerl->Perl_bytes_to_utf8 #undef bytes_to_utf8 diff --git a/perlapi.c b/perlapi.c index f0016d5..04d48b9 100644 --- a/perlapi.c +++ b/perlapi.c @@ -3413,6 +3413,13 @@ Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN *len) return ((CPerlObj*)pPerl)->Perl_utf8_to_bytes(s, len); } +#undef Perl_bytes_from_utf8 +U8* +Perl_bytes_from_utf8(pTHXo_ U8 *s, STRLEN *len, bool *is_utf8) +{ + return ((CPerlObj*)pPerl)->Perl_bytes_from_utf8(s, len, is_utf8); +} + #undef Perl_bytes_to_utf8 U8* Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len) diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 25fe18a..1828b55 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -182,6 +182,23 @@ must then use C to assign values to these new elements. =for hackers Found in file av.c +=item bytes_from_utf8 + +Converts a string C of length C from UTF8 into byte encoding. +Unlike but like C, returns a pointer to +the newly-created string, and updates C to contain the new length. +Returns the original string if no conversion occurs, C and +C are unchanged. Do nothing if C points to 0. Sets +C to 0 if C is converted or malformed . + +NOTE: this function is experimental and may change or be +removed without notice. + + U8* bytes_from_utf8(U8 *s, STRLEN *len, bool *is_utf8) + +=for hackers +Found in file utf8.c + =item bytes_to_utf8 Converts a string C of length C from ASCII into UTF8 encoding. @@ -2420,19 +2437,19 @@ false, defined or undefined. Does not handle 'get' magic. =for hackers Found in file sv.h -=item svtype +=item SvTYPE -An enum of flags for Perl types. These are found in the file B -in the C enum. Test these flags with the C macro. +Returns the type of the SV. See C. + + svtype SvTYPE(SV* sv) =for hackers Found in file sv.h -=item SvTYPE - -Returns the type of the SV. See C. +=item svtype - svtype SvTYPE(SV* sv) +An enum of flags for Perl types. These are found in the file B +in the C enum. Test these flags with the C macro. =for hackers Found in file sv.h diff --git a/proto.h b/proto.h index a1f0fee..e39d33e 100644 --- a/proto.h +++ b/proto.h @@ -818,6 +818,7 @@ PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ U8* s, U8 *e); PERL_CALLCONV IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b); PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len); +PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8); PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len); PERL_CALLCONV UV Perl_utf8_to_uv_simple(pTHX_ U8 *s, STRLEN* retlen); PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags); diff --git a/sv.c b/sv.c index ed7ebdc..c53486a 100644 --- a/sv.c +++ b/sv.c @@ -4690,30 +4690,24 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2) /* do not utf8ize the comparands as a side-effect */ if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) { + bool is_utf8 = TRUE; + if (PL_hints & HINT_UTF8_DISTINCT) return FALSE; if (SvUTF8(sv1)) { - (void)utf8_to_bytes((U8*)(pv1 = savepvn(pv1, cur1)), &cur1); - { - IV scur1 = cur1; - if (scur1 < 0) { - Safefree(pv1); - return 0; - } - } - pv1tmp = TRUE; + char *pv = bytes_from_utf8((U8*)pv1, &cur1, &is_utf8); + if (is_utf8) + return 0; + pv1tmp = (pv != pv1); + pv1 = pv; } else { - (void)utf8_to_bytes((U8*)(pv2 = savepvn(pv2, cur2)), &cur2); - { - IV scur2 = cur2; - if (scur2 < 0) { - Safefree(pv2); - return 0; - } - } - pv2tmp = TRUE; + char *pv = bytes_from_utf8((U8*)pv2, &cur2, &is_utf8); + if (is_utf8) + return 0; + pv2tmp = (pv != pv2); + pv2 = pv; } } @@ -5601,6 +5595,8 @@ Perl_newSVpvn_share(pTHX_ const char *src, I32 len, U32 hash) len = -len; is_utf8 = TRUE; } + if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT)) + src = (char*)bytes_from_utf8((U8*)src, (STRLEN*)&len, &is_utf8); if (!hash) PERL_HASH(hash, src, len); new_SV(sv); diff --git a/t/io/utf8.t b/t/io/utf8.t index 04554e7..d0201aa 100755 --- a/t/io/utf8.t +++ b/t/io/utf8.t @@ -91,17 +91,17 @@ print "not " unless $y == 1; print "ok 17\n"; } -print F $b,"\n"; # This upgrades $b! +print F $b,"\n"; # Don't upgrades $b { # Check byte length of $b use bytes; my $y = length($b); -print "not " unless $y == 2; +print "not ($y) " unless $y == 1; print "ok 18\n"; } { my $x = tell(F); { use bytes; $y += 3;} - print "not " unless $x == $y; + print "not ($x,$y) " unless $x == $y; print "ok 19\n"; } @@ -110,14 +110,14 @@ close F; open F, "a" or die $!; # Not UTF $x = ; chomp($x); -print "not " unless $x eq v196.172.194.130; +printf "not (%vd) ", $x unless $x eq v196.172.194.130; print "ok 20\n"; open F, "<:utf8", "a" or die $!; $x = ; chomp($x); close F; -print "not " unless $x eq chr(300).chr(130); +printf "not (%vd) ", $x unless $x eq chr(300).chr(130); print "ok 21\n"; # Now let's make it suffer. diff --git a/t/lib/charnames.t b/t/lib/charnames.t index 14da2e0..d00396f 100644 --- a/t/lib/charnames.t +++ b/t/lib/charnames.t @@ -8,7 +8,7 @@ BEGIN { } $| = 1; -print "1..13\n"; +print "1..14\n"; use charnames ':full'; @@ -44,8 +44,7 @@ $encoded_alpha = "\316\261"; $encoded_bet = "\327\221"; sub to_bytes { - use bytes; - "".shift; + pack"a*", shift; } { @@ -90,3 +89,10 @@ sub to_bytes { print "ok 13\n"; } +{ + use charnames qw(:full); + use utf8; + print "not " unless "\x{100}\N{CENT SIGN}" eq "\x{100}"."\N{CENT SIGN}"; + print "ok 14\n"; +} + diff --git a/t/op/each.t b/t/op/each.t index 35792ab..f1012c6 100755 --- a/t/op/each.t +++ b/t/op/each.t @@ -1,6 +1,12 @@ #!./perl -print "1..24\n"; +BEGIN { + chdir 't' if -d 't'; + @INC = '.'; + push @INC, '../lib'; +} + +print "1..25\n"; $h{'abc'} = 'ABC'; $h{'def'} = 'DEF'; @@ -156,3 +162,10 @@ print "#$b{$_}\n" for keys %b; # Used to core dump before change #8056. print "ok 23\n"; print "#$u{$_}\n" for keys %u; # Used to core dump before change #8056. print "ok 24\n"; + +%u = (qu"\xe3\x81\x82" => "downglade"); +for (keys %u) { + use bytes; + print "not " if length ne 3 or $_ ne "\xe3\x81\x82"; + print "ok 25\n"; +} diff --git a/t/op/tr.t b/t/op/tr.t index 6a4e1aa..75887ab 100755 --- a/t/op/tr.t +++ b/t/op/tr.t @@ -5,7 +5,7 @@ BEGIN { @INC = '../lib'; } -print "1..49\n"; +print "1..51\n"; $_ = "abcdefghijklmnopqrstuvwxyz"; @@ -287,3 +287,12 @@ print "ok 48\n"; print "not " unless sprintf("%vd", $a) eq '196.172.200'; print "ok 49\n"; +# UTF8 range + +($a = v300.196.172.302.197.172) =~ tr/\x{12c}-\x{130}/\xc0-\xc4/; +print "not " unless $a eq v192.196.172.194.197.172; +print "ok 50\n"; + +($a = v300.196.172.302.197.172) =~ tr/\xc4-\xc8/\x{12c}-\x{130}/; +print "not " unless $a eq v300.300.172.302.301.172; +print "ok 51\n"; diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index 546b217..577e6b4 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -10,7 +10,7 @@ BEGIN { } } -print "1..106\n"; +print "1..107\n"; my $test = 1; @@ -564,3 +564,16 @@ sub nok_bytes { print "ok $test\n"; $test++; # 106 } + +{ + use utf8; + + my $w = 0; + local $SIG{__WARN__} = sub { print "#($_[0])\n"; $w++ }; + my $x = eval q/"\\/ . "\x{100}" . q/"/;; + + print "not " unless $w == 0 && $x eq "\x{100}"; + print "ok $test\n"; + $test++; # 107 +} + diff --git a/toke.c b/toke.c index 1b41dcc..a85a725 100644 --- a/toke.c +++ b/toke.c @@ -1388,8 +1388,7 @@ S_scan_const(pTHX_ char *start) "Unrecognized escape \\%c passed through", *s); /* default action is to copy the quoted character */ - *d++ = *s++; - continue; + goto default_action; } /* \132 indicates an octal constant */ @@ -1479,6 +1478,13 @@ S_scan_const(pTHX_ char *start) if (has_utf8 || uv > 255) { d = (char*)uv_to_utf8((U8*)d, uv); has_utf8 = TRUE; + if (PL_lex_inwhat == OP_TRANS && + PL_sublex_info.sub_op) { + PL_sublex_info.sub_op->op_private |= + (PL_lex_repl ? OPpTRANS_FROM_UTF + : OPpTRANS_TO_UTF); + utf = TRUE; + } } else { *d++ = (char)uv; @@ -1506,6 +1512,8 @@ S_scan_const(pTHX_ char *start) res = newSVpvn(s + 1, e - s - 1); res = new_constant( Nullch, 0, "charnames", res, Nullsv, "\\N{...}" ); + if (has_utf8) + sv_utf8_upgrade(res); str = SvPV(res,len); if (!has_utf8 && SvUTF8(res)) { char *ostart = SvPVX(sv); @@ -1588,8 +1596,7 @@ S_scan_const(pTHX_ char *start) continue; } /* end if (backslash) */ - /* (now in tr/// code again) */ - + default_action: if (UTF8_IS_CONTINUED(*s) && (this_utf8 || has_utf8)) { STRLEN len = (STRLEN) -1; UV uv; @@ -1608,10 +1615,15 @@ S_scan_const(pTHX_ char *start) *d++ = *s++; } has_utf8 = TRUE; + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + PL_sublex_info.sub_op->op_private |= + (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + utf = TRUE; + } continue; } - *d++ = *s++; + *d++ = *s++; } /* while loop to process each character */ /* terminate the string and set up the sv */ @@ -4742,7 +4754,8 @@ Perl_yylex(pTHX) case KEY_qq: case KEY_qu: s = scan_str(s,FALSE,FALSE); - if (tmp == KEY_qu && is_utf8_string((U8*)s, SvCUR(PL_lex_stuff))) + if (tmp == KEY_qu && + is_utf8_string((U8*)SvPVX(PL_lex_stuff), SvCUR(PL_lex_stuff))) SvUTF8_on(PL_lex_stuff); if (!s) missingterm((char*)0); diff --git a/utf8.c b/utf8.c index 156e63f..046df74 100644 --- a/utf8.c +++ b/utf8.c @@ -583,6 +583,63 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) } /* +=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8 + +Converts a string C of length C from UTF8 into byte encoding. +Unlike but like C, returns a pointer to +the newly-created string, and updates C to contain the new length. +Returns the original string if no conversion occurs, C and +C are unchanged. Do nothing if C points to 0. Sets +C to 0 if C is converted or malformed . + +=cut */ + +U8 * +Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8) +{ + U8 *send; + U8 *d; + U8 *start = s; + I32 count = 0; + + if (!*is_utf8) + return start; + + /* ensure valid UTF8 and chars < 256 before updating string */ + for (send = s + *len; s < send;) { + U8 c = *s++; + if (!UTF8_IS_ASCII(c)) { + if (UTF8_IS_CONTINUATION(c) || s >= send || + !UTF8_IS_CONTINUATION(*s)) { + *is_utf8 = 0; + return start; + } + if ((c & 0xfc) != 0xc0) + return start; + s++, count++; + } + } + + *is_utf8 = 0; + + if (!count) + return start; + + Newz(801, d, (*len) - count + 1, U8); + d = s = start; + while (s < send) { + U8 c = *s++; + if (UTF8_IS_ASCII(c)) + *d++ = c; + else + *d++ = UTF8_ACCUMULATE(c&3, *s++); + } + *d = '\0'; + *len = d - start; + return start; +} + +/* =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len Converts a string C of length C from ASCII into UTF8 encoding.