From: Jarkko Hietaniemi Date: Fri, 29 Dec 2000 07:08:32 +0000 (+0000) Subject: (Retracted by #8264) Externally: join() was still quite UTF-8-unaware. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c512ce4f7f4a9bd0f491f91cb5a15fcb65ee37d9;p=p5sagit%2Fp5-mst-13.2.git (Retracted by #8264) Externally: join() was still quite UTF-8-unaware. Internally: sv_catsv() wasn't quite okay on UTF-8, it assumed that the only cases to care about are byte+byte and byte+character. TODO: See how well pp_concat() could be implemented in terms of sv_catsv(). p4raw-id: //depot/perl@8248 --- diff --git a/doop.c b/doop.c index ea65a68..3548556 100644 --- a/doop.c +++ b/doop.c @@ -504,8 +504,6 @@ Perl_do_join(pTHX_ register SV *sv, SV *del, register SV **mark, register SV **s } if (items-- > 0) { - char *s; - sv_setpv(sv, ""); if (*mark) sv_catsv(sv, *mark); @@ -513,10 +511,9 @@ Perl_do_join(pTHX_ register SV *sv, SV *del, register SV **mark, register SV **s } else sv_setpv(sv,""); - len = delimlen; - if (len) { + if (delimlen) { for (; items > 0; items--,mark++) { - sv_catpvn(sv,delim,len); + sv_catsv(sv,del); sv_catsv(sv,*mark); } } diff --git a/sv.c b/sv.c index 4794596..97ee2ad 100644 --- a/sv.c +++ b/sv.c @@ -2934,7 +2934,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) char *s, *t, *e; int hibit = 0; - if (!sv || !SvPOK(sv) || SvUTF8(sv)) + if (!sv || !SvPOK(sv) || !SvCUR(sv) || SvUTF8(sv)) return; /* This function could be much more efficient if we had a FLAG in SVs @@ -3755,20 +3755,54 @@ C. Handles 'get' magic, but not 'set' magic. See C. */ void -Perl_sv_catsv(pTHX_ SV *dstr, register SV *sstr) +Perl_sv_catsv(pTHX_ SV *dsv, register SV *ssv) { - char *s; - STRLEN len; - if (!sstr) + if (!ssv) return; - if ((s = SvPV(sstr, len))) { - if (DO_UTF8(sstr)) { - sv_utf8_upgrade(dstr); - sv_catpvn(dstr,s,len); - SvUTF8_on(dstr); + else { + STRLEN slen; + char *spv; + + if ((spv = SvPV(ssv, slen))) { + bool dutf8 = DO_UTF8(dsv); + bool sutf8 = DO_UTF8(ssv); + + if (dutf8 != sutf8) { + char *s = spv; + char *send = s + slen; + STRLEN dlen; + char *dpv; + char *d; + + /* We may modify dsv but not ssv. */ + + if (!dutf8) + sv_utf8_upgrade(dsv); + dpv = SvPV(dsv, dlen); + /* Overguestimate on the slen. */ + SvGROW(dsv, dlen + (sutf8 ? 2 * slen : slen) + 1); + d = dpv + dlen; + if (dutf8) /* && !sutf8 */ { + while (s < send) { + if (UTF8_IS_ASCII(*s)) + *d++ = *s++; + else { + *d++ = UTF8_EIGHT_BIT_HI(*s); + *d++ = UTF8_EIGHT_BIT_LO(*s); + s += 2; + } + } + SvCUR(dsv) += s - spv; + *SvEND(dsv) = 0; + } + else /* !dutf8 (was) && sutf8 */ { + sv_catpvn(dsv, spv, slen); + SvUTF8_on(dsv); + } + } + else + sv_catpvn(dsv, spv, slen); } - else - sv_catpvn(dstr,s,len); } } @@ -3781,10 +3815,10 @@ Like C, but also handles 'set' magic. */ void -Perl_sv_catsv_mg(pTHX_ SV *dstr, register SV *sstr) +Perl_sv_catsv_mg(pTHX_ SV *dsv, register SV *ssv) { - sv_catsv(dstr,sstr); - SvSETMAGIC(dstr); + sv_catsv(dsv,ssv); + SvSETMAGIC(dsv); } /* @@ -3797,20 +3831,20 @@ Handles 'get' magic, but not 'set' magic. See C. */ void -Perl_sv_catpv(pTHX_ register SV *sv, register const char *ptr) +Perl_sv_catpv(pTHX_ register SV *sv, register const char *pv) { register STRLEN len; STRLEN tlen; char *junk; - if (!ptr) + if (!pv) return; junk = SvPV_force(sv, tlen); - len = strlen(ptr); + len = strlen(pv); SvGROW(sv, tlen + len + 1); - if (ptr == junk) - ptr = SvPVX(sv); - Move(ptr,SvPVX(sv)+tlen,len+1,char); + if (pv == junk) + pv = SvPVX(sv); + Move(pv,SvPVX(sv)+tlen,len+1,char); SvCUR(sv) += len; (void)SvPOK_only_UTF8(sv); /* validate pointer */ SvTAINT(sv); @@ -3825,9 +3859,9 @@ Like C, but also handles 'set' magic. */ void -Perl_sv_catpv_mg(pTHX_ register SV *sv, register const char *ptr) +Perl_sv_catpv_mg(pTHX_ register SV *sv, register const char *pv) { - sv_catpv(sv,ptr); + sv_catpv(sv,pv); SvSETMAGIC(sv); } diff --git a/t/op/join.t b/t/op/join.t index b50878e..eea9add 100755 --- a/t/op/join.t +++ b/t/op/join.t @@ -1,6 +1,6 @@ #!./perl -print "1..10\n"; +print "1..14\n"; @x = (1, 2, 3); if (join(':',@x) eq '1:2:3') {print "ok 1\n";} else {print "not ok 1\n";} @@ -44,3 +44,23 @@ if ($f eq 'baeak') {print "ok 6\n";} else {print "# '$f'\nnot ok 6\n";} print "# expected 'a17b21c' got '$r'\nnot " if $r ne 'a17b21c'; print "ok 10\n"; }; + +{ my $s = join("", chr(1234),chr(255)); + print "not " unless length($s) == 2; + print "ok 11\n"; +} + +{ my $s = join(chr(2345), chr(1234),chr(255)); + print "not " unless length($s) == 3; + print "ok 12\n"; +} + +{ my $s = join(chr(2345), chr(1234),chr(3456)); + print "not " unless length($s) == 3; + print "ok 13\n"; +} + +{ my $s = join(chr(255), chr(1234),chr(2345)); + print "not " unless length($s) == 3; + print "ok 14\n"; +} diff --git a/utf8.h b/utf8.h index 26ef723..e9598b8 100644 --- a/utf8.h +++ b/utf8.h @@ -62,15 +62,18 @@ END_EXTERN_C #define UTF8_QUAD_MAX UINT64_C(0x1000000000) -#define UTF8_IS_ASCII(c) ((c) < 0x80) -#define UTF8_IS_START(c) ((c) >= 0xc0 && ((c) <= 0xfd)) -#define UTF8_IS_CONTINUATION(c) ((c) >= 0x80 && ((c) <= 0xbf)) -#define UTF8_IS_CONTINUED(c) ((c) & 0x80) +#define UTF8_IS_ASCII(c) (((U8)c) < 0x80) +#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd)) +#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf)) +#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80) -#define UTF8_CONTINUATION_MASK 0x3f +#define UTF8_CONTINUATION_MASK ((U8)0x3f) #define UTF8_ACCUMULATION_SHIFT 6 #define UTF8_ACCUMULATE(old, new) ((old) << UTF8_ACCUMULATION_SHIFT | ((new) & UTF8_CONTINUATION_MASK)) +#define UTF8_EIGHT_BIT_HI(c) ( (((U8)c)>>6) |0xc0) +#define UTF8_EIGHT_BIT_LO(c) (((((U8)c)>>6)&0x3f)|0x80) + #ifdef HAS_QUAD #define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ (uv) < 0x800 ? 2 : \