From: Nicholas Clark Date: Sat, 6 Oct 2007 10:23:26 +0000 (+0000) Subject: Revert one hunk of change 32034 that had the possibility of being buggy X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=7fddd94457983d86b562b409f0a846c0a764f8d7;p=p5sagit%2Fp5-mst-13.2.git Revert one hunk of change 32034 that had the possibility of being buggy (the sprintf "%c" code will work correctly when the SV is UTF-8). Audit all the rest for UTF-8 correctness, and force SvUTF8_off() in utf8.c to ensure correctness. (The string is reset to "", so this will not be a behaviour change.) p4raw-id: //depot/perl@32040 --- diff --git a/dump.c b/dump.c index bdaf41a..0d7fd34 100644 --- a/dump.c +++ b/dump.c @@ -219,8 +219,10 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str, const char * const end = pv + count; /* end of string */ octbuf[0] = esc; - if (!flags & PERL_PV_ESCAPE_NOCLEAR) + if (!flags & PERL_PV_ESCAPE_NOCLEAR) { + /* This won't alter the UTF-8 flag */ sv_setpvn(dsv, "", 0); + } if ((flags & PERL_PV_ESCAPE_UNI_DETECT) && is_utf8_string((U8*)pv, count)) isuni = 1; @@ -279,8 +281,13 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str, sv_catpvn(dsv, octbuf, chsize); wrote += chsize; } else { - const char string = (char) c; - sv_catpvn(dsv, &string, 1); + /* If PERL_PV_ESCAPE_NOBACKSLASH is set then bytes in the range + 128-255 can be appended raw to the dsv. If dsv happens to be + UTF-8 then we need catpvf to upgrade them for us. + Or add a new API call sv_catpvc(). Think about that name, and + how to keep it clear that it's unlike the s of catpvs, which is + really an array octets, not a string. 
*/ + Perl_sv_catpvf( aTHX_ dsv, "%c", c); wrote++; } if ( flags & PERL_PV_ESCAPE_FIRSTCHAR ) diff --git a/regcomp.c b/regcomp.c index b302561..f649188 100644 --- a/regcomp.c +++ b/regcomp.c @@ -9674,7 +9674,17 @@ clear_re(pTHX_ void *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 255 || !isPRINT(c)) + /* Our definition of isPRINT() ignores locales, so only bytes that are + not part of UTF-8 are considered printable. I assume that the same + holds for UTF-EBCDIC. + Also, code point 255 is not printable in either (it's E0 in EBCDIC, + which Wikipedia says: + + EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all + ones (binary 1111 1111, hexadecimal FF). It is similar, but not + identical, to the ASCII delete (DEL) or rubout control character. + ) So the old condition can be simplified to !isPRINT(c) */ + if (!isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); else { const unsigned char string = (unsigned char) c; diff --git a/utf8.c b/utf8.c index c665a41..45b17b1 100644 --- a/utf8.c +++ b/utf8.c @@ -2136,6 +2136,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f const char *s, *e; sv_setpvn(dsv, "", 0); + SvUTF8_off(dsv); for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { UV u; /* This serves double duty as a flag and a character to print after