From: Nicholas Clark Date: Sat, 6 Apr 2002 14:54:51 +0000 (+0100) Subject: Re: Dumper.pm and Unicode code points (was Re: Data::Dumper and integer conversions) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f052740f1f57186936c99755d580d91ae19122de;p=p5sagit%2Fp5-mst-13.2.git Re: Dumper.pm and Unicode code points (was Re: Data::Dumper and integer conversions) Message-ID: <20020406135450.GA644@Bagpuss.unfortu.net> p4raw-id: //depot/perl@15760 --- diff --git a/ext/Data/Dumper/Dumper.xs b/ext/Data/Dumper/Dumper.xs index 30c6558..bc2b454 100644 --- a/ext/Data/Dumper/Dumper.xs +++ b/ext/Data/Dumper/Dumper.xs @@ -100,45 +100,77 @@ esc_q(register char *d, register char *s, register STRLEN slen) static I32 esc_q_utf8(pTHX_ SV* sv, register char *src, register STRLEN slen) { - char *s, *send, *r; - STRLEN grow = 0, j = 1, l; - bool dquote = FALSE; + char *s, *send, *r, *rstart; + STRLEN j, cur = SvCUR(sv); + /* Could count 128-255 and 256+ in two variables, if we want to + be like &qquote and make a distinction. */ + STRLEN grow = 0; /* bytes needed to represent chars 128+ */ + /* STRLEN topbit_grow = 0; bytes needed to represent chars 128-255 */ + STRLEN backslashes = 0; + STRLEN single_quotes = 0; + STRLEN qq_escapables = 0; /* " $ @ will need a \ in "" strings. */ + STRLEN normal = 0; /* this will need EBCDICification */ for (s = src, send = src + slen; s < send; s += UTF8SKIP(s)) { - UV k = utf8_to_uvchr((U8*)s, &l); + UV k = utf8_to_uvchr((U8*)s, NULL); - grow += - (*s == '"' || *s == '\\') ? 2 : - (k < 0x80 ? 1 : UNISKIP(k) + 1 + 4); /* 4: \x{} */ + if (k > 127) { + /* 4: \x{} then count the number of hex digits. */ + grow += 4 + (k <= 0xFF ? 2 : k <= 0xFFF ? 3 : k <= 0xFFFF ? 4 : +#if UVSIZE == 4 + 8 /* We may allocate a bit more than the minimum here. */ +#else + k <= 0xFFFFFFFF ? 8 : UVSIZE * 4 +#endif + ); + } else if (k == '\\') { + backslashes++; + } else if (k == '\'') { + single_quotes++; + } else if (k == '"' || k == '$' || k == '@') { + qq_escapables++; + } else { + normal++; + } } - sv_grow(sv, SvCUR(sv)+3+grow); /* 3: ""\0 */ - r = SvPVX(sv) + SvCUR(sv); + if (grow) { + /* We have something needing hex. 3 is ""\0 */ + sv_grow(sv, cur+3+grow+2*qq_escapables+2*backslashes+normal); + rstart = r = SvPVX(sv) + cur; - for (s = src; s < send; s += UTF8SKIP(s)) { - UV k = utf8_to_uvchr((U8*)s, &l); + *r++ = '"'; - if (*s == '"' || *s == '\\') { - r[j++] = '\\'; - r[j++] = *s; - } - else if (k < 0x80) - r[j++] = k; - else { - r[j++] = '\\'; - r[j++] = 'x'; - r[j++] = '{'; - j += sprintf(r + j, "%"UVxf, k); - r[j++] = '}'; - dquote = TRUE; - } + for (s = src; s < send; s += UTF8SKIP(s)) { + UV k = utf8_to_uvchr((U8*)s, NULL); + + if (k == '"' || k == '\\' || k == '$' || k == '@') { + *r++ = '\\'; + *r++ = k; + } + else if (k < 0x80) + *r++ = k; + else { + r += sprintf(r, "\\x{%"UVxf"}", k); + } + } + *r++ = '"'; + } else { + /* Single quotes. */ + sv_grow(sv, cur+3+grow+2*single_quotes+2*backslashes+normal); + rstart = r = SvPVX(sv) + cur; + *r++ = '\''; + for (s = src; s < send; s ++) { + char k = *s; + if (k == '\'' || k == '\\') + *r++ = '\\'; + *r++ = k; + } + *r++ = '\''; } - if (dquote) - r[0] = r[j++] = '"'; - else - r[0] = r[j++] = '\''; - r[j] = '\0'; - SvCUR_set(sv, SvCUR(sv) + j); + *r = '\0'; + j = r - rstart; + SvCUR_set(sv, cur + j); return j; } diff --git a/ext/Data/Dumper/t/dumper.t b/ext/Data/Dumper/t/dumper.t index 01a386e..d33af74 100755 --- a/ext/Data/Dumper/t/dumper.t +++ b/ext/Data/Dumper/t/dumper.t @@ -67,11 +67,11 @@ sub TEST { $Data::Dumper::Useperl = 1; if (defined &Data::Dumper::Dumpxs) { print "### XS extension loaded, will run XS tests\n"; - $TMAX = 321; $XS = 1; + $TMAX = 339; $XS = 1; } else { print "### XS extensions not loaded, will NOT run XS tests\n"; - $TMAX = 162; $XS = 0; + $TMAX = 171; $XS = 0; } print "1..$TMAX\n"; @@ -1193,9 +1193,6 @@ EOT if $XS; } -#XXX} - - { @a = ( 999999999, @@ -1268,3 +1265,48 @@ EOT } } +#XXX} +{ + $b = "Bad. XS didn't escape dollar sign"; +############# 322 + $WANT = <<"EOT"; # Careful. This is '' string written inside '' here doc +#\$VAR1 = '\$b\"\@\\\\\xA3'; +EOT + + $a = "\$b\"\@\\\xA3\x{100}"; + chop $a; + TEST q(Data::Dumper->Dump([$a])), "utf8 flag with \" and \$"; + if ($XS) { + $WANT = <<'EOT'; # While this is "" string written inside "" here doc +#$VAR1 = "\$b\"\@\\\x{a3}"; +EOT + TEST q(Data::Dumper->Dumpxs([$a])), "XS utf8 flag with \" and \$"; + } + # XS used to produce "$b\"' which is 4 chars, not 3. [ie wrongly qq(\$b\\\")] +############# 328 + $WANT = <<'EOT'; +#$VAR1 = '$b"'; +EOT + + $a = "\$b\"\x{100}"; + chop $a; + TEST q(Data::Dumper->Dump([$a])), "utf8 flag with \" and \$"; + if ($XS) { + TEST q(Data::Dumper->Dumpxs([$a])), "XS utf8 flag with \" and \$"; + } + + + # XS used to produce 'D'oh!' which is well, D'oh! + # Andreas found this one, which in turn discovered the previous two. +############# 334 + $WANT = <<'EOT'; +#$VAR1 = 'D\'oh!'; +EOT + + $a = "D'oh!\x{100}"; + chop $a; + TEST q(Data::Dumper->Dump([$a])), "utf8 flag with '"; + if ($XS) { + TEST q(Data::Dumper->Dumpxs([$a])), "XS utf8 flag with '"; + } +}