From: Dan Kogai Date: Mon, 16 Feb 2009 10:45:53 +0000 (+0100) Subject: Upgrade to Encode 2.31 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=40bed538bd6707bb5804c5afb3f7d8bd26c2bddb;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 2.31 --- diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 69ffd5d..1828e49 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,8 +1,22 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 2.29 2009/02/01 13:14:37 dankogai Exp $ +# $Id: Changes,v 2.31 2009/02/16 06:18:09 dankogai Exp dankogai $ # -$Revision: 2.29 $ $Date: 2009/02/01 13:14:37 $ +$Revision: 2.31 $ $Date: 2009/02/16 06:18:09 $ +! lib/Encode/MIME/Header.pm + "Revert [29767] and [29771] since it breaks perl 5.8" by miyagawa + http://coderepos.org/share/changeset/30111 + +2.30 2009/02/15 17:44:13 +! encoding.pm + fixed regexes, et cetera. by drry + http://coderepos.org/share/changeset/29767 +! lib/Encode/MIME/Header.pm + Addressed: Encode::MIME::Header::decode should respect CHECK + http://rt.cpan.org/Ticket/Display.html?id=43204 + http://coderepos.org/share/changeset/29767 + +2.29 2009/02/01 13:14:37 ! Encode.pm VERSION++ just to make PAUSE happy Message-Id: <877i4anwwt.fsf@k75.linux.bogus> diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 1c6e7c8..b0344d1 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,10 +1,10 @@ # -# $Id: Encode.pm,v 2.29 2009/02/01 13:10:07 dankogai Exp $ +# $Id: Encode.pm,v 2.31 2009/02/16 06:13:11 dankogai Exp $ # package Encode; use strict; use warnings; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.29 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.31 $ =~ /(\d+)/g; sub DEBUG () { 0 } use XSLoader (); XSLoader::load( __PACKAGE__, $VERSION ); diff --git a/ext/Encode/Unicode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm index cdfe02d..16982bb 100644 --- a/ext/Encode/Unicode/Unicode.pm +++ b/ext/Encode/Unicode/Unicode.pm @@ -4,7 +4,7 @@ use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; XSLoader::load( __PACKAGE__, $VERSION ); @@ -69,7 +69,7 @@ Encode::Unicode -- Various Unicode Transformation Formats =head1 SYNOPSIS - use Encode qw/encode decode/; + use Encode qw/encode decode/; $ucs2 = encode("UCS-2BE", $utf8); $utf8 = decode("UCS-2BE", $ucs2); @@ -230,7 +230,7 @@ And to desurrogate; $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but -perl does not prohibit the use of characters within this range. To perl, +perl does not prohibit the use of characters within this range. To perl, every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I. (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit @@ -241,11 +241,11 @@ every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I. Unlike most encodings which accept various ways to handle errors, Unicode encodings simply croaks. - % perl -MEncode -e '$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \ - -e 'Encode::from_to($_, "utf16","shift_jis", 0); print' + % perl -MEncode -e'$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \ + -e'Encode::from_to($_, "utf16","shift_jis", 0); print' UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184. - % perl -MEncode -e '$a = "BOM missing"' \ - -e ' Encode::from_to($a, "utf16", "shift_jis", 0); print' + % perl -MEncode -e'$a = "BOM missing"' \ + -e' Encode::from_to($a, "utf16", "shift_jis", 0); print' UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184. Unlike other encodings where mappings are not one-to-one against @@ -264,7 +264,7 @@ RFC 2781 L, The whole Unicode standard L Ch. 15, pp. 403 of C -by Larry Wall, Tom Christiansen, Jon Orwant; +by Larry Wall, Tom Christiansen, Jon Orwant; O'Reilly & Associates; ISBN 0-596-00027-8 =cut diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs index 3283ced..1f041d4 100644 --- a/ext/Encode/Unicode/Unicode.xs +++ b/ext/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 2.5 2009/02/01 13:14:41 dankogai Exp dankogai $ + $Id: Unicode.xs,v 2.5 2009/02/01 13:14:41 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index 77ba447..be20a49 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,4 +1,4 @@ -# $Id: encoding.pm,v 2.7 2008/03/12 09:51:11 dankogai Exp $ +# $Id: encoding.pm,v 2.8 2009/02/15 17:44:13 dankogai Exp $ package encoding; our $VERSION = '2.6_01'; @@ -206,8 +206,8 @@ encoding - allows you to write your script in non-ascii or non-utf8 # or you can even do this if your shell supports your native encoding - perl -Mencoding=latin2 -e '...' # Feeling centrally European? - perl -Mencoding=euc-kr -e '...' # Or Korean? + perl -Mencoding=latin2 -e'...' # Feeling centrally European? + perl -Mencoding=euc-kr -e'...' # Or Korean? # more control @@ -331,14 +331,14 @@ encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; accidentally escape the quoting character that follows. Perl 5.8.1 or later fixes this problem. -=item tr// +=item tr// C was overlooked by Perl 5 porters when they released perl 5.8.0 See the section below for details. =item DATA pseudo-filehandle -Another feature that was overlooked was C. +Another feature that was overlooked was C. =back @@ -348,7 +348,7 @@ Another feature that was overlooked was C. =item use encoding [I] ; -Sets the script encoding to I. And unless ${^UNICODE} +Sets the script encoding to I. And unless ${^UNICODE} exists and non-zero, PerlIO layers of STDIN and STDOUT are set to ":encoding(I)". @@ -426,13 +426,13 @@ utf8> to C<${"\x{4eba}"}++>. =head2 NOT SCOPED The pragma is a per script, not a per block lexical. Only the last -C or C matters, and it affects -B. However, the pragma is supported and -B can appear as many times as you want in a given script. +C or C matters, and it affects +B. However, the pragma is supported and +B can appear as many times as you want in a given script. The multiple use of this pragma is discouraged. By the same reason, the use this pragma inside modules is also -discouraged (though not as strongly discouraged as the case above. +discouraged (though not as strongly discouraged as the case above. See below). If you still have to write a module with this pragma, be very careful @@ -601,7 +601,7 @@ To understand it, try the code below. . $camel = "*non-ascii*"; binmode(STDOUT=>':encoding(utf8)'); # bang! - write; # funny + write; # funny print $camel, "\n"; # fine Without binmode this happens to work but without binmode, print() @@ -634,7 +634,7 @@ returned is used as the default encoding for the open pragma. If 1. didn't work but we are under the locale pragma, the environment variables LC_ALL and LANG (in that order) are matched for encodings -(the part after C<.>, if any), and if any found, that is used +(the part after C<.>, if any), and if any found, that is used as the default encoding for the open pragma. =item 3. @@ -653,7 +653,7 @@ B, is UTF-8. =head1 HISTORY -This pragma first appeared in Perl 5.8.0. For features that require +This pragma first appeared in Perl 5.8.0. For features that require 5.8.1 and better, see above. The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. diff --git a/ext/Encode/lib/Encode/MIME/Header.pm b/ext/Encode/lib/Encode/MIME/Header.pm index e9bf93b..624bf17 100644 --- a/ext/Encode/lib/Encode/MIME/Header.pm +++ b/ext/Encode/lib/Encode/MIME/Header.pm @@ -3,7 +3,7 @@ use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.9 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(find_encoding encode_utf8 decode_utf8); use MIME::Base64; use Carp; @@ -44,32 +44,32 @@ sub decode($$;$) { $str =~ s/\?=\s+=\?/\?==\?/gos; # multi-line header to single line - $str =~ s/(?:\r|\n|\r\n)[ \t]+//gos; + $str =~ s/(?:\r\n|[\r\n])[ \t]+//gos; 1 while ( $str =~ - s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/ ) + s/(=\?[-0-9A-Za-z_]+\?[Qq]\?)(.*?)\?=\1(.*?\?=)/$1$2$3/ ) ; # Concat consecutive QP encoded mime headers # Fixes breaking inside multi-byte characters $str =~ s{ - =\? # begin encoded word - ([0-9A-Za-z\-_]+) # charset (encoding) - (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) + =\? # begin encoded word + ([-0-9A-Za-z_]+) # charset (encoding) + (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) \?([QqBb])\? # delimiter (.*?) # Base64-encodede contents - \?= # end encoded word - }{ - if (uc($2) eq 'B'){ + \?= # end encoded word + }{ + if (uc($2) eq 'B'){ $obj->{decode_b} or croak qq(MIME "B" unsupported); - decode_b($1, $3); - }elsif(uc($2) eq 'Q'){ + decode_b($1, $3, $chk); + } elsif (uc($2) eq 'Q'){ $obj->{decode_q} or croak qq(MIME "Q" unsupported); - decode_q($1, $3); - }else{ + decode_q($1, $3, $chk); + } else { croak qq(MIME "$2" encoding is nonexistent!); } - }egox; - $_[1] = '' if $chk; + }egox; + $_[1] = $str if $chk; return $str; } @@ -77,42 +77,41 @@ sub decode_b { my $enc = shift; my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); my $db64 = decode_base64(shift); + my $chk = shift; return $d->name eq 'utf8' ? Encode::decode_utf8($db64) - : $d->decode( $db64, Encode::FB_PERLQQ ); + : $d->decode( $db64, $chk || Encode::FB_PERLQQ ); } sub decode_q { - my ( $enc, $q ) = @_; + my ( $enc, $q, $chk ) = @_; my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); $q =~ s/_/ /go; $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; return $d->name eq 'utf8' ? Encode::decode_utf8($q) - : $d->decode( $q, Encode::FB_PERLQQ ); + : $d->decode( $q, $chk || Encode::FB_PERLQQ ); } my $especials = join( '|' => map { quotemeta( chr($_) ) } - unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) ); + unpack( "C*", qq{()<>@,;:"'/[]?.=} ) ); my $re_encoded_word = qr{ - (?: - =\? # begin encoded word - (?:[0-9A-Za-z\-_]+) # charset (encoding) - (?:\*\w+(?:-\w+)*)? # language (RFC 2231) - \?(?:[QqBb])\? # delimiter - (?:.*?) # Base64-encodede contents - \?= # end encoded word - ) - }xo; + =\? # begin encoded word + (?:[-0-9A-Za-z_]+) # charset (encoding) + (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) + \?(?:[QqBb])\? # delimiter + (?:.*?) # Base64-encodede contents + \?= # end encoded word +}xo; my $re_especials = qr{$re_encoded_word|$especials}xo; sub encode($$;$) { my ( $obj, $str, $chk ) = @_; my @line = (); - for my $line ( split /\r|\n|\r\n/o, $str ) { + for my $line ( split /\r\n|[\r\n]/o, $str ) { my ( @word, @subline ); for my $word ( split /($re_especials)/o, $line ) { if ( $word =~ /[^\x00-\x7f]/o @@ -176,10 +175,10 @@ sub _encode_q { my $chunk = shift; $chunk = encode_utf8($chunk); $chunk =~ s{ - ([^0-9A-Za-z]) - }{ - join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) - }egox; + [^0-9A-Za-z] + }{ + join("" => map {sprintf "=%02X", $_} unpack("C*", $&)) + }egox; return HEAD . 'Q?' . $chunk . TAIL; } @@ -192,7 +191,7 @@ Encode::MIME::Header -- MIME 'B' and 'Q' header encoding =head1 SYNOPSIS - use Encode qw/encode decode/; + use Encode qw/encode decode/; $utf8 = decode('MIME-Header', $header); $header = encode('MIME-Header', $utf8); @@ -237,6 +236,6 @@ handsets which does not grok UTF-8. L RFC 2047, L and many other -locations. +locations. =cut