# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 2.29 2009/02/01 13:14:37 dankogai Exp $
+# $Id: Changes,v 2.31 2009/02/16 06:18:09 dankogai Exp dankogai $
#
-$Revision: 2.29 $ $Date: 2009/02/01 13:14:37 $
+$Revision: 2.31 $ $Date: 2009/02/16 06:18:09 $
+! lib/Encode/MIME/Header.pm
+ "Revert [29767] and [29771] since it breaks perl 5.8" by miyagawa
+ http://coderepos.org/share/changeset/30111
+
+2.30 2009/02/15 17:44:13
+! encoding.pm
+ fixed regexes, et cetera. by drry
+ http://coderepos.org/share/changeset/29767
+! lib/Encode/MIME/Header.pm
+ Addressed: Encode::MIME::Header::decode should respect CHECK
+ http://rt.cpan.org/Ticket/Display.html?id=43204
+ http://coderepos.org/share/changeset/29767
+
+2.29 2009/02/01 13:14:37
! Encode.pm
VERSION++ just to make PAUSE happy
Message-Id: <877i4anwwt.fsf@k75.linux.bogus>
#
-# $Id: Encode.pm,v 2.29 2009/02/01 13:10:07 dankogai Exp $
+# $Id: Encode.pm,v 2.31 2009/02/16 06:13:11 dankogai Exp $
#
package Encode;
use strict;
use warnings;
-our $VERSION = sprintf "%d.%02d", q$Revision: 2.29 $ =~ /(\d+)/g;
+our $VERSION = sprintf "%d.%02d", q$Revision: 2.31 $ =~ /(\d+)/g;
sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load( __PACKAGE__, $VERSION );
use warnings;
no warnings 'redefine';
-our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+our $VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use XSLoader;
XSLoader::load( __PACKAGE__, $VERSION );
=head1 SYNOPSIS
- use Encode qw/encode decode/;
+ use Encode qw/encode decode/;
$ucs2 = encode("UCS-2BE", $utf8);
$utf8 = decode("UCS-2BE", $ucs2);
$uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
-perl does not prohibit the use of characters within this range. To perl,
+perl does not prohibit the use of characters within this range. To perl,
every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
(*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
Unlike most encodings which accept various ways to handle errors,
Unicode encodings simply croaks.
- % perl -MEncode -e '$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \
- -e 'Encode::from_to($_, "utf16","shift_jis", 0); print'
+ % perl -MEncode -e'$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \
+ -e'Encode::from_to($_, "utf16","shift_jis", 0); print'
UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184.
- % perl -MEncode -e '$a = "BOM missing"' \
- -e ' Encode::from_to($a, "utf16", "shift_jis", 0); print'
+ % perl -MEncode -e'$a = "BOM missing"' \
+ -e' Encode::from_to($a, "utf16", "shift_jis", 0); print'
UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184.
Unlike other encodings where mappings are not one-to-one against
The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html>
Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
-by Larry Wall, Tom Christiansen, Jon Orwant;
+by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8
=cut
/*
- $Id: Unicode.xs,v 2.5 2009/02/01 13:14:41 dankogai Exp dankogai $
+ $Id: Unicode.xs,v 2.5 2009/02/01 13:14:41 dankogai Exp $
*/
#define PERL_NO_GET_CONTEXT
-# $Id: encoding.pm,v 2.7 2008/03/12 09:51:11 dankogai Exp $
+# $Id: encoding.pm,v 2.8 2009/02/15 17:44:13 dankogai Exp $
package encoding;
our $VERSION = '2.6_01';
# or you can even do this if your shell supports your native encoding
- perl -Mencoding=latin2 -e '...' # Feeling centrally European?
- perl -Mencoding=euc-kr -e '...' # Or Korean?
+ perl -Mencoding=latin2 -e'...' # Feeling centrally European?
+ perl -Mencoding=euc-kr -e'...' # Or Korean?
# more control
accidentally escape the quoting character that follows. Perl 5.8.1
or later fixes this problem.
-=item tr//
+=item tr//
C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
See the section below for details.
=item DATA pseudo-filehandle
-Another feature that was overlooked was C<DATA>.
+Another feature that was overlooked was C<DATA>.
=back
=item use encoding [I<ENCNAME>] ;
-Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
+Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
":encoding(I<ENCNAME>)".
=head2 NOT SCOPED
The pragma is a per script, not a per block lexical. Only the last
-C<use encoding> or C<no encoding> matters, and it affects
-B<the whole script>. However, the <no encoding> pragma is supported and
-B<use encoding> can appear as many times as you want in a given script.
+C<use encoding> or C<no encoding> matters, and it affects
+B<the whole script>. However, the <no encoding> pragma is supported and
+B<use encoding> can appear as many times as you want in a given script.
The multiple use of this pragma is discouraged.
By the same reason, the use this pragma inside modules is also
-discouraged (though not as strongly discouraged as the case above.
+discouraged (though not as strongly discouraged as the case above.
See below).
If you still have to write a module with this pragma, be very careful
.
$camel = "*non-ascii*";
binmode(STDOUT=>':encoding(utf8)'); # bang!
- write; # funny
+ write; # funny
print $camel, "\n"; # fine
Without binmode this happens to work but without binmode, print()
If 1. didn't work but we are under the locale pragma, the environment
variables LC_ALL and LANG (in that order) are matched for encodings
-(the part after C<.>, if any), and if any found, that is used
+(the part after C<.>, if any), and if any found, that is used
as the default encoding for the open pragma.
=item 3.
=head1 HISTORY
-This pragma first appeared in Perl 5.8.0. For features that require
+This pragma first appeared in Perl 5.8.0. For features that require
5.8.1 and better, see above.
The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
use warnings;
no warnings 'redefine';
-our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+our $VERSION = do { my @r = ( q$Revision: 2.9 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use Encode qw(find_encoding encode_utf8 decode_utf8);
use MIME::Base64;
use Carp;
$str =~ s/\?=\s+=\?/\?==\?/gos;
# multi-line header to single line
- $str =~ s/(?:\r|\n|\r\n)[ \t]+//gos;
+ $str =~ s/(?:\r\n|[\r\n])[ \t]+//gos;
1 while ( $str =~
- s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/ )
+ s/(=\?[-0-9A-Za-z_]+\?[Qq]\?)(.*?)\?=\1(.*?\?=)/$1$2$3/ )
; # Concat consecutive QP encoded mime headers
# Fixes breaking inside multi-byte characters
$str =~ s{
- =\? # begin encoded word
- ([0-9A-Za-z\-_]+) # charset (encoding)
- (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231)
+ =\? # begin encoded word
+ ([-0-9A-Za-z_]+) # charset (encoding)
+ (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231)
\?([QqBb])\? # delimiter
(.*?) # Base64-encodede contents
- \?= # end encoded word
- }{
- if (uc($2) eq 'B'){
+ \?= # end encoded word
+ }{
+ if (uc($2) eq 'B'){
$obj->{decode_b} or croak qq(MIME "B" unsupported);
- decode_b($1, $3);
- }elsif(uc($2) eq 'Q'){
+ decode_b($1, $3, $chk);
+ } elsif (uc($2) eq 'Q'){
$obj->{decode_q} or croak qq(MIME "Q" unsupported);
- decode_q($1, $3);
- }else{
+ decode_q($1, $3, $chk);
+ } else {
croak qq(MIME "$2" encoding is nonexistent!);
}
- }egox;
- $_[1] = '' if $chk;
+ }egox;
+ $_[1] = $str if $chk;
return $str;
}
my $enc = shift;
my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
my $db64 = decode_base64(shift);
+ my $chk = shift;
return $d->name eq 'utf8'
? Encode::decode_utf8($db64)
- : $d->decode( $db64, Encode::FB_PERLQQ );
+ : $d->decode( $db64, $chk || Encode::FB_PERLQQ );
}
sub decode_q {
- my ( $enc, $q ) = @_;
+ my ( $enc, $q, $chk ) = @_;
my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
$q =~ s/_/ /go;
$q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego;
return $d->name eq 'utf8'
? Encode::decode_utf8($q)
- : $d->decode( $q, Encode::FB_PERLQQ );
+ : $d->decode( $q, $chk || Encode::FB_PERLQQ );
}
my $especials =
join( '|' => map { quotemeta( chr($_) ) }
- unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) );
+ unpack( "C*", qq{()<>@,;:"'/[]?.=} ) );
my $re_encoded_word = qr{
- (?:
- =\? # begin encoded word
- (?:[0-9A-Za-z\-_]+) # charset (encoding)
- (?:\*\w+(?:-\w+)*)? # language (RFC 2231)
- \?(?:[QqBb])\? # delimiter
- (?:.*?) # Base64-encodede contents
- \?= # end encoded word
- )
- }xo;
+ =\? # begin encoded word
+ (?:[-0-9A-Za-z_]+) # charset (encoding)
+ (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231)
+ \?(?:[QqBb])\? # delimiter
+ (?:.*?) # Base64-encodede contents
+ \?= # end encoded word
+}xo;
my $re_especials = qr{$re_encoded_word|$especials}xo;
sub encode($$;$) {
my ( $obj, $str, $chk ) = @_;
my @line = ();
- for my $line ( split /\r|\n|\r\n/o, $str ) {
+ for my $line ( split /\r\n|[\r\n]/o, $str ) {
my ( @word, @subline );
for my $word ( split /($re_especials)/o, $line ) {
if ( $word =~ /[^\x00-\x7f]/o
my $chunk = shift;
$chunk = encode_utf8($chunk);
$chunk =~ s{
- ([^0-9A-Za-z])
- }{
- join("" => map {sprintf "=%02X", $_} unpack("C*", $1))
- }egox;
+ [^0-9A-Za-z]
+ }{
+ join("" => map {sprintf "=%02X", $_} unpack("C*", $&))
+ }egox;
return HEAD . 'Q?' . $chunk . TAIL;
}
=head1 SYNOPSIS
- use Encode qw/encode decode/;
+ use Encode qw/encode decode/;
$utf8 = decode('MIME-Header', $header);
$header = encode('MIME-Header', $utf8);
L<Encode>
RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other
-locations.
+locations.
=cut