ext/Encode/lib/Encode/MIME/Header.pm

   1 package Encode::MIME::Header;
   2 use strict;
   3 # use warnings;
   4 our $VERSION = do { my @r = (q$Revision: 1.7 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   5
   6 use Encode qw(find_encoding encode_utf8);
   7 use MIME::Base64;
   8 use Carp;
   9
  10 my %seed =
  11     (
  12      decode_b     => '1', # decodes 'B' encoding ?
  13      decode_q     => '1', # decodes 'Q' encoding ?
  14      encode       => 'B', # encode with 'B' or 'Q' ?
  15      bpl          => 75,  # bytes per line
  16      );
  17
  18 $Encode::Encoding{'MIME-Header'} =
  19     bless {
  20         %seed,
  21         Name => 'MIME-Header',
  22     } => __PACKAGE__;
  23
  24 $Encode::Encoding{'MIME-B'} =
  25     bless {
  26         %seed,
  27         decode_q  => 0,
  28         Name      => 'MIME-B',
  29     } => __PACKAGE__;
  30
  31 $Encode::Encoding{'MIME-Q'} =
  32     bless {
  33         %seed,
  34         decode_q    => 1,
  35         encode      => 'Q',
  36         Name        => 'MIME-Q',
  37     } => __PACKAGE__;
  38
  39 use base qw(Encode::Encoding);
  40
  41 sub needs_lines { 1 }
  42 sub perlio_ok{ 0 };
  43
  44 sub decode($$;$){
  45     use utf8;
  46     my ($obj, $str, $chk) = @_;
  47     # zap spaces between encoded words
  48     $str =~ s/\?=\s+=\?/\?==\?/gos;
  49     # multi-line header to single line
  50     $str =~ s/(:?\r|\n|\r\n)[ \t]//gos;
  51     $str =~
  52         s{
  53             =\?                  # begin encoded word
  54                 ([0-9A-Za-z\-_]+) # charset (encoding)
  55                 \?([QqBb])\?     # delimiter
  56                 (.*?)            # Base64-encodede contents
  57                 \?=              # end encoded word
  58             }{
  59                 if    (uc($2) eq 'B'){
  60                     $obj->{decode_b} or croak qq(MIME "B" unsupported);
  61                     decode_b($1, $3);
  62                 }elsif(uc($2) eq 'Q'){
  63                     $obj->{decode_q} or croak qq(MIME "Q" unsupported);
  64                     decode_q($1, $3);
  65                 }else{
  66                     croak qq(MIME "$2" encoding is nonexistent!);
  67                 }
  68             }egox;
  69     $_[1] = '' if $chk;
  70     return $str;
  71 }
  72
  73 sub decode_b{
  74     my $enc = shift;
  75     my $d = find_encoding($enc) or croak(Unknown encoding "$enc");
  76     my $db64 = decode_base64(shift);
  77     return $d->name eq 'utf8' ?
  78         Encode::decode_utf8($db64) : $d->decode($db64, Encode::FB_PERLQQ);
  79 }
  80
  81 sub decode_q{
  82     my ($enc, $q) = @_;
  83     my $d = find_encoding($enc) or croak(Unknown encoding "$enc");
  84     $q =~ s/_/ /go;
  85     $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego;
  86     return $d->name eq 'utf8' ?
  87         Encode::decode_utf8($q) : $d->decode($q, Encode::FB_PERLQQ);
  88 }
  89
  90 my $especials =
  91     join('|' =>
  92          map {quotemeta(chr($_))}
  93          unpack("C*", qq{()<>@,;:\"\'/[]?.=}));
  94
  95 my $re_especials = qr/$especials/o;
  96
  97 sub encode($$;$){
  98     my ($obj, $str, $chk) = @_;
  99     my @line = ();
 100     for my $line (split /\r|\n|\r\n/o, $str){
 101         my (@word, @subline);
 102         for my $word (split /($re_especials)/o, $line){
 103             if ($word =~ /[^\x00-\x7f]/o){
 104                 push @word, $obj->_encode($word);
 105             }else{
 106                 push @word, $word;
 107             }
 108         }
 109         my $subline = '';
 110         for my $word (@word){
 111             use bytes ();
 112             if (bytes::length($subline) + bytes::length($word) > $obj->{bpl}){
 113                 push @subline, $subline;
 114                 $subline = '';
 115             }
 116             $subline .= $word;
 117         }
 118         $subline and push @subline, $subline;
 119         push @line, join("\n " => @subline);
 120     }
 121     $_[1] = '' if $chk;
 122     return join("\n", @line);
 123 }
 124
 125 use constant HEAD  => '=?UTF-8?';
 126 use constant TAIL    => '?=';
 127 use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, };
 128
 129 sub _encode{
 130     my ($o, $str) = @_;
 131     my $enc = $o->{encode};
 132     my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL));
 133     # to coerce a floating-point arithmetics, the following contains
 134     # .0 in numbers -- dankogai
 135     $llen *= $enc eq 'B' ? 3.0/4.0 : 1.0/3.0;
 136     my @result = ();
 137     my $chunk = '';
 138     while(my $chr = substr($str, 0, 1, '')){
 139         use bytes ();
 140         if (bytes::length($chunk) + bytes::length($chr) > $llen){
 141             push @result, SINGLE->{$enc}($chunk);
 142             $chunk = '';
 143         }
 144         $chunk .= $chr;
 145     }
 146     $chunk and push @result, SINGLE->{$enc}($chunk);
 147     return @result;
 148 }
 149
 150 sub _encode_b{
 151     HEAD . 'B?' . encode_base64(encode_utf8(shift), '') . TAIL;
 152 }
 153
 154 sub _encode_q{
 155     my $chunk = shift;
 156     $chunk =~ s{
 157                 ([^0-9A-Za-z])
 158                }{
 159                    join("" => map {sprintf "=%02X", $_} unpack("C*", $1))
 160                }egox;
 161     return HEAD . 'Q?' . $chunk . TAIL;
 162 }
 163
 164 1;
 165 __END__
 166
 167 =head1 NAME
 168
 169 Encode::MIME::Header -- MIME 'B' and 'Q' header encoding
 170
 171 =head1 SYNOPSIS
 172
 173     use Encode qw/encode decode/;
 174     $utf8   = decode('MIME-Header', $header);
 175     $header = encode('MIME-Header', $utf8);
 176
 177 =head1 ABSTRACT
 178
 179 This module implements RFC 2047 Mime Header Encoding.  There are 3
 180 variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>.  The
 181 difference is described below
 182
 183               decode()          encode()
 184   ----------------------------------------------
 185   MIME-Header Both B and Q      =?UTF-8?B?....?=
 186   MIME-B      B only; Q croaks  =?UTF-8?B?....?=
 187   MIME-Q      Q only; B croaks  =?UTF-8?Q?....?=
 188
 189 =head1 DESCRIPTION
 190
 191 When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD>
 192 is extracted and decoded for I<X> encoding (B for Base64, Q for
 193 Quoted-Printable). Then the decoded chunk is fed to
 194 decode(I<encoding>).  So long as I<encoding> is supported by Encode,
 195 any source encoding is fine.
 196
 197 When you encode, it just encodes UTF-8 string with I<X> encoding then
 198 quoted with =?UTF-8?I<X>?....?= .  The parts that RFC 2047 forbids to
 199 encode are left as is and long lines are folded within 76 bytes per
 200 line.
 201
 202 =head1 BUGS
 203
 204 It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP?
 205 and =?ISO-8859-1?= but that makes the implementation too complicated.
 206 These days major mail agents all support =?UTF-8? so I think it is
 207 just good enough.
 208
 209 =head1 SEE ALSO
 210
 211 L<Encode>
 212
 213 RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other
 214 locations.
 215
 216 =cut