ext/Encode/lib/Encode/MIME/Header.pm

   1 package Encode::MIME::Header;
   2 use strict;
   3 # use warnings;
   4 our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   5 use Encode qw(find_encoding encode_utf8 decode_utf8);
   6 use MIME::Base64;
   7 use Carp;
   8
   9 my %seed =
  10     (
  11      decode_b     => '1', # decodes 'B' encoding ?
  12      decode_q     => '1', # decodes 'Q' encoding ?
  13      encode       => 'B', # encode with 'B' or 'Q' ?
  14      bpl          => 75,  # bytes per line
  15      );
  16
  17 $Encode::Encoding{'MIME-Header'} =
  18     bless {
  19         %seed,
  20         Name => 'MIME-Header',
  21     } => __PACKAGE__;
  22
  23 $Encode::Encoding{'MIME-B'} =
  24     bless {
  25         %seed,
  26         decode_q  => 0,
  27         Name      => 'MIME-B',
  28     } => __PACKAGE__;
  29
  30 $Encode::Encoding{'MIME-Q'} =
  31     bless {
  32         %seed,
  33         decode_q    => 1,
  34         encode      => 'Q',
  35         Name        => 'MIME-Q',
  36     } => __PACKAGE__;
  37
  38 use base qw(Encode::Encoding);
  39
  40 sub needs_lines { 1 }
  41 sub perlio_ok{ 0 };
  42
  43 sub decode($$;$){
  44     use utf8;
  45     my ($obj, $str, $chk) = @_;
  46     # zap spaces between encoded words
  47     $str =~ s/\?=\s+=\?/\?==\?/gos;
  48     # multi-line header to single line
  49     $str =~ s/(:?\r|\n|\r\n)[ \t]//gos;
  50
  51     1 while ($str =~ s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/);  # Concat consecutive QP encoded mime headers
  52                                                                                         # Fixes breaking inside multi-byte characters
  53
  54     $str =~
  55         s{
  56             =\?                  # begin encoded word
  57                 ([0-9A-Za-z\-_]+) # charset (encoding)
  58                 (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231)
  59                 \?([QqBb])\?     # delimiter
  60                 (.*?)            # Base64-encodede contents
  61                 \?=              # end encoded word
  62             }{
  63                 if    (uc($2) eq 'B'){
  64                     $obj->{decode_b} or croak qq(MIME "B" unsupported);
  65                     decode_b($1, $3);
  66                 }elsif(uc($2) eq 'Q'){
  67                     $obj->{decode_q} or croak qq(MIME "Q" unsupported);
  68                     decode_q($1, $3);
  69                 }else{
  70                     croak qq(MIME "$2" encoding is nonexistent!);
  71                 }
  72             }egox;
  73     $_[1] = '' if $chk;
  74     return $str;
  75 }
  76
  77 sub decode_b{
  78     my $enc = shift;
  79     my $d = find_encoding($enc)        or croak qq(Unknown encoding "$enc");
  80     my $db64 = decode_base64(shift);
  81     return $d->name eq 'utf8' ?
  82         Encode::decode_utf8($db64) : $d->decode($db64, Encode::FB_PERLQQ);
  83 }
  84
  85 sub decode_q{
  86     my ($enc, $q) = @_;
  87     my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
  88     $q =~ s/_/ /go;
  89     $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego;
  90     return $d->name eq 'utf8' ?
  91         Encode::decode_utf8($q) : $d->decode($q, Encode::FB_PERLQQ);
  92 }
  93
  94 my $especials =
  95     join('|' =>
  96          map {quotemeta(chr($_))}
  97          unpack("C*", qq{()<>@,;:\"\'/[]?.=}));
  98
  99 my $re_encoded_word =
 100     qr{
 101        (?:
 102         =\?               # begin encoded word
 103         (?:[0-9A-Za-z\-_]+) # charset (encoding)
 104         (?:\*\w+(?:-\w+)*)? # language (RFC 2231)
 105         \?(?:[QqBb])\?      # delimiter
 106         (?:.*?)             # Base64-encodede contents
 107         \?=                 # end encoded word
 108        )
 109       }xo;
 110
 111 my $re_especials = qr{$re_encoded_word|$especials}xo;
 112
 113 sub encode($$;$){
 114     my ($obj, $str, $chk) = @_;
 115     my @line = ();
 116     for my $line (split /\r|\n|\r\n/o, $str){
 117         my (@word, @subline);
 118         for my $word (split /($re_especials)/o, $line){
 119             if ($word =~ /[^\x00-\x7f]/o or $word =~ /^$re_encoded_word$/o){
 120                 push @word, $obj->_encode($word);
 121             }else{
 122                 push @word, $word;
 123             }
 124         }
 125         my $subline = '';
 126         for my $word (@word){
 127             use bytes ();
 128             if (bytes::length($subline) + bytes::length($word) > $obj->{bpl}){
 129                 push @subline, $subline;
 130                 $subline = '';
 131             }
 132             $subline .= $word;
 133         }
 134         $subline and push @subline, $subline;
 135         push @line, join("\n " => @subline);
 136     }
 137     $_[1] = '' if $chk;
 138     return join("\n", @line);
 139 }
 140
 141 use constant HEAD  => '=?UTF-8?';
 142 use constant TAIL    => '?=';
 143 use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, };
 144
 145 sub _encode{
 146     my ($o, $str) = @_;
 147     my $enc = $o->{encode};
 148     my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL));
 149     # to coerce a floating-point arithmetics, the following contains
 150     # .0 in numbers -- dankogai
 151     $llen *= $enc eq 'B' ? 3.0/4.0 : 1.0/3.0;
 152     my @result = ();
 153     my $chunk = '';
 154     while(length(my $chr = substr($str, 0, 1, ''))){
 155         use bytes ();
 156         if (bytes::length($chunk) + bytes::length($chr) > $llen){
 157             push @result, SINGLE->{$enc}($chunk);
 158             $chunk = '';
 159         }
 160         $chunk .= $chr;
 161     }
 162     $chunk and push @result, SINGLE->{$enc}($chunk);
 163     return @result;
 164 }
 165
 166 sub _encode_b{
 167     HEAD . 'B?' . encode_base64(encode_utf8(shift), '') . TAIL;
 168 }
 169
 170 sub _encode_q{
 171     my $chunk = shift;
 172     $chunk =~ s{
 173                 ([^0-9A-Za-z])
 174                }{
 175                    join("" => map {sprintf "=%02X", $_} unpack("C*", $1))
 176                }egox;
 177     return decode_utf8(HEAD . 'Q?' . $chunk . TAIL);
 178 }
 179
 180 1;
 181 __END__
 182
 183 =head1 NAME
 184
 185 Encode::MIME::Header -- MIME 'B' and 'Q' header encoding
 186
 187 =head1 SYNOPSIS
 188
 189     use Encode qw/encode decode/;
 190     $utf8   = decode('MIME-Header', $header);
 191     $header = encode('MIME-Header', $utf8);
 192
 193 =head1 ABSTRACT
 194
 195 This module implements RFC 2047 Mime Header Encoding.  There are 3
 196 variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>.  The
 197 difference is described below
 198
 199               decode()          encode()
 200   ----------------------------------------------
 201   MIME-Header Both B and Q      =?UTF-8?B?....?=
 202   MIME-B      B only; Q croaks  =?UTF-8?B?....?=
 203   MIME-Q      Q only; B croaks  =?UTF-8?Q?....?=
 204
 205 =head1 DESCRIPTION
 206
 207 When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD>
 208 is extracted and decoded for I<X> encoding (B for Base64, Q for
 209 Quoted-Printable). Then the decoded chunk is fed to
 210 decode(I<encoding>).  So long as I<encoding> is supported by Encode,
 211 any source encoding is fine.
 212
 213 When you encode, it just encodes UTF-8 string with I<X> encoding then
 214 quoted with =?UTF-8?I<X>?....?= .  The parts that RFC 2047 forbids to
 215 encode are left as is and long lines are folded within 76 bytes per
 216 line.
 217
 218 =head1 BUGS
 219
 220 It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP?
 221 and =?ISO-8859-1?= but that makes the implementation too complicated.
 222 These days major mail agents all support =?UTF-8? so I think it is
 223 just good enough.
 224
 225 Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by
 226 Makamaka.  Thre are still too many MUAs especially cellular phone
 227 handsets which does not grok UTF-8.
 228
 229 =head1 SEE ALSO
 230
 231 L<Encode>
 232
 233 RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other
 234 locations.
 235
 236 =cut