ext/Encode/lib/Encode/MIME/Header.pm

   1 package Encode::MIME::Header;
   2 use strict;
   3 use warnings;
   4 no warnings 'redefine';
   5
   6 our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
   7 use Encode qw(find_encoding encode_utf8 decode_utf8);
   8 use MIME::Base64;
   9 use Carp;
  10
  11 my %seed = (
  12     decode_b => '1',    # decodes 'B' encoding ?
  13     decode_q => '1',    # decodes 'Q' encoding ?
  14     encode   => 'B',    # encode with 'B' or 'Q' ?
  15     bpl      => 75,     # bytes per line
  16 );
  17
  18 $Encode::Encoding{'MIME-Header'} =
  19   bless { %seed, Name => 'MIME-Header', } => __PACKAGE__;
  20
  21 $Encode::Encoding{'MIME-B'} = bless {
  22     %seed,
  23     decode_q => 0,
  24     Name     => 'MIME-B',
  25 } => __PACKAGE__;
  26
  27 $Encode::Encoding{'MIME-Q'} = bless {
  28     %seed,
  29     decode_q => 1,
  30     encode   => 'Q',
  31     Name     => 'MIME-Q',
  32 } => __PACKAGE__;
  33
  34 use base qw(Encode::Encoding);
  35
  36 sub needs_lines { 1 }
  37 sub perlio_ok   { 0 }
  38
  39 sub decode($$;$) {
  40     use utf8;
  41     my ( $obj, $str, $chk ) = @_;
  42
  43     # zap spaces between encoded words
  44     $str =~ s/\?=\s+=\?/\?==\?/gos;
  45
  46     # multi-line header to single line
  47     $str =~ s/(?:\r|\n|\r\n)[ \t]+//gos;
  48
  49     1 while ( $str =~
  50         s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/ )
  51       ;    # Concat consecutive QP encoded mime headers
  52            # Fixes breaking inside multi-byte characters
  53
  54     $str =~ s{
  55         =\?                  # begin encoded word
  56         ([0-9A-Za-z\-_]+) # charset (encoding)
  57                 (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231)
  58         \?([QqBb])\?     # delimiter
  59         (.*?)            # Base64-encodede contents
  60         \?=              # end encoded word
  61         }{
  62         if    (uc($2) eq 'B'){
  63             $obj->{decode_b} or croak qq(MIME "B" unsupported);
  64             decode_b($1, $3);
  65         }elsif(uc($2) eq 'Q'){
  66             $obj->{decode_q} or croak qq(MIME "Q" unsupported);
  67             decode_q($1, $3);
  68         }else{
  69             croak qq(MIME "$2" encoding is nonexistent!);
  70         }
  71         }egox;
  72     $_[1] = '' if $chk;
  73     return $str;
  74 }
  75
  76 sub decode_b {
  77     my $enc  = shift;
  78     my $d    = find_encoding($enc) or croak qq(Unknown encoding "$enc");
  79     my $db64 = decode_base64(shift);
  80     return $d->name eq 'utf8'
  81       ? Encode::decode_utf8($db64)
  82       : $d->decode( $db64, Encode::FB_PERLQQ );
  83 }
  84
  85 sub decode_q {
  86     my ( $enc, $q ) = @_;
  87     my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
  88     $q =~ s/_/ /go;
  89     $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego;
  90     return $d->name eq 'utf8'
  91       ? Encode::decode_utf8($q)
  92       : $d->decode( $q, Encode::FB_PERLQQ );
  93 }
  94
  95 my $especials =
  96   join( '|' => map { quotemeta( chr($_) ) }
  97       unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) );
  98
  99 my $re_encoded_word = qr{
 100        (?:
 101     =\?               # begin encoded word
 102     (?:[0-9A-Za-z\-_]+) # charset (encoding)
 103         (?:\*\w+(?:-\w+)*)? # language (RFC 2231)
 104     \?(?:[QqBb])\?      # delimiter
 105     (?:.*?)             # Base64-encodede contents
 106     \?=                 # end encoded word
 107        )
 108       }xo;
 109
 110 my $re_especials = qr{$re_encoded_word|$especials}xo;
 111
 112 sub encode($$;$) {
 113     my ( $obj, $str, $chk ) = @_;
 114     my @line = ();
 115     for my $line ( split /\r|\n|\r\n/o, $str ) {
 116         my ( @word, @subline );
 117         for my $word ( split /($re_especials)/o, $line ) {
 118             if (   $word =~ /[^\x00-\x7f]/o
 119                 or $word =~ /^$re_encoded_word$/o )
 120             {
 121                 push @word, $obj->_encode($word);
 122             }
 123             else {
 124                 push @word, $word;
 125             }
 126         }
 127         my $subline = '';
 128         for my $word (@word) {
 129             use bytes ();
 130             if ( bytes::length($subline) + bytes::length($word) >
 131                 $obj->{bpl} )
 132             {
 133                 push @subline, $subline;
 134                 $subline = '';
 135             }
 136             $subline .= $word;
 137         }
 138         $subline and push @subline, $subline;
 139         push @line, join( "\n " => @subline );
 140     }
 141     $_[1] = '' if $chk;
 142     return join( "\n", @line );
 143 }
 144
 145 use constant HEAD   => '=?UTF-8?';
 146 use constant TAIL   => '?=';
 147 use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, };
 148
 149 sub _encode {
 150     my ( $o, $str ) = @_;
 151     my $enc  = $o->{encode};
 152     my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) );
 153
 154     # to coerce a floating-point arithmetics, the following contains
 155     # .0 in numbers -- dankogai
 156     $llen *= $enc eq 'B' ? 3.0 / 4.0 : 1.0 / 3.0;
 157     my @result = ();
 158     my $chunk  = '';
 159     while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) {
 160         use bytes ();
 161         if ( bytes::length($chunk) + bytes::length($chr) > $llen ) {
 162             push @result, SINGLE->{$enc}($chunk);
 163             $chunk = '';
 164         }
 165         $chunk .= $chr;
 166     }
 167     length($chunk) and push @result, SINGLE->{$enc}($chunk);
 168     return @result;
 169 }
 170
 171 sub _encode_b {
 172     HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL;
 173 }
 174
 175 sub _encode_q {
 176     my $chunk = shift;
 177     $chunk = encode_utf8($chunk);
 178     $chunk =~ s{
 179         ([^0-9A-Za-z])
 180            }{
 181            join("" => map {sprintf "=%02X", $_} unpack("C*", $1))
 182            }egox;
 183     return HEAD . 'Q?' . $chunk . TAIL;
 184 }
 185
 186 1;
 187 __END__
 188
 189 =head1 NAME
 190
 191 Encode::MIME::Header -- MIME 'B' and 'Q' header encoding
 192
 193 =head1 SYNOPSIS
 194
 195     use Encode qw/encode decode/;
 196     $utf8   = decode('MIME-Header', $header);
 197     $header = encode('MIME-Header', $utf8);
 198
 199 =head1 ABSTRACT
 200
  201 This module implements RFC 2047 MIME Header Encoding.  There are 3
  202 variant encoding names: C<MIME-Header>, C<MIME-B> and C<MIME-Q>.  The
  203 differences are described below:
 204
 205               decode()          encode()
 206   ----------------------------------------------
 207   MIME-Header Both B and Q      =?UTF-8?B?....?=
 208   MIME-B      B only; Q croaks  =?UTF-8?B?....?=
 209   MIME-Q      Q only; B croaks  =?UTF-8?Q?....?=
 210
 211 =head1 DESCRIPTION
 212
 213 When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD>
 214 is extracted and decoded for I<X> encoding (B for Base64, Q for
 215 Quoted-Printable). Then the decoded chunk is fed to
 216 decode(I<encoding>).  So long as I<encoding> is supported by Encode,
 217 any source encoding is fine.
 218
  219 When you encode, it just encodes the UTF-8 string with I<X> encoding and
  220 then wraps it in =?UTF-8?I<X>?....?= .  The parts that RFC 2047 forbids
  221 encoding are left as is, and long lines are folded within 76 bytes per
  222 line.
 223
 224 =head1 BUGS
 225
  226 It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP?
  227 and =?ISO-8859-1?, but that makes the implementation too complicated.
 228 These days major mail agents all support =?UTF-8? so I think it is
 229 just good enough.
 230
 231 Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by
  232 Makamaka.  There are still too many MUAs, especially cellular phone
  233 handsets, which do not grok UTF-8.
 234
 235 =head1 SEE ALSO
 236
 237 L<Encode>
 238
 239 RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other
 240 locations.
 241
 242 =cut