[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / MIME / Header.pm

package Encode::MIME::Header;
use strict;
use warnings;
no warnings 'redefine';

our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use Encode qw(find_encoding encode_utf8 decode_utf8);
use MIME::Base64;
use Carp;

my %seed = (
    decode_b => '1',    # decodes 'B' encoding ?
    decode_q => '1',    # decodes 'Q' encoding ?
    encode   => 'B',    # encode with 'B' or 'Q' ?
    bpl      => 75,     # bytes per line
);

$Encode::Encoding{'MIME-Header'} =
  bless { %seed, Name => 'MIME-Header', } => __PACKAGE__;

$Encode::Encoding{'MIME-B'} = bless {
    %seed,
    decode_q => 0,
    Name     => 'MIME-B',
} => __PACKAGE__;

$Encode::Encoding{'MIME-Q'} = bless {
    %seed,
    decode_q => 1,
    encode   => 'Q',
    Name     => 'MIME-Q',
} => __PACKAGE__;

use base qw(Encode::Encoding);

sub needs_lines { 1 }
sub perlio_ok   { 0 }

sub decode($$;$) {
    use utf8;
    my ( $obj, $str, $chk ) = @_;

    # zap spaces between encoded words
    $str =~ s/\?=\s+=\?/\?==\?/gos;

    # multi-line header to single line
    $str =~ s/(:?\r|\n|\r\n)[ \t]//gos;

    1 while ( $str =~
        s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/ )
      ;    # Concat consecutive QP encoded mime headers
           # Fixes breaking inside multi-byte characters

    $str =~ s{
        =\?                  # begin encoded word
        ([0-9A-Za-z\-_]+) # charset (encoding)
                (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231)
        \?([QqBb])\?     # delimiter
        (.*?)            # Base64-encodede contents
        \?=              # end encoded word      
        }{
        if    (uc($2) eq 'B'){
            $obj->{decode_b} or croak qq(MIME "B" unsupported);
            decode_b($1, $3);
        }elsif(uc($2) eq 'Q'){
            $obj->{decode_q} or croak qq(MIME "Q" unsupported);
            decode_q($1, $3);
        }else{
            croak qq(MIME "$2" encoding is nonexistent!);
        }
        }egox;
    $_[1] = '' if $chk;
    return $str;
}

sub decode_b {
    my $enc  = shift;
    my $d    = find_encoding($enc) or croak qq(Unknown encoding "$enc");
    my $db64 = decode_base64(shift);
    return $d->name eq 'utf8'
      ? Encode::decode_utf8($db64)
      : $d->decode( $db64, Encode::FB_PERLQQ );
}

sub decode_q {
    my ( $enc, $q ) = @_;
    my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
    $q =~ s/_/ /go;
    $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego;
    return $d->name eq 'utf8'
      ? Encode::decode_utf8($q)
      : $d->decode( $q, Encode::FB_PERLQQ );
}

my $especials =
  join( '|' => map { quotemeta( chr($_) ) }
      unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) );

my $re_encoded_word = qr{
       (?:
    =\?               # begin encoded word
    (?:[0-9A-Za-z\-_]+) # charset (encoding)
        (?:\*\w+(?:-\w+)*)? # language (RFC 2231)
    \?(?:[QqBb])\?      # delimiter
    (?:.*?)             # Base64-encodede contents
    \?=                 # end encoded word
       )
      }xo;

my $re_especials = qr{$re_encoded_word|$especials}xo;

sub encode($$;$) {
    my ( $obj, $str, $chk ) = @_;
    my @line = ();
    for my $line ( split /\r|\n|\r\n/o, $str ) {
        my ( @word, @subline );
        for my $word ( split /($re_especials)/o, $line ) {
            if (   $word =~ /[^\x00-\x7f]/o
                or $word =~ /^$re_encoded_word$/o )
            {
                push @word, $obj->_encode($word);
            }
            else {
                push @word, $word;
            }
        }
        my $subline = '';
        for my $word (@word) {
            use bytes ();
            if ( bytes::length($subline) + bytes::length($word) >
                $obj->{bpl} )
            {
                push @subline, $subline;
                $subline = '';
            }
            $subline .= $word;
        }
        $subline and push @subline, $subline;
        push @line, join( "\n " => @subline );
    }
    $_[1] = '' if $chk;
    return join( "\n", @line );
}

use constant HEAD   => '=?UTF-8?';
use constant TAIL   => '?=';
use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, };

sub _encode {
    my ( $o, $str ) = @_;
    my $enc  = $o->{encode};
    my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) );

    # to coerce a floating-point arithmetics, the following contains
    # .0 in numbers -- dankogai
    $llen *= $enc eq 'B' ? 3.0 / 4.0 : 1.0 / 3.0;
    my @result = ();
    my $chunk  = '';
    while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) {
        use bytes ();
        if ( bytes::length($chunk) + bytes::length($chr) > $llen ) {
            push @result, SINGLE->{$enc}($chunk);
            $chunk = '';
        }
        $chunk .= $chr;
    }
    $chunk and push @result, SINGLE->{$enc}($chunk);
    return @result;
}

sub _encode_b {
    HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL;
}

sub _encode_q {
    my $chunk = shift;
    $chunk =~ s{
        ([^0-9A-Za-z])
           }{
           join("" => map {sprintf "=%02X", $_} unpack("C*", $1))
           }egox;
    return decode_utf8( HEAD . 'Q?' . $chunk . TAIL );
}

1;
__END__

=head1 NAME

Encode::MIME::Header -- MIME 'B' and 'Q' header encoding

=head1 SYNOPSIS

    use Encode qw/encode decode/; 
    $utf8   = decode('MIME-Header', $header);
    $header = encode('MIME-Header', $utf8);

=head1 ABSTRACT

This module implements RFC 2047 Mime Header Encoding.  There are 3
variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>.  The
difference is described below

              decode()          encode()
  ----------------------------------------------
  MIME-Header Both B and Q      =?UTF-8?B?....?=
  MIME-B      B only; Q croaks  =?UTF-8?B?....?=
  MIME-Q      Q only; B croaks  =?UTF-8?Q?....?=

=head1 DESCRIPTION

When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD>
is extracted and decoded for I<X> encoding (B for Base64, Q for
Quoted-Printable). Then the decoded chunk is fed to
decode(I<encoding>).  So long as I<encoding> is supported by Encode,
any source encoding is fine.

When you encode, it just encodes UTF-8 string with I<X> encoding then
quoted with =?UTF-8?I<X>?....?= .  The parts that RFC 2047 forbids to
encode are left as is and long lines are folded within 76 bytes per
line.

=head1 BUGS

It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP?
and =?ISO-8859-1?= but that makes the implementation too complicated.
These days major mail agents all support =?UTF-8? so I think it is
just good enough.

Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by
Makamaka.  Thre are still too many MUAs especially cellular phone
handsets which does not grok UTF-8.

=head1 SEE ALSO

L<Encode>

RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other
locations. 

=cut
Commit	Line	Data
af1f55d9	1	package Encode::MIME::Header;
af1f55d9	2	use strict;
656ebd29	3	use warnings;
656ebd29	4	no warnings 'redefine';
d1256cb1	5
656ebd29	6	our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
bedba681	7	use Encode qw(find_encoding encode_utf8 decode_utf8);
af1f55d9	8	use MIME::Base64;
	9	use Carp;
	10
d1256cb1	11	my %seed = (
	12	decode_b => '1', # decodes 'B' encoding ?
	13	decode_q => '1', # decodes 'Q' encoding ?
	14	encode => 'B', # encode with 'B' or 'Q' ?
	15	bpl => 75, # bytes per line
	16	);
af1f55d9	17
af1f55d9	18	$Encode::Encoding{'MIME-Header'} =
d1256cb1	19	bless { %seed, Name => 'MIME-Header', } => __PACKAGE__;
	20
	21	$Encode::Encoding{'MIME-B'} = bless {
	22	%seed,
	23	decode_q => 0,
	24	Name => 'MIME-B',
	25	} => __PACKAGE__;
	26
	27	$Encode::Encoding{'MIME-Q'} = bless {
	28	%seed,
	29	decode_q => 1,
	30	encode => 'Q',
	31	Name => 'MIME-Q',
	32	} => __PACKAGE__;
af1f55d9	33
10c5ecbb	34	use base qw(Encode::Encoding);
10c5ecbb	35
af1f55d9	36	sub needs_lines { 1 }
d1256cb1	37	sub perlio_ok { 0 }
af1f55d9	38
d1256cb1	39	sub decode($$;$) {
af1f55d9	40	use utf8;
d1256cb1	41	my ( $obj, $str, $chk ) = @_;
d1256cb1	42
af1f55d9	43	# zap spaces between encoded words
af1f55d9	44	$str =~ s/\?=\s+=\?/\?==\?/gos;
d1256cb1	45
af1f55d9	46	# multi-line header to single line
af1f55d9	47	$str =~ s/(:?\r\|\n\|\r\n)[ \t]//gos;
41c240f5	48
d1256cb1	49	1 while ( $str =~
	50	s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.?)\?\=\1(.?)\?\=/$1$2$3\?\=/ )
	51	; # Concat consecutive QP encoded mime headers
	52	# Fixes breaking inside multi-byte characters
41c240f5	53
d1256cb1	54	$str =~ s{
	55	=\? # begin encoded word
	56	([0-9A-Za-z\-_]+) # charset (encoding)
41c240f5	57	(?:\[A-Za-z]{1,8}(?:-[A-Za-z]{1,8}))? # language (RFC 2231)
d1256cb1	58	\?([QqBb])\? # delimiter
	59	(.*?) # Base64-encodede contents
	60	\?= # end encoded word
	61	}{
	62	if (uc($2) eq 'B'){
	63	$obj->{decode_b} or croak qq(MIME "B" unsupported);
	64	decode_b($1, $3);
	65	}elsif(uc($2) eq 'Q'){
	66	$obj->{decode_q} or croak qq(MIME "Q" unsupported);
	67	decode_q($1, $3);
	68	}else{
	69	croak qq(MIME "$2" encoding is nonexistent!);
	70	}
	71	}egox;
af1f55d9	72	$_[1] = '' if $chk;
	73	return $str;
	74	}
	75
d1256cb1	76	sub decode_b {
	77	my $enc = shift;
	78	my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
af1f55d9	79	my $db64 = decode_base64(shift);
d1256cb1	80	return $d->name eq 'utf8'
	81	? Encode::decode_utf8($db64)
	82	: $d->decode( $db64, Encode::FB_PERLQQ );
af1f55d9	83	}
af1f55d9	84
d1256cb1	85	sub decode_q {
d1256cb1	86	my ( $enc, $q ) = @_;
bedba681	87	my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
af1f55d9	88	$q =~ s/_/ /go;
af1f55d9	89	$q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego;
d1256cb1	90	return $d->name eq 'utf8'
	91	? Encode::decode_utf8($q)
	92	: $d->decode( $q, Encode::FB_PERLQQ );
af1f55d9	93	}
af1f55d9	94
d1256cb1	95	my $especials =
	96	join( '\|' => map { quotemeta( chr($_) ) }
	97	unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) );
af1f55d9	98
d1256cb1	99	my $re_encoded_word = qr{
bedba681	100	(?:
d1256cb1	101	=\? # begin encoded word
d1256cb1	102	(?:[0-9A-Za-z\-_]+) # charset (encoding)
41c240f5	103	(?:\\w+(?:-\w+))? # language (RFC 2231)
d1256cb1	104	\?(?:[QqBb])\? # delimiter
	105	(?:.*?) # Base64-encodede contents
	106	\?= # end encoded word
bedba681	107	)
	108	}xo;
	109
	110	my $re_especials = qr{$re_encoded_word\|$especials}xo;
af1f55d9	111
d1256cb1	112	sub encode($$;$) {
d1256cb1	113	my ( $obj, $str, $chk ) = @_;
af1f55d9	114	my @line = ();
d1256cb1	115	for my $line ( split /\r\|\n\|\r\n/o, $str ) {
	116	my ( @word, @subline );
	117	for my $word ( split /($re_especials)/o, $line ) {
	118	if ( $word =~ /[^\x00-\x7f]/o
	119	or $word =~ /^$re_encoded_word$/o )
	120	{
	121	push @word, $obj->_encode($word);
	122	}
	123	else {
	124	push @word, $word;
	125	}
	126	}
	127	my $subline = '';
	128	for my $word (@word) {
	129	use bytes ();
	130	if ( bytes::length($subline) + bytes::length($word) >
	131	$obj->{bpl} )
	132	{
	133	push @subline, $subline;
	134	$subline = '';
	135	}
	136	$subline .= $word;
	137	}
	138	$subline and push @subline, $subline;
	139	push @line, join( "\n " => @subline );
af1f55d9	140	}
af1f55d9	141	$_[1] = '' if $chk;
d1256cb1	142	return join( "\n", @line );
af1f55d9	143	}
af1f55d9	144
d1256cb1	145	use constant HEAD => '=?UTF-8?';
d1256cb1	146	use constant TAIL => '?=';
af1f55d9	147	use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, };
af1f55d9	148
d1256cb1	149	sub _encode {
	150	my ( $o, $str ) = @_;
	151	my $enc = $o->{encode};
	152	my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) );
	153
11067275	154	# to coerce a floating-point arithmetics, the following contains
11067275	155	# .0 in numbers -- dankogai
d1256cb1	156	$llen *= $enc eq 'B' ? 3.0 / 4.0 : 1.0 / 3.0;
af1f55d9	157	my @result = ();
d1256cb1	158	my $chunk = '';
	159	while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) {
	160	use bytes ();
	161	if ( bytes::length($chunk) + bytes::length($chr) > $llen ) {
	162	push @result, SINGLE->{$enc}($chunk);
	163	$chunk = '';
	164	}
	165	$chunk .= $chr;
af1f55d9	166	}
	167	$chunk and push @result, SINGLE->{$enc}($chunk);
	168	return @result;
	169	}
	170
d1256cb1	171	sub _encode_b {
d1256cb1	172	HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL;
af1f55d9	173	}
af1f55d9	174
d1256cb1	175	sub _encode_q {
af1f55d9	176	my $chunk = shift;
af1f55d9	177	$chunk =~ s{
d1256cb1	178	([^0-9A-Za-z])
	179	}{
	180	join("" => map {sprintf "=%02X", $_} unpack("C*", $1))
	181	}egox;
	182	return decode_utf8( HEAD . 'Q?' . $chunk . TAIL );
af1f55d9	183	}
	184
	185	1;
	186	__END__
	187
	188	=head1 NAME
	189
	190	Encode::MIME::Header -- MIME 'B' and 'Q' header encoding
	191
	192	=head1 SYNOPSIS
	193
	194	use Encode qw/encode decode/;
	195	$utf8 = decode('MIME-Header', $header);
	196	$header = encode('MIME-Header', $utf8);
	197
	198	=head1 ABSTRACT
	199
	200	This module implements RFC 2047 Mime Header Encoding. There are 3
	201	variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The
	202	difference is described below
	203
	204	decode() encode()
	205	----------------------------------------------
	206	MIME-Header Both B and Q =?UTF-8?B?....?=
	207	MIME-B B only; Q croaks =?UTF-8?B?....?=
	208	MIME-Q Q only; B croaks =?UTF-8?Q?....?=
	209
	210	=head1 DESCRIPTION
	211
	212	When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD>
	213	is extracted and decoded for I<X> encoding (B for Base64, Q for
	214	Quoted-Printable). Then the decoded chunk is fed to
	215	decode(I<encoding>). So long as I<encoding> is supported by Encode,
	216	any source encoding is fine.
	217
	218	When you encode, it just encodes UTF-8 string with I<X> encoding then
	219	quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to
	220	encode are left as is and long lines are folded within 76 bytes per
	221	line.
	222
	223	=head1 BUGS
	224
7e19fb92	225	It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP?
af1f55d9	226	and =?ISO-8859-1?= but that makes the implementation too complicated.
	227	These days major mail agents all support =?UTF-8? so I think it is
	228	just good enough.
	229
56ff7374	230	Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by
	231	Makamaka. Thre are still too many MUAs especially cellular phone
	232	handsets which does not grok UTF-8.
	233
af1f55d9	234	=head1 SEE ALSO
	235
	236	L<Encode>
	237
	238	RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other
	239	locations.
	240
	241	=cut