[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / MIME / Header.pm

package Encode::MIME::Header;
use strict;
# use warnings;
our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode qw(find_encoding encode_utf8 decode_utf8);
use MIME::Base64;
use Carp;

# Defaults shared by every encoding object registered below.  Each
# object is a blessed copy of this hash with a few keys overridden.
my %seed =
    (
     decode_b     => '1', # decodes 'B' encoding ?
     decode_q     => '1', # decodes 'Q' encoding ?
     encode       => 'B', # encode with 'B' or 'Q' ?
     bpl          => 75,  # bytes per line
     );

# MIME-Header: decodes both 'B' and 'Q' words, encodes with 'B'.
$Encode::Encoding{'MIME-Header'} =
    bless {
        %seed,
        Name => 'MIME-Header',
    } => __PACKAGE__;

# MIME-B: decodes 'B' words only ('Q' croaks), encodes with 'B'.
$Encode::Encoding{'MIME-B'} =
    bless {
        %seed,
        decode_q  => 0,
        Name      => 'MIME-B',
    } => __PACKAGE__;

# MIME-Q: decodes 'Q' words only ('B' croaks), encodes with 'Q'.
# decode_b has to be switched off explicitly; the seed enables both
# decoders, so merely repeating decode_q => 1 left 'B' decoding active.
$Encode::Encoding{'MIME-Q'} =
    bless {
        %seed,
        decode_b    => 0,
        encode      => 'Q',
        Name        => 'MIME-Q',
    } => __PACKAGE__;

use base qw(Encode::Encoding);

# Capability flags: needs_lines always answers true (1) and
# perlio_ok always answers false (0).
sub needs_lines {
    return 1;
}

sub perlio_ok {
    return 0;
}

# decode($obj, $str [, $chk])
#
# Replaces every RFC 2047 encoded word (=?charset?X?text?=) found in
# $str with its decoded text and returns the resulting string.
#
#   $obj - encoding object; its decode_b / decode_q keys say which of
#          the 'B' and 'Q' encodings may be decoded
#   $str - header text to decode
#   $chk - when true, the caller's variable holding $str is set to ''
#
# Croaks when an encoded word uses an encoding that $obj has switched
# off.
sub decode($$;$){
    use utf8;
    my ($obj, $str, $chk) = @_;

    # zap spaces between encoded words
    $str =~ s/\?=\s+=\?/\?==\?/gs;

    # multi-line header to single line: remove each line break together
    # with the single blank or tab that follows it.  The group has to be
    # non-capturing "(?:"; the former "(:?" made it match an optional
    # literal colon, so a colon directly before a bare CR was deleted.
    $str =~ s/(?:\r\n|\r|\n)[ \t]//gs;

    # Concat consecutive QP encoded mime headers.
    # Fixes breaking inside multi-byte characters.
    1 while ($str =~ s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/);

    $str =~
        s{
            =\?                  # begin encoded word
                ([0-9A-Za-z\-_]+) # charset (encoding)
                (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231)
                \?([QqBb])\?     # delimiter
                (.*?)            # encoded contents (Base64 or Q)
                \?=              # end encoded word
            }{
                if    (uc($2) eq 'B'){
                    $obj->{decode_b} or croak qq(MIME "B" unsupported);
                    decode_b($1, $3);
                }elsif(uc($2) eq 'Q'){
                    $obj->{decode_q} or croak qq(MIME "Q" unsupported);
                    decode_q($1, $3);
                }else{
                    # not reachable while the pattern only admits Q/q/B/b
                    croak qq(MIME "$2" encoding is nonexistent!);
                }
            }egx;
    $_[1] = '' if $chk;
    return $str;
}

# decode_b($charset, $base64)
#
# Base64-decodes $base64 and converts the resulting octets from
# $charset into a character string.  Croaks when find_encoding()
# does not know $charset.
sub decode_b{
    my ($charset, $b64) = @_;

    my $codec = find_encoding($charset);
    croak qq(Unknown encoding "$charset") unless $codec;

    my $octets = decode_base64($b64);

    # An encoding whose name is 'utf8' goes through decode_utf8();
    # everything else uses the codec with the PERLQQ fallback.
    if ($codec->name eq 'utf8'){
        return Encode::decode_utf8($octets);
    }
    return $codec->decode($octets, Encode::FB_PERLQQ);
}

# decode_q($enc, $q)
#
# Undoes the 'Q' encoding of $q ("_" stands for a space, "=XX" for the
# octet with hex value XX) and converts the resulting octets from $enc
# into a character string.  Croaks when find_encoding() does not know
# $enc.
sub decode_q{
    my ($enc, $q) = @_;

    my $codec = find_encoding($enc);
    $codec or croak qq(Unknown encoding "$enc");

    $q =~ tr/_/ /;
    $q =~ s/=([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    # An encoding whose name is 'utf8' goes through decode_utf8();
    # everything else uses the codec with the PERLQQ fallback.
    if ($codec->name eq 'utf8'){
        return Encode::decode_utf8($q);
    }
    return $codec->decode($q, Encode::FB_PERLQQ);
}

# Alternation of the individually quotemeta-escaped punctuation
# characters listed in the qq{} string below.  It is one branch of
# $re_especials.
my $especials = 
    join('|' =>
	 map {quotemeta(chr($_))} 
	 unpack("C*", qq{()<>@,;:\"\'/[]?.=}));

# Matches one complete encoded word of the form
# =?charset[*language]?(Q|B)?text?= ; every group is non-capturing.
my $re_encoded_word =
    qr{
       (?:
	=\?               # begin encoded word
	(?:[0-9A-Za-z\-_]+) # charset (encoding)
        (?:\*\w+(?:-\w+)*)? # language (RFC 2231)
	\?(?:[QqBb])\?      # delimiter
	(?:.*?)             # Base64-encodede contents
	\?=                 # end encoded word
       )
      }xo;

# Matches either a whole encoded word or a single character from
# $especials.  The encoded-word branch comes first so that it is tried
# before the single-character branch at the same position.
my $re_especials = qr{$re_encoded_word|$especials}xo;

# encode($obj, $str [, $chk])
#
# Converts $str into header text.  Each input line is split into words
# at $re_especials; a word that contains a non-ASCII character, or that
# already looks like an encoded word, is passed to $obj->_encode, all
# other words are copied as they are.  The words are then packed into
# sub-lines of at most $obj->{bpl} bytes, joined with "\n " (newline
# plus one blank), and the lines are joined with "\n".
#
#   $obj - encoding object (reads $obj->{bpl}, calls $obj->_encode)
#   $str - text to encode
#   $chk - when true, the caller's variable holding $str is set to ''
#
# Returns the encoded string.
sub encode($$;$){
    my ($obj, $str, $chk) = @_;
    my @line = ();
    # CRLF has to be tried before the single-character line breaks;
    # with /\r|\n|\r\n/ a "\r\n" was split twice and produced a
    # spurious empty line in the output.
    for my $line (split /\r\n|[\r\n]/, $str){
        my (@word, @subline);
        for my $word (split /($re_especials)/o, $line){
            if ($word =~ /[^\x00-\x7f]/ or $word =~ /^$re_encoded_word$/o){
                push @word, $obj->_encode($word);
            }else{
                push @word, $word;
            }
        }
        my $subline = '';
        for my $word (@word){
            use bytes ();
            # Fold before a word that would overflow the current
            # sub-line, but never flush an empty sub-line: doing so
            # would start the output with a blank line.
            if (length($subline)
                and bytes::length($subline) + bytes::length($word) > $obj->{bpl}){
                push @subline, $subline;
                $subline = '';
            }
            $subline .= $word;
        }
        # Test the length, not the truth value: a remainder of "0" is
        # still data and must not be dropped.
        length($subline) and push @subline, $subline;
        push @line, join("\n " => @subline);
    }
    $_[1] = '' if $chk;
    return join("\n", @line);
}

# Fixed pieces of every generated encoded word, plus the dispatch
# table that maps the encoding letter to the routine producing one
# encoded word.
use constant {
    HEAD => '=?UTF-8?',
    TAIL => '?=',
};
use constant SINGLE => {
    B => \&_encode_b,
    Q => \&_encode_q,
};

# _encode($o, $str)
#
# Cuts $str into chunks small enough for one encoded word each and
# returns the list of encoded words, produced by the routine that
# SINGLE holds for $o->{encode} ('B' or 'Q').
#
#   $o   - encoding object (reads $o->{encode} and $o->{bpl})
#   $str - character string to encode
sub _encode{
    my ($o, $str) = @_;
    my $enc = $o->{encode};

    # Characters left for the payload once HEAD, the two-character
    # "B?" / "Q?" marker and TAIL are accounted for.
    my $room = $o->{bpl} - length(HEAD) - 2 - length(TAIL);

    # Largest number of UTF-8 octets whose encoded form fits into $room.
    # Base64 emits 4 characters per (possibly partial) group of 3
    # octets, so only whole groups may be counted; the former
    # $room * 3/4 allowed one octet too many and produced encoded words
    # of 76 characters.  'Q' needs up to 3 characters per octet.
    my $llen = $enc eq 'B' ? 3 * int($room / 4) : int($room / 3);

    my @result = ();
    my $chunk  = '';
    my $used   = 0;            # UTF-8 octets collected in $chunk
    for my $chr (split //, $str){
        # Measure what is actually going to be encoded; the size of
        # perl's internal representation can differ from it.
        my $need = length(encode_utf8($chr));
        if (length($chunk) and $used + $need > $llen){
            push @result, SINGLE->{$enc}($chunk);
            $chunk = '';
            $used  = 0;
        }
        $chunk .= $chr;
        $used  += $need;
    }
    # Test the length, not the truth value: a final chunk of "0" is
    # still data and must not be dropped.
    length($chunk) and push @result, SINGLE->{$enc}($chunk);
    return @result;
}

# _encode_b($chunk)
#
# Returns one 'B' encoded word: HEAD, "B?", the Base64 form (without
# line breaks) of the UTF-8 encoded $chunk, and TAIL.
sub _encode_b{
    my ($chunk) = @_;
    my $payload = encode_base64(encode_utf8($chunk), '');
    return HEAD . 'B?' . $payload . TAIL;
}

# _encode_q($chunk)
#
# Returns one 'Q' encoded word: HEAD, "Q?", the UTF-8 octets of $chunk
# with everything except ASCII letters and digits written as =XX
# (upper-case hex), and TAIL.
sub _encode_q{
    # Convert to UTF-8 octets first.  Without this step the =XX escapes
    # were built from the code points of the characters instead of from
    # their UTF-8 octets, although the word is labelled UTF-8.
    my $octets = encode_utf8(shift);
    $octets =~ s{([^0-9A-Za-z])}{sprintf '=%02X', ord $1}eg;
    # The result is plain ASCII, so no decoding step is needed here.
    return HEAD . 'Q?' . $octets . TAIL;
}

1;
__END__

=head1 NAME

Encode::MIME::Header -- MIME 'B' and 'Q' header encoding

=head1 SYNOPSIS

    use Encode qw/encode decode/; 
    $utf8   = decode('MIME-Header', $header);
    $header = encode('MIME-Header', $utf8);

=head1 ABSTRACT

This module implements RFC 2047 MIME header encoding.  There are three
variant encoding names: C<MIME-Header>, C<MIME-B> and C<MIME-Q>.  The
differences are described below:

              decode()          encode()
  ----------------------------------------------
  MIME-Header Both B and Q      =?UTF-8?B?....?=
  MIME-B      B only; Q croaks  =?UTF-8?B?....?=
  MIME-Q      Q only; B croaks  =?UTF-8?Q?....?=

=head1 DESCRIPTION

When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD>
is extracted and decoded for I<X> encoding (B for Base64, Q for
Quoted-Printable). Then the decoded chunk is fed to
decode(I<encoding>).  So long as I<encoding> is supported by Encode,
any source encoding is fine.

When you encode, the string is converted to UTF-8, encoded with I<X>
encoding and then wrapped as =?UTF-8?I<X>?....?= .  The parts that
RFC 2047 forbids encoding are left as is, and long lines are folded to
at most 76 bytes per line.

=head1 BUGS

It would be nice to support encoding to non-UTF-8 charsets, such as
=?ISO-2022-JP? and =?ISO-8859-1?, but that would make the implementation
too complicated.  These days all major mail agents support =?UTF-8?, so
I think it is good enough.

Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by
Makamaka.  There are still too many MUAs, especially cellular phone
handsets, which do not grok UTF-8.

=head1 SEE ALSO

L<Encode>

RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other
locations. 

=cut
Commit	Line	Data
af1f55d9	1	package Encode::MIME::Header;
	2	use strict;
	3	# use warnings;
41c240f5	4	our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
bedba681	5	use Encode qw(find_encoding encode_utf8 decode_utf8);
af1f55d9	6	use MIME::Base64;
	7	use Carp;
	8
	9	my %seed =
	10	(
	11	decode_b => '1', # decodes 'B' encoding ?
	12	decode_q => '1', # decodes 'Q' encoding ?
	13	encode => 'B', # encode with 'B' or 'Q' ?
	14	bpl => 75, # bytes per line
	15	);
	16
	17	$Encode::Encoding{'MIME-Header'} =
	18	bless {
	19	%seed,
	20	Name => 'MIME-Header',
	21	} => __PACKAGE__;
	22
	23	$Encode::Encoding{'MIME-B'} =
	24	bless {
	25	%seed,
	26	decode_q => 0,
	27	Name => 'MIME-B',
	28	} => __PACKAGE__;
	29
	30	$Encode::Encoding{'MIME-Q'} =
	31	bless {
	32	%seed,
	33	decode_q => 1,
	34	encode => 'Q',
	35	Name => 'MIME-Q',
	36	} => __PACKAGE__;
	37
10c5ecbb	38	use base qw(Encode::Encoding);
10c5ecbb	39
af1f55d9	40	sub needs_lines { 1 }
	41	sub perlio_ok{ 0 };
	42
	43	sub decode($$;$){
	44	use utf8;
	45	my ($obj, $str, $chk) = @_;
	46	# zap spaces between encoded words
	47	$str =~ s/\?=\s+=\?/\?==\?/gos;
	48	# multi-line header to single line
	49	$str =~ s/(:?\r\|\n\|\r\n)[ \t]//gos;
41c240f5	50
	51	1 while ($str =~ s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.?)\?\=\1(.?)\?\=/$1$2$3\?\=/); # Concat consecutive QP encoded mime headers
	52	# Fixes breaking inside multi-byte characters
	53
af1f55d9	54	$str =~
	55	s{
	56	=\? # begin encoded word
f74b3917	57	([0-9A-Za-z\-_]+) # charset (encoding)
41c240f5	58	(?:\[A-Za-z]{1,8}(?:-[A-Za-z]{1,8}))? # language (RFC 2231)
af1f55d9	59	\?([QqBb])\? # delimiter
	60	(.*?) # Base64-encodede contents
	61	\?= # end encoded word
	62	}{
	63	if (uc($2) eq 'B'){
	64	$obj->{decode_b} or croak qq(MIME "B" unsupported);
	65	decode_b($1, $3);
	66	}elsif(uc($2) eq 'Q'){
	67	$obj->{decode_q} or croak qq(MIME "Q" unsupported);
	68	decode_q($1, $3);
	69	}else{
	70	croak qq(MIME "$2" encoding is nonexistent!);
	71	}
	72	}egox;
	73	$_[1] = '' if $chk;
	74	return $str;
	75	}
	76
	77	sub decode_b{
	78	my $enc = shift;
bedba681	79	my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
af1f55d9	80	my $db64 = decode_base64(shift);
ab3374e4	81	return $d->name eq 'utf8' ?
ab3374e4	82	Encode::decode_utf8($db64) : $d->decode($db64, Encode::FB_PERLQQ);
af1f55d9	83	}
	84
	85	sub decode_q{
	86	my ($enc, $q) = @_;
bedba681	87	my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc");
af1f55d9	88	$q =~ s/_/ /go;
af1f55d9	89	$q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego;
ab3374e4	90	return $d->name eq 'utf8' ?
ab3374e4	91	Encode::decode_utf8($q) : $d->decode($q, Encode::FB_PERLQQ);
af1f55d9	92	}
	93
	94	my $especials =
	95	join('\|' =>
	96	map {quotemeta(chr($_))}
	97	unpack("C*", qq{()<>@,;:\"\'/[]?.=}));
	98
bedba681	99	my $re_encoded_word =
	100	qr{
	101	(?:
	102	=\? # begin encoded word
	103	(?:[0-9A-Za-z\-_]+) # charset (encoding)
41c240f5	104	(?:\\w+(?:-\w+))? # language (RFC 2231)
bedba681	105	\?(?:[QqBb])\? # delimiter
	106	(?:.*?) # Base64-encodede contents
	107	\?= # end encoded word
	108	)
	109	}xo;
	110
	111	my $re_especials = qr{$re_encoded_word\|$especials}xo;
af1f55d9	112
	113	sub encode($$;$){
	114	my ($obj, $str, $chk) = @_;
	115	my @line = ();
	116	for my $line (split /\r\|\n\|\r\n/o, $str){
	117	my (@word, @subline);
	118	for my $word (split /($re_especials)/o, $line){
bedba681	119	if ($word =~ /[^\x00-\x7f]/o or $word =~ /^$re_encoded_word$/o){
af1f55d9	120	push @word, $obj->_encode($word);
	121	}else{
	122	push @word, $word;
	123	}
	124	}
	125	my $subline = '';
	126	for my $word (@word){
	127	use bytes ();
	128	if (bytes::length($subline) + bytes::length($word) > $obj->{bpl}){
	129	push @subline, $subline;
	130	$subline = '';
	131	}
	132	$subline .= $word;
	133	}
	134	$subline and push @subline, $subline;
	135	push @line, join("\n " => @subline);
	136	}
	137	$_[1] = '' if $chk;
	138	return join("\n", @line);
	139	}
	140
	141	use constant HEAD => '=?UTF-8?';
	142	use constant TAIL => '?=';
	143	use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, };
	144
	145	sub _encode{
	146	my ($o, $str) = @_;
	147	my $enc = $o->{encode};
	148	my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL));
11067275	149	# to coerce a floating-point arithmetics, the following contains
	150	# .0 in numbers -- dankogai
	151	$llen *= $enc eq 'B' ? 3.0/4.0 : 1.0/3.0;
af1f55d9	152	my @result = ();
af1f55d9	153	my $chunk = '';
b786ee6f	154	while(length(my $chr = substr($str, 0, 1, ''))){
af1f55d9	155	use bytes ();
	156	if (bytes::length($chunk) + bytes::length($chr) > $llen){
	157	push @result, SINGLE->{$enc}($chunk);
	158	$chunk = '';
	159	}
	160	$chunk .= $chr;
	161	}
	162	$chunk and push @result, SINGLE->{$enc}($chunk);
	163	return @result;
	164	}
	165
	166	sub _encode_b{
	167	HEAD . 'B?' . encode_base64(encode_utf8(shift), '') . TAIL;
	168	}
	169
	170	sub _encode_q{
	171	my $chunk = shift;
	172	$chunk =~ s{
	173	([^0-9A-Za-z])
	174	}{
	175	join("" => map {sprintf "=%02X", $_} unpack("C*", $1))
	176	}egox;
bedba681	177	return decode_utf8(HEAD . 'Q?' . $chunk . TAIL);
af1f55d9	178	}
	179
	180	1;
	181	__END__
	182
	183	=head1 NAME
	184
	185	Encode::MIME::Header -- MIME 'B' and 'Q' header encoding
	186
	187	=head1 SYNOPSIS
	188
	189	use Encode qw/encode decode/;
	190	$utf8 = decode('MIME-Header', $header);
	191	$header = encode('MIME-Header', $utf8);
	192
	193	=head1 ABSTRACT
	194
	195	This module implements RFC 2047 Mime Header Encoding. There are 3
	196	variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The
	197	difference is described below
	198
	199	decode() encode()
	200	----------------------------------------------
	201	MIME-Header Both B and Q =?UTF-8?B?....?=
	202	MIME-B B only; Q croaks =?UTF-8?B?....?=
	203	MIME-Q Q only; B croaks =?UTF-8?Q?....?=
	204
	205	=head1 DESCRIPTION
	206
	207	When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD>
	208	is extracted and decoded for I<X> encoding (B for Base64, Q for
	209	Quoted-Printable). Then the decoded chunk is fed to
	210	decode(I<encoding>). So long as I<encoding> is supported by Encode,
	211	any source encoding is fine.
	212
	213	When you encode, it just encodes UTF-8 string with I<X> encoding then
	214	quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to
	215	encode are left as is and long lines are folded within 76 bytes per
	216	line.
	217
	218	=head1 BUGS
	219
7e19fb92	220	It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP?
af1f55d9	221	and =?ISO-8859-1?= but that makes the implementation too complicated.
	222	These days major mail agents all support =?UTF-8? so I think it is
	223	just good enough.
	224
56ff7374	225	Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by
	226	Makamaka. Thre are still too many MUAs especially cellular phone
	227	handsets which does not grok UTF-8.
	228
af1f55d9	229	=head1 SEE ALSO
	230
	231	L<Encode>
	232
	233	RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other
	234	locations.
	235
	236	=cut