Commit | Line | Data |
af1f55d9 |
1 | package Encode::MIME::Header; |
2 | use strict; |
656ebd29 |
3 | use warnings; |
4 | no warnings 'redefine'; |
d1256cb1 |
5 | |
a37eaad4 |
6 | our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
bedba681 |
7 | use Encode qw(find_encoding encode_utf8 decode_utf8); |
af1f55d9 |
8 | use MIME::Base64; |
9 | use Carp; |
10 | |
d1256cb1 |
11 | my %seed = ( |
12 | decode_b => '1', # decodes 'B' encoding ? |
13 | decode_q => '1', # decodes 'Q' encoding ? |
14 | encode => 'B', # encode with 'B' or 'Q' ? |
15 | bpl => 75, # bytes per line |
16 | ); |
af1f55d9 |
17 | |
18 | $Encode::Encoding{'MIME-Header'} = |
d1256cb1 |
19 | bless { %seed, Name => 'MIME-Header', } => __PACKAGE__; |
20 | |
21 | $Encode::Encoding{'MIME-B'} = bless { |
22 | %seed, |
23 | decode_q => 0, |
24 | Name => 'MIME-B', |
25 | } => __PACKAGE__; |
26 | |
27 | $Encode::Encoding{'MIME-Q'} = bless { |
28 | %seed, |
29 | decode_q => 1, |
30 | encode => 'Q', |
31 | Name => 'MIME-Q', |
32 | } => __PACKAGE__; |
af1f55d9 |
33 | |
10c5ecbb |
34 | use base qw(Encode::Encoding); |
35 | |
af1f55d9 |
36 | sub needs_lines { 1 } |
d1256cb1 |
37 | sub perlio_ok { 0 } |
af1f55d9 |
38 | |
d1256cb1 |
39 | sub decode($$;$) { |
af1f55d9 |
40 | use utf8; |
d1256cb1 |
41 | my ( $obj, $str, $chk ) = @_; |
42 | |
af1f55d9 |
43 | # zap spaces between encoded words |
44 | $str =~ s/\?=\s+=\?/\?==\?/gos; |
d1256cb1 |
45 | |
af1f55d9 |
46 | # multi-line header to single line |
a37eaad4 |
47 | $str =~ s/(?:\r|\n|\r\n)[ \t]+//gos; |
41c240f5 |
48 | |
d1256cb1 |
49 | 1 while ( $str =~ |
50 | s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/ ) |
51 | ; # Concat consecutive QP encoded mime headers |
52 | # Fixes breaking inside multi-byte characters |
41c240f5 |
53 | |
d1256cb1 |
54 | $str =~ s{ |
55 | =\? # begin encoded word |
56 | ([0-9A-Za-z\-_]+) # charset (encoding) |
41c240f5 |
57 | (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) |
d1256cb1 |
58 | \?([QqBb])\? # delimiter |
59 | (.*?) # Base64-encodede contents |
60 | \?= # end encoded word |
61 | }{ |
62 | if (uc($2) eq 'B'){ |
63 | $obj->{decode_b} or croak qq(MIME "B" unsupported); |
64 | decode_b($1, $3); |
65 | }elsif(uc($2) eq 'Q'){ |
66 | $obj->{decode_q} or croak qq(MIME "Q" unsupported); |
67 | decode_q($1, $3); |
68 | }else{ |
69 | croak qq(MIME "$2" encoding is nonexistent!); |
70 | } |
71 | }egox; |
af1f55d9 |
72 | $_[1] = '' if $chk; |
73 | return $str; |
74 | } |
75 | |
d1256cb1 |
76 | sub decode_b { |
77 | my $enc = shift; |
78 | my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); |
af1f55d9 |
79 | my $db64 = decode_base64(shift); |
d1256cb1 |
80 | return $d->name eq 'utf8' |
81 | ? Encode::decode_utf8($db64) |
82 | : $d->decode( $db64, Encode::FB_PERLQQ ); |
af1f55d9 |
83 | } |
84 | |
d1256cb1 |
85 | sub decode_q { |
86 | my ( $enc, $q ) = @_; |
bedba681 |
87 | my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); |
af1f55d9 |
88 | $q =~ s/_/ /go; |
89 | $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; |
d1256cb1 |
90 | return $d->name eq 'utf8' |
91 | ? Encode::decode_utf8($q) |
92 | : $d->decode( $q, Encode::FB_PERLQQ ); |
af1f55d9 |
93 | } |
94 | |
d1256cb1 |
95 | my $especials = |
96 | join( '|' => map { quotemeta( chr($_) ) } |
97 | unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) ); |
af1f55d9 |
98 | |
d1256cb1 |
99 | my $re_encoded_word = qr{ |
bedba681 |
100 | (?: |
d1256cb1 |
101 | =\? # begin encoded word |
102 | (?:[0-9A-Za-z\-_]+) # charset (encoding) |
41c240f5 |
103 | (?:\*\w+(?:-\w+)*)? # language (RFC 2231) |
d1256cb1 |
104 | \?(?:[QqBb])\? # delimiter |
105 | (?:.*?) # Base64-encodede contents |
106 | \?= # end encoded word |
bedba681 |
107 | ) |
108 | }xo; |
109 | |
110 | my $re_especials = qr{$re_encoded_word|$especials}xo; |
af1f55d9 |
111 | |
d1256cb1 |
112 | sub encode($$;$) { |
113 | my ( $obj, $str, $chk ) = @_; |
af1f55d9 |
114 | my @line = (); |
d1256cb1 |
115 | for my $line ( split /\r|\n|\r\n/o, $str ) { |
116 | my ( @word, @subline ); |
117 | for my $word ( split /($re_especials)/o, $line ) { |
118 | if ( $word =~ /[^\x00-\x7f]/o |
119 | or $word =~ /^$re_encoded_word$/o ) |
120 | { |
121 | push @word, $obj->_encode($word); |
122 | } |
123 | else { |
124 | push @word, $word; |
125 | } |
126 | } |
127 | my $subline = ''; |
128 | for my $word (@word) { |
129 | use bytes (); |
130 | if ( bytes::length($subline) + bytes::length($word) > |
131 | $obj->{bpl} ) |
132 | { |
133 | push @subline, $subline; |
134 | $subline = ''; |
135 | } |
136 | $subline .= $word; |
137 | } |
138 | $subline and push @subline, $subline; |
139 | push @line, join( "\n " => @subline ); |
af1f55d9 |
140 | } |
141 | $_[1] = '' if $chk; |
d1256cb1 |
142 | return join( "\n", @line ); |
af1f55d9 |
143 | } |
144 | |
d1256cb1 |
145 | use constant HEAD => '=?UTF-8?'; |
146 | use constant TAIL => '?='; |
af1f55d9 |
147 | use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, }; |
148 | |
d1256cb1 |
149 | sub _encode { |
150 | my ( $o, $str ) = @_; |
151 | my $enc = $o->{encode}; |
152 | my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) ); |
153 | |
11067275 |
154 | # to coerce a floating-point arithmetics, the following contains |
155 | # .0 in numbers -- dankogai |
d1256cb1 |
156 | $llen *= $enc eq 'B' ? 3.0 / 4.0 : 1.0 / 3.0; |
af1f55d9 |
157 | my @result = (); |
d1256cb1 |
158 | my $chunk = ''; |
159 | while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) { |
160 | use bytes (); |
161 | if ( bytes::length($chunk) + bytes::length($chr) > $llen ) { |
162 | push @result, SINGLE->{$enc}($chunk); |
163 | $chunk = ''; |
164 | } |
165 | $chunk .= $chr; |
af1f55d9 |
166 | } |
64bc6d54 |
167 | length($chunk) and push @result, SINGLE->{$enc}($chunk); |
af1f55d9 |
168 | return @result; |
169 | } |
170 | |
d1256cb1 |
171 | sub _encode_b { |
172 | HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL; |
af1f55d9 |
173 | } |
174 | |
d1256cb1 |
175 | sub _encode_q { |
af1f55d9 |
176 | my $chunk = shift; |
51e4e64d |
177 | $chunk = encode_utf8($chunk); |
af1f55d9 |
178 | $chunk =~ s{ |
d1256cb1 |
179 | ([^0-9A-Za-z]) |
180 | }{ |
181 | join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) |
182 | }egox; |
51e4e64d |
183 | return HEAD . 'Q?' . $chunk . TAIL; |
af1f55d9 |
184 | } |
185 | |
186 | 1; |
187 | __END__ |
188 | |
189 | =head1 NAME |
190 | |
191 | Encode::MIME::Header -- MIME 'B' and 'Q' header encoding |
192 | |
193 | =head1 SYNOPSIS |
194 | |
195 | use Encode qw/encode decode/; |
196 | $utf8 = decode('MIME-Header', $header); |
197 | $header = encode('MIME-Header', $utf8); |
198 | |
199 | =head1 ABSTRACT |
200 | |
201 | This module implements RFC 2047 Mime Header Encoding. There are 3 |
202 | variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The |
203 | difference is described below |
204 | |
205 | decode() encode() |
206 | ---------------------------------------------- |
207 | MIME-Header Both B and Q =?UTF-8?B?....?= |
208 | MIME-B B only; Q croaks =?UTF-8?B?....?= |
209 | MIME-Q Q only; B croaks =?UTF-8?Q?....?= |
210 | |
211 | =head1 DESCRIPTION |
212 | |
213 | When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD> |
214 | is extracted and decoded for I<X> encoding (B for Base64, Q for |
215 | Quoted-Printable). Then the decoded chunk is fed to |
216 | decode(I<encoding>). So long as I<encoding> is supported by Encode, |
217 | any source encoding is fine. |
218 | |
219 | When you encode, it just encodes UTF-8 string with I<X> encoding then |
220 | quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to |
221 | encode are left as is and long lines are folded within 76 bytes per |
222 | line. |
223 | |
224 | =head1 BUGS |
225 | |
7e19fb92 |
226 | It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? |
af1f55d9 |
227 | and =?ISO-8859-1?= but that makes the implementation too complicated. |
228 | These days major mail agents all support =?UTF-8? so I think it is |
229 | just good enough. |
230 | |
56ff7374 |
231 | Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by |
232 | Makamaka. Thre are still too many MUAs especially cellular phone |
233 | handsets which does not grok UTF-8. |
234 | |
af1f55d9 |
235 | =head1 SEE ALSO |
236 | |
237 | L<Encode> |
238 | |
239 | RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other |
240 | locations. |
241 | |
242 | =cut |