[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Unicode / UTF7.pm

#
# $Id: UTF7.pm,v 0.1 2003/05/16 18:06:24 dankogai Exp dankogai $
#
package Encode::Unicode::UTF7;
use strict;
no warnings 'redefine';
use base qw(Encode::Encoding);
__PACKAGE__->Define('UTF-7');
our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use MIME::Base64;
use Encode;

#
# Algorithms taken from Unicode::String by Gisle Aas
#

our $OPTIONAL_DIRECT_CHARS = 1;
my $specials =   quotemeta "\'(),-.:?";
$OPTIONAL_DIRECT_CHARS and
    $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";
# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
# We use \x00-\x20 instead (controls + space)
my $re_asis =     qr/(?:[\x00-\x20A-Za-z0-9$specials])/;
my $re_encoded = qr/(?:[^\x00-\x20A-Za-z0-9$specials])/;
my $e_utf16 = find_encoding("UTF-16BE");

sub needs_lines { 1 };

sub encode($$;$){
    my ($obj, $str, $chk) = @_;
    my $len = length($str);
    pos($str) = 0;
    my $bytes = '';
    while (pos($str) < $len){
	if    ($str =~ /\G($re_asis+)/ogc){
	    $bytes .= $1;
	}elsif($str =~ /\G($re_encoded+)/ogsc){
	    if ($1 eq "+"){
		$bytes .= "+-";
	    }else{
		my $base64 = encode_base64($e_utf16->encode($1), '');
		$base64 =~ s/=+$//;
		$bytes .= "+$base64-";
	    }
	}else{
	    die "This should not happen! (pos=" . pos($str) . ")";
	}
    }
    $_[1] = '' if $chk;
    return $bytes;
}
	   
sub decode{
    my ($obj, $bytes, $chk) = @_;
    my $len = length($bytes);
    my $str = "";
    while (pos($bytes) < $len) {
	if    ($bytes =~ /\G([^+]+)/ogc) {
	    $str .= $1;
	}elsif($bytes =~ /\G\+-/ogc) {
	    $str .= "+";
	}elsif($bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc) {
	    my $base64 = $1;
	    my $pad = length($base64) % 4;
	    $base64 .= "=" x (4 - $pad) if $pad;
	    $str .= $e_utf16->decode(decode_base64($base64));
	}elsif($bytes =~ /\G\+/ogc) {
	    $^W and warn "Bad UTF7 data escape";
	    $str .= "+";
	}else{
	    die "This should not happen " . pos($bytes);
	}
    }
    $_[1] = '' if $chk;
    return $str;
}
1;
__END__

=head1 NAME

Encode::Unicode::UTF7 -- UTF-7 encoding

=head1 SYNOPSIS

    use Encode qw/encode decode/; 
    $utf7 = encode("UTF-7", $utf8);
    $utf8 = decode("UTF-7", $ucs2);

=head1 ABSTRACT

This module implements UTF-7 encoding documented in RFC 2152.  UTF-7,
as its name suggests, is a 7-bit re-encoded version of UTF-16BE.  It
is designed to be MTA-safe and expected to be a standard way to
exchange Unicoded mails via mails.  But with the advent of UTF-8 and
8-bit compliant MTAs, UTF-7 is hardly ever used.

UTF-7 was not supported by Encode until version 1.95 because of that.
But Unicode::String, a module by Gisle Aas which adds Unicode supports
to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
so Encode can supersede Unicode::String 100%.

=head1 In Practice

When you want to encode Unicode for mails and web pages, however, do
not use UTF-7 unless you are sure your recipients and readers can
handle it.  Very few MUAs and WWW Browsers support these days (only
Mozilla seems to support one).  For general cases, use UTF-8 for
message body and MIME-Header for header instead.

=head1 SEE ALSO

L<Encode>, L<Encode::Unicode>, L<Unicode::String>

RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>

=cut
Commit	Line	Data
1485817e	1	#
	2	# $Id: UTF7.pm,v 0.1 2003/05/16 18:06:24 dankogai Exp dankogai $
	3	#
	4	package Encode::Unicode::UTF7;
	5	use strict;
	6	no warnings 'redefine';
	7	use base qw(Encode::Encoding);
	8	__PACKAGE__->Define('UTF-7');
	9	our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
	10	use MIME::Base64;
	11	use Encode;
	12
	13	#
	14	# Algorithms taken from Unicode::String by Gisle Aas
	15	#
	16
	17	our $OPTIONAL_DIRECT_CHARS = 1;
	18	my $specials = quotemeta "\'(),-.:?";
	19	$OPTIONAL_DIRECT_CHARS and
	20	$specials .= quotemeta "!\"#$%&*;<=>@[]^_`{\|}";
	21	# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
	22	# We use \x00-\x20 instead (controls + space)
	23	my $re_asis = qr/(?:[\x00-\x20A-Za-z0-9$specials])/;
	24	my $re_encoded = qr/(?:[^\x00-\x20A-Za-z0-9$specials])/;
	25	my $e_utf16 = find_encoding("UTF-16BE");
	26
	27	sub needs_lines { 1 };
	28
	29	sub encode($$;$){
	30	my ($obj, $str, $chk) = @_;
	31	my $len = length($str);
	32	pos($str) = 0;
	33	my $bytes = '';
	34	while (pos($str) < $len){
	35	if ($str =~ /\G($re_asis+)/ogc){
	36	$bytes .= $1;
	37	}elsif($str =~ /\G($re_encoded+)/ogsc){
	38	if ($1 eq "+"){
	39	$bytes .= "+-";
	40	}else{
	41	my $base64 = encode_base64($e_utf16->encode($1), '');
	42	$base64 =~ s/=+$//;
	43	$bytes .= "+$base64-";
	44	}
	45	}else{
	46	die "This should not happen! (pos=" . pos($str) . ")";
	47	}
	48	}
	49	$_[1] = '' if $chk;
	50	return $bytes;
	51	}
	52
	53	sub decode{
	54	my ($obj, $bytes, $chk) = @_;
	55	my $len = length($bytes);
	56	my $str = "";
	57	while (pos($bytes) < $len) {
	58	if ($bytes =~ /\G([^+]+)/ogc) {
	59	$str .= $1;
	60	}elsif($bytes =~ /\G\+-/ogc) {
	61	$str .= "+";
	62	}elsif($bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc) {
	63	my $base64 = $1;
	64	my $pad = length($base64) % 4;
65	$base64 .= "=" x (4 - $pad) if $pad;
66	$str .= $e_utf16->decode(decode_base64($base64));
67	}elsif($bytes =~ /\G\+/ogc) {
68	$^W and warn "Bad UTF7 data escape";
69	$str .= "+";
70	}else{
71	die "This should not happen " . pos($bytes);
72	}
73	}
74	$_[1] = '' if $chk;
75	return $str;
76	}
77	1;
78	__END__
79
80	=head1 NAME
81
82	Encode::Unicode::UTF7 -- UTF-7 encoding
83
84	=head1 SYNOPSIS
85
86	use Encode qw/encode decode/;
87	$utf7 = encode("UTF-7", $utf8);
88	$utf8 = decode("UTF-7", $ucs2);
89
90	=head1 ABSTRACT
91
92	This module implements UTF-7 encoding documented in RFC 2152. UTF-7,
93	as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It
94	is designed to be MTA-safe and expected to be a standard way to
95	exchange Unicoded mails via mails. But with the advent of UTF-8 and
96	8-bit compliant MTAs, UTF-7 is hardly ever used.
97
98	UTF-7 was not supported by Encode until version 1.95 because of that.
99	But Unicode::String, a module by Gisle Aas which adds Unicode supports
100	to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
101	so Encode can supersede Unicode::String 100%.
102
103	=head1 In Practice
104
105	When you want to encode Unicode for mails and web pages, however, do
106	not use UTF-7 unless you are sure your recipients and readers can
107	handle it. Very few MUAs and WWW Browsers support these days (only
108	Mozilla seems to support one). For general cases, use UTF-8 for
109	message body and MIME-Header for header instead.
110
111	=head1 SEE ALSO
112
113	L<Encode>, L<Encode::Unicode>, L<Unicode::String>
114
115	RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
116
117	=cut