[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Unicode / UTF7.pm

#
# $Id: UTF7.pm,v 2.4 2006/06/03 20:28:48 dankogai Exp dankogai $
#
package Encode::Unicode::UTF7;
use strict;
use warnings;
no warnings 'redefine';
use base qw(Encode::Encoding);
__PACKAGE__->Define('UTF-7');
our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use MIME::Base64;
use Encode;

#
# Algorithms taken from Unicode::String by Gisle Aas
#

our $OPTIONAL_DIRECT_CHARS = 1;
my $specials = quotemeta "\'(),-./:?";
$OPTIONAL_DIRECT_CHARS
  and $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";

# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
# We use qr/[\n\r\t\ ] instead
my $re_asis    = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/;
my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/;
my $e_utf16    = find_encoding("UTF-16BE");

sub needs_lines { 1 }

sub encode($$;$) {
    my ( $obj, $str, $chk ) = @_;
    my $len = length($str);
    pos($str) = 0;
    my $bytes = '';
    while ( pos($str) < $len ) {
        if ( $str =~ /\G($re_asis+)/ogc ) {
            $bytes .= $1;
        }
        elsif ( $str =~ /\G($re_encoded+)/ogsc ) {
            if ( $1 eq "+" ) {
                $bytes .= "+-";
            }
            else {
                my $s = $1;
                my $base64 = encode_base64( $e_utf16->encode($s), '' );
                $base64 =~ s/=+$//;
                $bytes .= "+$base64-";
            }
        }
        else {
            die "This should not happen! (pos=" . pos($str) . ")";
        }
    }
    $_[1] = '' if $chk;
    return $bytes;
}

sub decode($$;$) {
    my ( $obj, $bytes, $chk ) = @_;
    my $len = length($bytes);
    my $str = "";
    no warnings 'uninitialized';
    while ( pos($bytes) < $len ) {
        if ( $bytes =~ /\G([^+]+)/ogc ) {
            $str .= $1;
        }
        elsif ( $bytes =~ /\G\+-/ogc ) {
            $str .= "+";
        }
        elsif ( $bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc ) {
            my $base64 = $1;
            my $pad    = length($base64) % 4;
            $base64 .= "=" x ( 4 - $pad ) if $pad;
            $str .= $e_utf16->decode( decode_base64($base64) );
        }
        elsif ( $bytes =~ /\G\+/ogc ) {
            $^W and warn "Bad UTF7 data escape";
            $str .= "+";
        }
        else {
            die "This should not happen " . pos($bytes);
        }
    }
    $_[1] = '' if $chk;
    return $str;
}
1;
__END__

=head1 NAME

Encode::Unicode::UTF7 -- UTF-7 encoding

=head1 SYNOPSIS

    use Encode qw/encode decode/; 
    $utf7 = encode("UTF-7", $utf8);
    $utf8 = decode("UTF-7", $ucs2);

=head1 ABSTRACT

This module implements UTF-7 encoding documented in RFC 2152.  UTF-7,
as its name suggests, is a 7-bit re-encoded version of UTF-16BE.  It
is designed to be MTA-safe and expected to be a standard way to
exchange Unicoded mails via mails.  But with the advent of UTF-8 and
8-bit compliant MTAs, UTF-7 is hardly ever used.

UTF-7 was not supported by Encode until version 1.95 because of that.
But Unicode::String, a module by Gisle Aas which adds Unicode supports
to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
so Encode can supersede Unicode::String 100%.

=head1 In Practice

When you want to encode Unicode for mails and web pages, however, do
not use UTF-7 unless you are sure your recipients and readers can
handle it.  Very few MUAs and WWW Browsers support these days (only
Mozilla seems to support one).  For general cases, use UTF-8 for
message body and MIME-Header for header instead.

=head1 SEE ALSO

L<Encode>, L<Encode::Unicode>, L<Unicode::String>

RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>

=cut
Commit	Line	Data
1485817e	1	#
656ebd29	2	# $Id: UTF7.pm,v 2.4 2006/06/03 20:28:48 dankogai Exp dankogai $
1485817e	3	#
	4	package Encode::Unicode::UTF7;
	5	use strict;
656ebd29	6	use warnings;
1485817e	7	no warnings 'redefine';
	8	use base qw(Encode::Encoding);
	9	__PACKAGE__->Define('UTF-7');
656ebd29	10	our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
1485817e	11	use MIME::Base64;
	12	use Encode;
	13
	14	#
	15	# Algorithms taken from Unicode::String by Gisle Aas
	16	#
	17
	18	our $OPTIONAL_DIRECT_CHARS = 1;
d1256cb1	19	my $specials = quotemeta "\'(),-./:?";
	20	$OPTIONAL_DIRECT_CHARS
	21	and $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{\|}";
	22
1485817e	23	# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
d1256cb1	24	# We use qr/[\n\r\t\ ] instead
d1256cb1	25	my $re_asis = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/;
842a5aa6	26	my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/;
d1256cb1	27	my $e_utf16 = find_encoding("UTF-16BE");
1485817e	28
d1256cb1	29	sub needs_lines { 1 }
1485817e	30
d1256cb1	31	sub encode($$;$) {
d1256cb1	32	my ( $obj, $str, $chk ) = @_;
1485817e	33	my $len = length($str);
	34	pos($str) = 0;
	35	my $bytes = '';
d1256cb1	36	while ( pos($str) < $len ) {
	37	if ( $str =~ /\G($re_asis+)/ogc ) {
	38	$bytes .= $1;
	39	}
	40	elsif ( $str =~ /\G($re_encoded+)/ogsc ) {
	41	if ( $1 eq "+" ) {
	42	$bytes .= "+-";
	43	}
	44	else {
	45	my $s = $1;
	46	my $base64 = encode_base64( $e_utf16->encode($s), '' );
	47	$base64 =~ s/=+$//;
	48	$bytes .= "+$base64-";
	49	}
	50	}
	51	else {
	52	die "This should not happen! (pos=" . pos($str) . ")";
	53	}
1485817e	54	}
	55	$_[1] = '' if $chk;
	56	return $bytes;
	57	}
d1256cb1	58
	59	sub decode($$;$) {
	60	my ( $obj, $bytes, $chk ) = @_;
1485817e	61	my $len = length($bytes);
1485817e	62	my $str = "";
0a8c69ed	63	no warnings 'uninitialized';
d1256cb1	64	while ( pos($bytes) < $len ) {
	65	if ( $bytes =~ /\G([^+]+)/ogc ) {
	66	$str .= $1;
	67	}
	68	elsif ( $bytes =~ /\G\+-/ogc ) {
	69	$str .= "+";
	70	}
	71	elsif ( $bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc ) {
	72	my $base64 = $1;
	73	my $pad = length($base64) % 4;
	74	$base64 .= "=" x ( 4 - $pad ) if $pad;
	75	$str .= $e_utf16->decode( decode_base64($base64) );
	76	}
	77	elsif ( $bytes =~ /\G\+/ogc ) {
	78	$^W and warn "Bad UTF7 data escape";
	79	$str .= "+";
	80	}
	81	else {
	82	die "This should not happen " . pos($bytes);
	83	}
1485817e	84	}
	85	$_[1] = '' if $chk;
	86	return $str;
	87	}
	88	1;
	89	__END__
	90
	91	=head1 NAME
	92
	93	Encode::Unicode::UTF7 -- UTF-7 encoding
	94
	95	=head1 SYNOPSIS
	96
	97	use Encode qw/encode decode/;
	98	$utf7 = encode("UTF-7", $utf8);
	99	$utf8 = decode("UTF-7", $ucs2);
	100
	101	=head1 ABSTRACT
	102
	103	This module implements UTF-7 encoding documented in RFC 2152. UTF-7,
	104	as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It
	105	is designed to be MTA-safe and expected to be a standard way to
	106	exchange Unicoded mails via mails. But with the advent of UTF-8 and
	107	8-bit compliant MTAs, UTF-7 is hardly ever used.
	108
	109	UTF-7 was not supported by Encode until version 1.95 because of that.
	110	But Unicode::String, a module by Gisle Aas which adds Unicode supports
	111	to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
	112	so Encode can supersede Unicode::String 100%.
	113
	114	=head1 In Practice
	115
	116	When you want to encode Unicode for mails and web pages, however, do
	117	not use UTF-7 unless you are sure your recipients and readers can
	118	handle it. Very few MUAs and WWW Browsers support these days (only
	119	Mozilla seems to support one). For general cases, use UTF-8 for
	120	message body and MIME-Header for header instead.
	121
	122	=head1 SEE ALSO
	123
	124	L<Encode>, L<Encode::Unicode>, L<Unicode::String>
	125
	126	RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
	127
	128	=cut