ext/Encode/lib/Encode/MIME/Header.pm Encode extension
ext/Encode/lib/Encode/PerlIO.pod Documents for Encode & PerlIO
ext/Encode/lib/Encode/Supported.pod Documents for supported encodings
+ext/Encode/lib/Encode/Unicode/UTF7.pm Encode extension
ext/Encode/Makefile.PL Encode extension makefile writer
ext/Encode/MANIFEST Encode extension
ext/Encode/META.yml Module meta-data in YAML
# $Id: Changes,v 1.94 2003/05/10 18:13:59 dankogai Exp $
#
$Revision: 1.94 $ $Date: 2003/05/10 18:13:59 $
++ lib/Encode/Unicode/UTF7.pm
+! lib/Encode/Config.pm lib/Encode/Alias.pm Unicode/Unicode.pm t/Unicode.t
+ lib/Encode/Supported.pod
+ UTF-7 support is now added. With this Encode now has all transcoding
+ methods in Unicode::String.
+
+1.94 2003/05/10 18:13:59
! lib/Encode/MIME/Header.pm
A more sophisticated solution for double-encoding by dankogai
! lib/Encode/MIME/Header.pm AUTHORS
lib/Encode/MIME/Header.pm Encode extension
lib/Encode/PerlIO.pod Documents for Encode & PerlIO
lib/Encode/Supported.pod Documents for supported encodings
+lib/Encode/Unicode/UTF7.pm Encode extension
t/Aliases.t test script
t/CJKT.t test script
t/Encode.t test script
=item L<http://www.unicode.org/glossary/> says:
I<Character Encoding Scheme> A character encoding form plus byte
-serialization. There are seven character encoding schemes in Unicode:
+serialization. There are Seven character encoding schemes in Unicode:
UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and
-UTF-32LE (UCS-4LE).
+UTF-32LE (UCS-4LE), and UTF-7.
+
+Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, It is not part of
+Unicode's Character Encoding Scheme. It is separately implemented in
+Encode::Unicode::UTF7. For details see L<Encode::Unicode::UTF7>.
=item Quick Reference
=head1 SEE ALSO
-L<Encode>, L<http://www.unicode.org/glossary/>,
+L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>,
L<http://www.unicode.org/unicode/faq/utf_bom.html>,
RFC 2781 L<http://rfc.net/rfc2781.html>,
package Encode::Alias;
use strict;
+no warnings 'redefine';
use Encode;
our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
define_alias( qr/^(.*)$/ => '"\L$1"' );
# UTF/UCS stuff
+ define_alias( qr/^UTF-?7$/i => '"UTF-7"');
define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
'UTF-32' => 'Encode::Unicode',
'UTF-32BE' => 'Encode::Unicode',
'UTF-32LE' => 'Encode::Unicode',
+ 'UTF-7' => 'Encode::Unicode::UTF7',
);
unless (ord("A") == 193){
UTF-32 [UC]
UTF-32BE UCS-4 [UC]
UTF-32LE [UC]
+ UTF-7 [RFC2152]
----------------------------------------------------------------
To find how (UCS-2|UTF-(16|32))(LE|BE)? differ from one another,
see L<Encode::Unicode>.
+UTF-7 is a special encoding which "re-encodes" UTF-16BE into a 7-bit
+encoding. It is implemeneted seperately by Encode::Unicode::UTF7.
+
=head2 Encode::Byte -- Extended ASCII
Encode::Byte implements most single-byte encodings except for
--- /dev/null
+#
+# $Id: UTF7.pm,v 0.1 2003/05/16 18:06:24 dankogai Exp dankogai $
+#
+package Encode::Unicode::UTF7;
+use strict;
+no warnings 'redefine';
+use base qw(Encode::Encoding);
+__PACKAGE__->Define('UTF-7');
+our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+use MIME::Base64;
+use Encode;
+
+#
+# Algorithms taken from Unicode::String by Gisle Aas
+#
+
+our $OPTIONAL_DIRECT_CHARS = 1;
+my $specials = quotemeta "\'(),-.:?";
+$OPTIONAL_DIRECT_CHARS and
+ $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";
+# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
+# We use \x00-\x20 instead (controls + space)
+my $re_asis = qr/(?:[\x00-\x20A-Za-z0-9$specials])/;
+my $re_encoded = qr/(?:[^\x00-\x20A-Za-z0-9$specials])/;
+my $e_utf16 = find_encoding("UTF-16BE");
+
+sub needs_lines { 1 };
+
+sub encode($$;$){
+ my ($obj, $str, $chk) = @_;
+ my $len = length($str);
+ pos($str) = 0;
+ my $bytes = '';
+ while (pos($str) < $len){
+ if ($str =~ /\G($re_asis+)/ogc){
+ $bytes .= $1;
+ }elsif($str =~ /\G($re_encoded+)/ogsc){
+ if ($1 eq "+"){
+ $bytes .= "+-";
+ }else{
+ my $base64 = encode_base64($e_utf16->encode($1), '');
+ $base64 =~ s/=+$//;
+ $bytes .= "+$base64-";
+ }
+ }else{
+ die "This should not happen! (pos=" . pos($str) . ")";
+ }
+ }
+ $_[1] = '' if $chk;
+ return $bytes;
+}
+
+sub decode{
+ my ($obj, $bytes, $chk) = @_;
+ my $len = length($bytes);
+ my $str = "";
+ while (pos($bytes) < $len) {
+ if ($bytes =~ /\G([^+]+)/ogc) {
+ $str .= $1;
+ }elsif($bytes =~ /\G\+-/ogc) {
+ $str .= "+";
+ }elsif($bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc) {
+ my $base64 = $1;
+ my $pad = length($base64) % 4;
+ $base64 .= "=" x (4 - $pad) if $pad;
+ $str .= $e_utf16->decode(decode_base64($base64));
+ }elsif($bytes =~ /\G\+/ogc) {
+ $^W and warn "Bad UTF7 data escape";
+ $str .= "+";
+ }else{
+ die "This should not happen " . pos($bytes);
+ }
+ }
+ $_[1] = '' if $chk;
+ return $str;
+}
+1;
+__END__
+
+=head1 NAME
+
+Encode::Unicode::UTF7 -- UTF-7 encoding
+
+=head1 SYNOPSIS
+
+ use Encode qw/encode decode/;
+ $utf7 = encode("UTF-7", $utf8);
+ $utf8 = decode("UTF-7", $ucs2);
+
+=head1 ABSTRACT
+
+This module implements UTF-7 encoding documented in RFC 2152. UTF-7,
+as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It
+is designed to be MTA-safe and expected to be a standard way to
+exchange Unicoded mails via mails. But with the advent of UTF-8 and
+8-bit compliant MTAs, UTF-7 is hardly ever used.
+
+UTF-7 was not supported by Encode until version 1.95 because of that.
+But Unicode::String, a module by Gisle Aas which adds Unicode supports
+to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
+so Encode can supersede Unicode::String 100%.
+
+=head1 In Practice
+
+When you want to encode Unicode for mails and web pages, however, do
+not use UTF-7 unless you are sure your recipients and readers can
+handle it. Very few MUAs and WWW Browsers support these days (only
+Mozilla seems to support one). For general cases, use UTF-8 for
+message body and MIME-Header for header instead.
+
+=head1 SEE ALSO
+
+L<Encode>, L<Encode::Unicode>, L<Unicode::String>
+
+RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
+
+=cut
use strict;
#use Test::More 'no_plan';
-use Test::More tests => 30;
+use Test::More tests => 37;
use Encode qw(encode decode);
#
}
};
+#
+# CJKT vs. UTF-7
+#
+use File::Spec;
+use File::Basename;
+
+my $dir = dirname(__FILE__);
+opendir my $dh, $dir or die "$dir:$!";
+my @file = sort grep {/\.utf$/o} readdir $dh;
+closedir $dh;
+for my $file (@file){
+ my $path = File::Spec->catfile($dir, $file);
+ open my $fh, '<:utf8', $path or die "$path:$!";
+ my $content = join('' => <$fh>);
+ close $fh;
+ is(decode("UTF-7", encode("UTF-7", $content)), $content,
+ "UTF-7 RT:$file");
+}
1;
__END__