From: Jarkko Hietaniemi Date: Fri, 26 Apr 2002 02:33:19 +0000 (+0000) Subject: Upgrade to Encode 1.61, from Dan Kogai. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=7e19fb92789b07f9ae6ba1ee1b4f5fbb72612161;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 1.61, from Dan Kogai. p4raw-id: //depot/perl@16177 --- diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS index 2ba72f8..8610012 100644 --- a/ext/Encode/AUTHORS +++ b/ext/Encode/AUTHORS @@ -27,6 +27,7 @@ Nicholas Clark Nick Ing-Simmons Paul Marquess Philip Newton +Robin Barker SADAHIRO Tomoyuki Spider Boardman Tatsuhiko Miyagawa diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 314358e..ad4fabb 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,28 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.60 2002/04/24 20:06:52 dankogai Exp $ +# $Id: Changes,v 1.61 2002/04/26 03:02:04 dankogai Exp $ # -$Revision: 1.60 $ $Date: 2002/04/24 20:06:52 $ +$Revision: 1.61 $ $Date: 2002/04/26 03:02:04 $ +! t/mime-header.t + Now does decent tests besides use_ok() +! lib/Encode/Guess.pm t/guess.t + UI streamlined, document added +! Unicode/Unicode.xs + various signed/unsigned mismatch nits (#16173) + http://public.activestate.com/cgi-bin/perlbrowse?patch=16173 +! Encode.pm + POD: utf8-flag-related caveats added. A few sections completely + rewritten. +! Encode.xs +! AUTHORS + Thou shalt not assume %d works, either! + Robin Baker added to AUTHORS for this + Message-Id: <200204251132.MAA28237@tempest.npl.co.uk> +! t/CJKT.t + "Change 16144 by gsar@onru on 2002/04/24 18:59:05" + +1.60 2002/04/24 20:06:52 ! Encode.xs "Thou shalt not assume %x works." -- jhi Message-Id: <20020424210618.E24347@alpha.hut.fi> @@ -492,7 +511,7 @@ $Revision: 1.60 $ $Date: 2002/04/24 20:06:52 $ Typo fixes and improvements by jhi Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al. -1.11 $Date: 2002/04/24 20:06:52 $ +1.11 $Date: 2002/04/26 03:02:04 $ + t/encoding.t + t/jperl.t ! MANIFEST diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 65ef50b..b502e8f 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,12 +1,12 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.60 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.61 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; use XSLoader (); XSLoader::load 'Encode'; require Exporter; -our @ISA = qw(Exporter); +use base qw/Exporter/; # Public, encouraged API is exported by default @@ -331,7 +331,7 @@ byte has 256 possible values, it easily fits in Perl's much larger =head2 TERMINOLOGY -=over 4 +=over 2 =item * @@ -356,7 +356,7 @@ and such details may change in future releases. =head1 PERL ENCODING API -=over 4 +=over 2 =item $octets = encode(ENCODING, $string[, CHECK]) @@ -368,7 +368,13 @@ For CHECK, see L. For example, to convert (internally UTF-8 encoded) Unicode string to iso-8859-1 (also known as Latin1), - $octets = encode("iso-8859-1", $unicode); + $octets = encode("iso-8859-1", $utf8); + +B: When you C<$octets = encode("utf8", $utf8)>, then $octets +B $utf8. Though they both contain the same data, the utf8 flag +for $octets is B off. When you encode anything, utf8 flag of +the result is always off, even when it contains completely valid utf8 +string. See L below. =item $string = decode(ENCODING, $octets[, CHECK]) @@ -382,16 +388,22 @@ For example, to convert ISO-8859-1 data to UTF-8: $utf8 = decode("iso-8859-1", $latin1); -=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK]) +B: When you C<$utf8 = encode("utf8", $octets)>, then $utf8 +B $utf8. Though they both contain the same data, +the utf8 flag for $utf8 is on unless $octets entirely conststs of +ASCII data (or EBCDIC on EBCDIC machines). See L +below. -Converts B data between two encodings. -For example, to convert ISO-8859-1 data to UTF-8: +=item [$length =] from_to($string, FROM_ENC, TO_ENC [, CHECK]) + +Converts B data between two encodings. For example, to +convert ISO-8859-1 data to UTF-8: - from_to($data, "iso-8859-1", "utf-8"); + from_to($data, "iso-8859-1", "utf8"); and to convert it back: - from_to($data, "utf-8", "iso-8859-1"); + from_to($data, "utf8", "iso-8859-1"); Note that because the conversion happens in place, the data to be converted cannot be a string constant; it must be a scalar variable. @@ -399,32 +411,34 @@ converted cannot be a string constant; it must be a scalar variable. from_to() returns the length of the converted string on success, undef otherwise. -=back +B: The following operations look the same but not quite so; + + from_to($data, "iso-8859-1", "utf8"); #1 + $data = decode("iso-8859-1", $data); #2 -=head2 UTF-8 / utf8 +Both #1 and #2 makes $data consists of completely valid UTF-8 string +but only #2 turns utf8 flag on. #1 is equivalent to -The Unicode Consortium defines the UTF-8 transformation format as a -way of encoding the entire Unicode repertoire as sequences of octets. -This encoding is expected to become very widespread. Perl can use this -form internally to represent strings, so conversions to and from this -form are particularly efficient (as octets in memory do not have to -change, just the meta-data that tells Perl how to treat them). + $data = encode("utf8", decode("iso-8859-1", $data)); -=over 4 +See L below. =item $octets = encode_utf8($string); -The characters that comprise $string are encoded in Perl's superset of -UTF-8 and the resulting octets are returned as a sequence of bytes. All -possible characters have a UTF-8 representation so this function cannot -fail. +Equivalent to C<$octets = encode("utf8", $string);> The characters +that comprise $string are encoded in Perl's superset of UTF-8 and the +resulting octets are returned as a sequence of bytes. All possible +characters have a UTF-8 representation so this function cannot fail. + =item $string = decode_utf8($octets [, CHECK]); -The sequence of octets represented by $octets is decoded from UTF-8 -into a sequence of logical characters. Not all sequences of octets -form valid UTF-8 encodings, so it is possible for this call to fail. -For CHECK, see L. +equivalent to C<$string = decode("utf8", $octets [, CHECK])>. +decode_utf8($octets [, CHECK]); The sequence of octets represented by +$octets is decoded from UTF-8 into a sequence of logical +characters. Not all sequences of octets form valid UTF-8 encodings, so +it is possible for this call to fail. For CHECK, see +L. =back @@ -510,7 +524,7 @@ For gory details, see L. =head1 Handling Malformed Data -=over 4 +=over 2 The I argument is used as follows. When you omit it, the behaviour is the same as if you had passed a value of 0 for @@ -524,7 +538,7 @@ EsubcharE will be used. For Unicode, "\x{FFFD}" is used. If the data is supposed to be UTF-8, an optional lexical warning (category utf8) is given. -=item I = Encode::DIE_ON_ERROR (== 1) +=item I = Encode::FB_CROAK ( == 1) If I is 1, methods will die immediately with an error message. Therefore, when I is set to 1, you should trap the @@ -608,12 +622,84 @@ arguments are taken as aliases for I<$object>, as for C. See L for more details. -=head1 Messing with Perl's Internals +=head1 The UTF-8 flag + +Before the introduction of utf8 support in perl, The C operator +just compares internal data of the scalars. Now C means internal +data equality AND I. To explain why we made it so, I +will quote page 402 of C + +=over 2 + +=item Goal #1: + +Old byte-oriented programs should not spontaneously break on the old +byte-oriented data they used to work on. + +=item Goal #2: + +Old byte-oriented programs should magically start working on the new +character-oriented data when appropriate. + +=item Goal #3: + +Programs should run just as fast in the new character-oriented mode +as in the old byte-oriented mode. + +=item Goal #4: + +Perl should remain one language, rather than forking into a +byte-oriented Perl and a character-oriented Perl. + +=back + +Back when C was written, not even Perl 5.6.0 +was born and many features documented in the book remained +unimplemented. Perl 5.8 hopefully correct this and the introduction +of UTF-8 flag is one of them. You can think this perl notion of +byte-oriented mode (utf8 flag off) and character-oriented mode (utf8 +flag on). + +Here is how Encode takes care of the utf8 flag. + +=over2 + +=item * + +When you encode, the resulting utf8 flag is always off. + +=item + +When you decode, the resuting utf8 flag is on unless you can +unambiguously represent data. Here is the definition of +dis-ambiguity. + + After C<$utf8 = decode('foo', $octet);>, + + When $octet is... The utf8 flag in $utf8 is + --------------------------------------------- + In ASCII only (or EBCDIC only) OFF + In ISO-8859-1 ON + In any other Encoding ON + --------------------------------------------- + +As you see, there is one exception, In ASCII. That way you can assue +Goal #1. And with Encode Goal #2 is assumed but you still have to be +careful in such cases mentioned in B paragraphs. + +This utf8 flag is not visible in perl scripts, exactly for the same +reason you cannot (or you I) see if a scalar contains a +string, integer, or floating point number. But you can still peek +and poke these if you will. See the section below. + +=back + +=head2 Messing with Perl's Internals The following API uses parts of Perl's internals in the current implementation. As such, they are efficient but may change. -=over 4 +=over 2 =item is_utf8(STRING [, CHECK]) @@ -653,8 +739,8 @@ the Perl Unicode Mailing List Eperl-unicode@perl.orgE =head1 MAINTAINER This project was originated by Nick Ing-Simmons and later maintained -by Dan Kogai Edankogai@dan.co.jpE. See AUTHORS for a full list -of people involved. For any questions, use -Eperl-unicode@perl.orgE so others can share. +by Dan Kogai Edankogai@dan.co.jpE. See AUTHORS for a full +list of people involved. For any questions, use +Eperl-unicode@perl.orgE so we can all share share. =cut diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index a7e7c6a..566d066 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.38 2002/04/24 20:11:14 dankogai Exp dankogai $ + $Id: Encode.xs,v 1.39 2002/04/26 03:02:04 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -147,7 +147,7 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, sv_catsv(dst, perlqq); }else if (check & ENCODE_HTMLCREF){ SV* htmlcref = - sv_2mortal(newSVpvf("&#%d;", ch)); + sv_2mortal(newSVpvf("&#%" UVuf ";", ch)); sdone += slen + clen; ddone += dlen + SvCUR(htmlcref); sv_catsv(dst, htmlcref); diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs index 3df25a7..e3ad82c 100644 --- a/ext/Encode/Unicode/Unicode.xs +++ b/ext/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 1.3 2002/04/20 23:43:47 dankogai Exp $ + $Id: Unicode.xs,v 1.4 2002/04/26 03:02:04 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm index e027e38..d2aac44 100644 --- a/ext/Encode/lib/Encode/Guess.pm +++ b/ext/Encode/lib/Encode/Guess.pm @@ -1,51 +1,94 @@ package Encode::Guess; use strict; use Carp; + use Encode qw(:fallbacks find_encoding); -our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; my $Canon = 'Guess'; -$Encode::Encoding{$Canon} = bless { Name => $Canon } => __PACKAGE__; our $DEBUG = 0; -our %DEF_CANDIDATES = - map { $_ => find_encoding($_) } qw(ascii utf8); -our %CANDIDATES; +our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); +$Encode::Encoding{$Canon} = + bless { + Name => $Canon, + Suspects => { %DEF_SUSPECTS }, + } => __PACKAGE__; + +sub name { shift->{'Name'} } +sub new_sequence { $_[0] } +sub needs_lines { 1 } +sub perlio_ok { 0 } +sub DESTROY{} + +our @EXPORT = qw(guess_encoding); + +sub import { # Exporter not used so we do it on our own + my $callpkg = caller; + for my $item (@EXPORT){ + no strict 'refs'; + *{"$callpkg\::$item"} = \&{"$item"}; + } + set_suspects(@_); +} +sub set_suspects{ + my $class = shift; + my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; + $self->{Suspects} = { %DEF_SUSPECTS }; + $self->add_suspects(@_); +} -sub import{ +sub add_suspects{ my $class = shift; - %CANDIDATES = %DEF_CANDIDATES; + my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; for my $c (@_){ my $e = find_encoding($c) or die "Unknown encoding: $c"; - $CANDIDATES{$e->name} = $e; + $self->{Suspects}{$e->name} = $e; $DEBUG and warn "Added: ", $e->name; } } -sub name { shift->{'Name'} } -sub new_sequence { $_[0] } -sub needs_lines { 1 } -sub perlio_ok { 0 } - sub decode($$;$){ my ($obj, $octet, $chk) = @_; - my $utf8 = $obj->guess($octet)->decode($octet, $chk); + my $guessed = guess($obj, $octet); + ref($guessed) or croak $guessed; + my $utf8 = $guessed->decode($octet, $chk); $_[1] = $octet if $chk; return $utf8; } sub encode{ - croak "Tsk, tsk, tsk. You can't be too lazy here here!"; + croak "Tsk, tsk, tsk. You can't be too lazy here!"; +} + +sub guess_encoding{ + guess($Encode::Encoding{$Canon}, @_); } sub guess { - my ($obj, $octet) = @_; - # cheat 1: utf8 flag; + my $class = shift; + my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; + my $octet = shift; + # cheat 0: utf8 flag; Encode::is_utf8($octet) and return find_encoding('utf8'); - my %try = %CANDIDATES; + # cheat 1: BOM + use Encode::Unicode; + my $BOM = unpack('n', $octet); + return find_encoding('UTF-16') + if ($BOM == 0xFeFF or $BOM == 0xFFFe); + $BOM = unpack('N', $octet); + return find_encoding('UTF-32') + if ($BOM == 0xFeFF or $BOM == 0xFFFe0000); + + my %try = %{$obj->{Suspects}}; + for my $c (@_){ + my $e = find_encoding($c) or die "Unknown encoding: $c"; + $try{$e->name} = $e; + $DEBUG and warn "Added: ", $e->name; + } my $nline = 1; for my $line (split /\r|\n|\r\n/, $octet){ - # cheat 2 -- escape + # cheat 2 -- \e in the string if ($line =~ /\e/o){ my @keys = keys %try; delete @try{qw/utf8 ascii/}; @@ -69,27 +112,186 @@ sub guess { } } - %ok or croak "No appropriate encodings found!"; + %ok or return "No appropriate encodings found!"; if (scalar(keys(%ok)) == 1){ my ($retval) = values(%ok); return $retval; } %try = %ok; $nline++; } - unless ($try{ascii}){ - croak "Encodings too ambiguous: ", - join(" or ", keys %try); - } + $try{ascii} or + return "Encodings too ambiguous: ", join(" or ", keys %try); return $try{ascii}; } + 1; __END__ =head1 NAME -Encode::Guess -- guesscoding! +Encode::Guess -- Guesses encoding from data + +=head1 SYNOPSIS + + # if you are sure $data won't contain anything bogus + + use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; + my $utf8 = decode("Guess", $data); + my $data = encode("Guess", $utf8); # this doesn't work! + + # more elaborate way + use Encode::Guess, + my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/); + ref($enc) or die "Can't guess: $enc"; # trap error this way + $utf8 = $enc->decode($data); + # or + $utf8 = decode($enc->name, $data) + +=head1 ABSTRACT + +Encode::Guess enables you to guess in what encoding a given data is +encoded, or at least tries to. + +=head1 DESCRIPTION + +By default, it checks only ascii, utf8 and UTF-16/32 with BOM. + + use Encode::Guess; # ascii/utf8/BOMed UTF + +To use it more practically, you have to give the names of encodings to +check (I as follows). The name of suspects can either be +canonical names or aliases. + + # tries all major Japanese Encodings as well + use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; + +=over 4 + +=item Encode::Guess->set_suspects + +You can also change the internal suspects list via C +method. + + use Encode::Guess; + Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/); + +=item Encode::Guess->add_suspects + +Or you can use C method. The difference is that +C flushes the current suspects list while +C adds. + + use Encode::Guess; + Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/); + # now the suspects are euc-jp,shiftjis,7bit-jis, AND + # euc-kr,euc-cn, and big5-eten + Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/); + +=item Encode::decode("Guess" ...) + +When you are content with suspects list, you can now + + my $utf8 = Encode::decode("Guess", $data); + +=item Encode::Guess->guess($data) + +But it will croak if Encode::Guess fails to eliminate all other +suspects but the right one or no suspect was good. So you should +instead try this; + + my $decoder = Encode::Guess->guess($data); + +On success, $decoder is an object that is documented in +L. So you can now do this; + + my $utf8 = $decoder->decode($data); + +On failure, $decoder now contains an error message so the whole thing +would be as follows; + + my $decoder = Encode::Guess->guess($data); + die $decoder unless ref($decoder); + my $utf8 = $decoder->decode($data); + +=item guess_encoding($data, [, I]) + +You can also try C function which is exported by +default. It takes $data to check and it also takes the list of +suspects by option. The optional suspect list is I to +the internal suspects list. + + my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/); + die $decoder unless ref($decoder); + my $utf8 = $decoder->decode($data); + # check only ascii and utf8 + my $decoder = guess_encoding($data); + +=back + +=head1 CAVEATS + +=over 4 + +=item * + +Because of the algorithm used, ISO-8859 series and other single-byte +encodings do not work well unless either one of ISO-8859 is the only +one suspect (besides ascii and utf8). + + use Encode::Guess; + # perhaps ok + my $decoder = guess_encoding($data, 'latin1'); + # definitely NOT ok + my $decoder = guess_encoding($data, qw/latin1 greek/); + +The reason is that Encode::Guess guesses encoding by trial and error. +It first splits $data into lines and tries to decode the line for each +suspect. It keeps it going until all but one encoding was eliminated +out of suspects list. ISO-8859 series is just too successful for most +cases (because it fills almost all code points in \x00-\xff). + +=item * + +Do not mix national standard encodings and the corresponding vendor +encodings. + + # a very bad idea + my $decoder + = guess_encoding($data, qw/shiftjis MacJapanese cp932/); + +The reason is that vendor encoding is usually a superset of national +standard so it becomes too ambiguous for most cases. + +=item * + +On the other hand, mixing various national standard encodings +automagically works unless $data is too short to allow for guessing. + + # This is ok if $data is long enough + my $decoder = + guess_encoding($data, qw/euc-cn + euc-jp shiftjis 7bit-jis + euc-kr + big5-eten/); + +=item * + +DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this! + + my $decoder = guess_encoding($data, + Encode->encodings(":all")); + +=back + +It is, after all, just a guess. You should alway be explicit when it +comes to encodings. But there are some, especially Japanese, +environment that guess-coding is a must. Use this module with care. + +=head1 SEE ALSO + +L, L =cut diff --git a/ext/Encode/lib/Encode/MIME/Header.pm b/ext/Encode/lib/Encode/MIME/Header.pm index 51f0923..ce7b872 100644 --- a/ext/Encode/lib/Encode/MIME/Header.pm +++ b/ext/Encode/lib/Encode/MIME/Header.pm @@ -1,7 +1,7 @@ package Encode::MIME::Header; use strict; # use warnings; -our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode qw(find_encoding encode_utf8); use MIME::Base64; @@ -197,7 +197,7 @@ line. =head1 BUGS -It would be nice to support non-UTF8 encoding, such as =?ISO-2022-JP? +It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? and =?ISO-8859-1?= but that makes the implementation too complicated. These days major mail agents all support =?UTF-8? so I think it is just good enough. diff --git a/ext/Encode/t/CJKT.t b/ext/Encode/t/CJKT.t index f66bb32..31c0aa1 100644 --- a/ext/Encode/t/CJKT.t +++ b/ext/Encode/t/CJKT.t @@ -55,6 +55,8 @@ for my $charset (sort keys %Charset){ open $src, "<$src_enc" or die "$src_enc : $!"; + # binmode($src); # not needed! + $txt = join('',<$src>); close($src); diff --git a/ext/Encode/t/guess.t b/ext/Encode/t/guess.t index 7b5d3ca..ace13dd 100644 --- a/ext/Encode/t/guess.t +++ b/ext/Encode/t/guess.t @@ -17,7 +17,7 @@ use File::Spec; use Encode qw(decode encode find_encoding _utf8_off); #use Test::More qw(no_plan); -use Test::More tests => 11; +use Test::More tests => 17; use_ok("Encode::Guess"); { no warnings; @@ -28,19 +28,16 @@ my $ascii = join('' => map {chr($_)}(0x21..0x7e)); my $latin1 = join('' => map {chr($_)}(0xa1..0xfe)); my $utf8on = join('' => map {chr($_)}(0x3000..0x30fe)); my $utf8off = $utf8on; _utf8_off($utf8off); +my $utf16 = encode('UTF-16', $utf8on); +my $utf32 = encode('UTF-32', $utf8on); -is(Encode::Guess->guess($ascii)->name, 'ascii'); - -eval { Encode::Guess->guess($latin1) } ; -like($@, qr/No appropriate encoding/io); - -Encode::Guess->import(qw(latin1)); - -is(Encode::Guess->guess($latin1)->name, 'iso-8859-1'); -is(Encode::Guess->guess($utf8on)->name, 'utf8'); - -eval { Encode::Guess->guess($utf8off) }; -like($@, qr/ambiguous/io); +is(guess_encoding($ascii)->name, 'ascii', 'ascii'); +like(guess_encoding($latin1), qr/No appropriate encoding/io, 'no ascii'); +is(guess_encoding($latin1, 'latin1')->name, 'iso-8859-1', 'iso-8859-1'); +is(guess_encoding($utf8on)->name, 'utf8', 'utf8 w/ flag'); +is(guess_encoding($utf8off)->name, 'utf8', 'utf8 w/o flag'); +is(guess_encoding($utf16)->name, 'UTF-16', 'UTF-16'); +is(guess_encoding($utf32)->name, 'UTF-32', 'UTF-32'); my $jisx0201 = File::Spec->catfile(dirname(__FILE__), 'jisx0201.utf'); my $jisx0208 = File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf'); @@ -50,15 +47,37 @@ open my $fh, $jisx0208 or die "$jisx0208: $!"; $utf8off = join('' => <$fh>); close $fh; $utf8on = decode('utf8', $utf8off); + my @jp = qw(7bit-jis shiftjis euc-jp); -Encode::Guess->import(@jp); +Encode::Guess->set_suspects(@jp); for my $jp (@jp){ my $test = encode($jp, $utf8on); - is(Encode::Guess->guess($test)->name, $jp, $jp); + is(guess_encoding($test)->name, $jp, "JP:$jp"); } + is (decode('Guess', encode('euc-jp', $utf8on)), $utf8on, "decode('Guess')"); eval{ encode('Guess', $utf8on) }; like($@, qr/lazy/io, "no encode()"); + +my %CJKT = + ( + 'euc-cn' => File::Spec->catfile(dirname(__FILE__), 'gb2312.utf'), + 'euc-jp' => File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf'), + 'euc-kr' => File::Spec->catfile(dirname(__FILE__), 'ksc5601.utf'), + 'big5-eten' => File::Spec->catfile(dirname(__FILE__), 'big5-eten.utf'), +); + +Encode::Guess->set_suspects(keys %CJKT); + +for my $name (keys %CJKT){ + open my $fh, $CJKT{$name} or die "$CJKT{$name}: $!"; + $utf8off = join('' => <$fh>); + close $fh; + + my $test = encode($name, decode('utf8', $utf8off)); + is(guess_encoding($test)->name, $name, "CJKT:$name"); +} + __END__; diff --git a/ext/Encode/t/jperl.t b/ext/Encode/t/jperl.t index 4c2ef4d..82f7a84 100644 --- a/ext/Encode/t/jperl.t +++ b/ext/Encode/t/jperl.t @@ -1,5 +1,5 @@ # -# $Id: jperl.t,v 1.23 2002/04/22 09:48:07 dankogai Exp $ +# $Id: jperl.t,v 1.24 2002/04/26 03:02:04 dankogai Exp $ # # This script is written in euc-jp diff --git a/ext/Encode/t/mime-header.t b/ext/Encode/t/mime-header.t index 8255952..826efbf 100644 --- a/ext/Encode/t/mime-header.t +++ b/ext/Encode/t/mime-header.t @@ -1,3 +1,7 @@ +# +# $Id: mime-header.t,v 1.3 2002/04/26 03:07:59 dankogai Exp $ +# This script is written in utf8 +# BEGIN { if ($ENV{'PERL_CORE'}){ chdir 't'; @@ -12,8 +16,62 @@ BEGIN { } use strict; -use Test::More tests => 1; +#use Test::More qw(no_plan); +use Test::More tests => 6; use_ok("Encode::MIME::Header"); +my $eheader =<<'EOS'; +From: =?US-ASCII?Q?Keith_Moore?= +To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= +CC: =?ISO-8859-1?Q?Andr=E9?= Pirard +Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?= + =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?= +EOS +my $dheader=<<"EOS"; +From: Keith Moore +To: Keld J\xF8rn Simonsen +CC: Andr\xE9 Pirard +Subject: If you can read this you understand the example. +EOS + +is(Encode::decode('MIME-Header', $eheader), $dheader, "decode (RFC2047)"); + +use utf8; + +$dheader=<<'EOS'; +From: 小飼 弾 +To: dankogai@dan.co.jp (小飼=Kogai, 弾=Dan) +Subject: 漢字、カタカナ、ひらがなを含む、非常に長いタイトル行が一体全体どのようにしてEncodeされるのか? +EOS + +my $bheader =<<'EOS'; +From:=?UTF-8?B?IOWwj+mjvCDlvL4g?= +To: dankogai@dan.co.jp (=?UTF-8?B?5bCP6aO8?==Kogai,=?UTF-8?B?IOW8vg==?==Dan + ) +Subject: + =?UTF-8?B?IOa8ouWtl+OAgeOCq+OCv+OCq+ODiuOAgeOBsuOCieOBjOOBquOCkuWQq+OCgA==?= + =?UTF-8?B?44CB6Z2e5bi444Gr6ZW344GE44K/44Kk44OI44Or6KGM44GM5LiA5L2T5YWo?= + =?UTF-8?B?5L2T44Gp44Gu44KI44GG44Gr44GX44GmRW5jb2Rl44GV44KM44KL44Gu44GL?= + =?UTF-8?B?77yf?= +EOS + +my $qheader=<<'EOS'; +From:=?UTF-8?Q?=20=E5=B0=8F=E9=A3=BC=20=E5=BC=BE=20?= +To: dankogai@dan.co.jp (=?UTF-8?Q?=E5=B0=8F=E9=A3=BC?==Kogai, + =?UTF-8?Q?=20=E5=BC=BE?==Dan) +Subject: + =?UTF-8?Q?=20=E6=BC=A2=E5=AD=97=E3=80=81=E3=82=AB=E3=82=BF=E3=82=AB?= + =?UTF-8?Q?=E3=83=8A=E3=80=81=E3=81=B2=E3=82=89=E3=81=8C=E3=81=AA=E3=82=92?= + =?UTF-8?Q?=E5=90=AB=E3=82=80=E3=80=81=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7?= + =?UTF-8?Q?=E3=81=84=E3=82=BF=E3=82=A4=E3=83=88=E3=83=AB=E8=A1=8C=E3=81=8C?= + =?UTF-8?Q?=E4=B8=80=E4=BD=93=E5=85=A8=E4=BD=93=E3=81=A9=E3=81=AE=E3=82=88?= + =?UTF-8?Q?=E3=81=86=E3=81=AB=E3=81=97=E3=81=A6Encode=E3=81=95?= + =?UTF-8?Q?=E3=82=8C=E3=82=8B=E3=81=AE=E3=81=8B=EF=BC=9F?= +EOS + +is(Encode::decode('MIME-Header', $bheader), $dheader, "decode B"); +is(Encode::decode('MIME-Header', $qheader), $dheader, "decode Q"); +is(Encode::encode('MIME-B', $dheader)."\n", $bheader, "encode B"); +is(Encode::encode('MIME-Q', $dheader)."\n", $qheader, "encode Q"); __END__;