Gurusamy Sarathy <gsar@activestate.com>
H.Merijn Brand <h.m.brand@hccnet.nl>
Hugo van der Sanden <hv@crypt.org>
+Inaba Hiroto <inaba@st.rim.or.jp>
Jarkko Hietaniemi <jhi@iki.fi>
Jungshik Shin <jshin@mailaps.org>
Laszlo Molnar <ml1050@freemail.hu>
Philip Newton <pne@cpan.org>
Robin Barker <rmb1@cise.npl.co.uk>
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
+SUZUKI Norio <ZAP00217@nifty.com>
Spider Boardman <spider@web.zk3.dec.com>
Tatsuhiko Miyagawa <miyagawa@edge.co.jp>
Vadim Konovalov <vkonovalov@peterstar.ru>
package Encode::Byte;
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
# Vietnamese
viscii
-
+
# all cp* are also available as ibm-*, ms-*, and windows-*
# also see L<http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp>
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 1.86 2003/01/22 03:29:07 dankogai Exp $
+# $Id: Changes,v 1.87 2003/02/06 01:52:11 dankogai Exp dankogai $
#
-$Revision: 1.86 $ $Date: 2003/01/22 03:29:07 $
+$Revision: 1.87 $ $Date: 2003/02/06 01:52:11 $
+! AUTHORS
+ * Inaba "Sensei" Hirohito added (I thought I had done so long
+ ago, but apparently I had not).
+ * SUZUKI Norio added for various and useful bug reports.
+! Byte/Byte.pm KR/KR.pm Unicode/Unicode.pm
+ lib/Encode/Encoder.pm lib/Encode/CJKConstants.pm
+ podchecked so all warnings are gone except for L<http://>.
+! encoding.pm t/enc_eucjp.t
+ * t/uni/tr_utf8.t now t ok on maintperl (sorry, jhi)
+ * Filter option overhaul
+ * POD revision
+! Encode.pm Encode.xs encengine.c Encode/encode.h
+ lib/Encode/Encoding.pm lib/Encode/JP/JIS7.pm
+ Merged inaba-san's patch that fixes "use encoding 'shiftjis'"
+ without filter. podchecked by Dan Kogai.
+ Message-Id: <3E3BC46B.6C687CFD@st.rim.or.jp>
+! lib/Encode/Alias.pm
+ decode('alias', $1) went wild because of local $_ in find_alias()
+ the evil local $_ is eradicated but that changes find_alias()
+ format for coderef aliasing. See Encode::Alias for details
+ Message-Id: <200302051704.AA00042@kipp0.nifty.com>
+
+1.86 2003/01/22 03:29:07
! encoding.pm
* Don't forget to canonize when you attempt an exact match!
Message-Id: <73E7F801-2DAA-11D7-BF9A-000393AE4244@dan.co.jp>
#
-# $Id: Encode.pm,v 1.86 2003/01/22 03:30:40 dankogai Exp $
+# $Id: Encode.pm,v 1.87 2003/02/06 01:52:11 dankogai Exp dankogai $
#
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.86 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.87 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);
=head1 Handling Malformed Data
-=over 2
-
The I<CHECK> argument is used as follows. When you omit it,
the behaviour is the same as if you had passed a value of 0 for
I<CHECK>.
+=over 2
+
=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
If I<CHECK> is 0, (en|de)code will put a I<substitution character>
HTMLCREF 0x0200
XMLCREF 0x0400
+=back
+
=head2 Unimplemented fallback schemes
In the future, you will be able to use a code reference to a callback
When you encode, the resulting utf8 flag is always off.
-=item
+=item *
When you decode, the resulting utf8 flag is on unless you can
unambiguously represent data. Here is the definition of
/*
- $Id: Encode.xs,v 1.52 2002/11/18 17:28:49 dankogai Exp $
+ $Id: Encode.xs,v 1.53 2003/02/06 01:52:11 dankogai Exp dankogai $
*/
#define PERL_NO_GET_CONTEXT
die "Encode::KR not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
Annex 3 of KS X 1001:1998
iso-2022-kr iso-2022-kr [RFC1557]
--------------------------------------------------------------------
-
+
To find how to use this module in detail, see L<Encode>.
=head1 BUGS
use strict;
use warnings;
-our $VERSION = do { my @r = (q$Revision: 1.37 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.38 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
-------------------------
=back
-
+
This module handles the BOM as follows.
=over 4
endian = 'V';
}
else {
- croak("%"SVf": Unrecognised BOM %"UVxf,
+ croak("%"SVf": Unrecognised BOM %"UVxf,
*hv_fetch((HV *)SvRV(obj),"Name",4,0),
bom);
}
package encoding;
-our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.41 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use strict;
+our $DEBUG = 0;
BEGIN {
if (ord("A") == 193) {
$HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
}
-my %utfs = map {$_=>1}
- qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE);
+sub _exception{
+ my $name = shift;
+ $] > 5.008 and return 0; # 5.8.1 then no
+ my %utfs = map {$_=>1}
+ qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
+ UTF-32 UTF-32BE UTF-32LE);
+ $utfs{$name} or return 0; # UTFs or no
+ require Config; Config->import(); our %Config;
+ return $Config{perl_patchlevel} == 0 # maintperl then no
+}
sub import {
my $class = shift;
my $name = shift;
my %arg = @_;
$name ||= $ENV{PERL_ENCODING};
-
my $enc = find_encoding($name);
unless (defined $enc) {
require Carp;
}
$name = $enc->name; # canonize
unless ($arg{Filter}) {
- ${^ENCODING} = $enc unless $] <= 5.008 and $utfs{$name};
+ $DEBUG and warn "_exception($name) = ", _exception($name);
+ _exception($name) or ${^ENCODING} = $enc;
$HAS_PERLIO or return 1;
- for my $h (qw(STDIN STDOUT)){
- if ($arg{$h}){
- unless (defined find_encoding($arg{$h})) {
- require Carp;
- Carp::croak("Unknown encoding for $h, '$arg{$h}'");
- }
- eval { binmode($h, ":encoding($arg{$h})") };
- }else{
- unless (exists $arg{$h}){
- eval {
- no warnings 'uninitialized';
- binmode($h, ":encoding($name)");
- };
- }
- }
- if ($@){
- require Carp;
- Carp::croak($@);
- }
- }
}else{
defined(${^ENCODING}) and undef ${^ENCODING};
+ # implicitly 'use utf8'
+ require utf8; # to fetch $utf8::hint_bits;
+ $^H |= $utf8::hint_bits;
eval {
require Filter::Util::Call ;
Filter::Util::Call->import ;
- binmode(STDIN);
- binmode(STDOUT);
filter_add(sub{
- my $status;
- if (($status = filter_read()) > 0){
+ my $status = filter_read();
+ if ($status > 0){
+ # $DEBUG and warn $_;
$_ = $enc->decode($_, 1);
- # warn $_;
+ $DEBUG and warn $_;
}
$status ;
});
};
- # warn "Filter installed";
+ } $DEBUG and warn "Filter installed";
+ for my $h (qw(STDIN STDOUT)){
+ if ($arg{$h}){
+ unless (defined find_encoding($arg{$h})) {
+ require Carp;
+ Carp::croak("Unknown encoding for $h, '$arg{$h}'");
+ }
+ eval { binmode($h, ":encoding($arg{$h})") };
+ }else{
+ unless (exists $arg{$h}){
+ eval {
+ no warnings 'uninitialized';
+ binmode($h, ":encoding($name)");
+ };
+ }
+ }
+ if ($@){
+ require Carp;
+ Carp::croak($@);
+ }
}
return 1; # I doubt if we need it, though
}
# an alternate way, Filter
use encoding "euc-jp", Filter=>1;
- use utf8;
# now you can use kanji identifiers -- in euc-jp!
=head1 ABSTRACT
first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
completely off.
+=item use encoding I<ENCNAME> Filter=E<gt>1;
+
+This turns the encoding pragma into a source filter. While the
+default approach just decodes interpolated literals (in qq() and
+qr()), this will apply a source filter to the entire source code. See
+L</"The Filter Option"> below for details.
+
=item no encoding;
Unsets the script encoding. The disciplines of STDIN, STDOUT are
=back
+=head1 The Filter Option
+
+The magic of C<use encoding> is not applied to the names of
+identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
+is a single Han ideograph) work, you still need to write your script
+in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
+
+
+What does this mean? Your source code behaves as if it is written in
+UTF-8 with 'use utf8' in effect. So even if your editor only supports
+Shift_JIS, for example, you can still try examples in Chapter 15 of
+C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
+identifiers.
+
+This option is significantly slower and (as of this writing) non-ASCII
+identifiers are not very stable WITHOUT this option and with the
+source code written in UTF-8.
+
+=head2 Filter-related changes at Encode version 1.87
+
+=over
+
+=item *
+
+The Filter option now sets up STDIN and STDOUT just as the non-filter
+mode does, and C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >>
+work as they do in the non-filter version.
+
+=item *
+
+C<use utf8> is implicitly declared so you no longer have to C<use
+utf8> to C<${"\x{4eba}"}++>.
+
+=back
+
=head1 CAVEATS
=head2 NOT SCOPED
So feel free to put your strings in your encoding in quotes and
regexes.
-=head2 tr/// with ranges remain unaffected
+=head2 format doesn't work well
+
+This pragma doesn't work well with format because PerlIO does not
+get along very well with it. When format contains non-ascii
+characters it prints funny or gets "wide character warnings".
+To understand it, try the code below.
+
+ # Save this one in utf8
+ # replace *non-ascii* with a non-ascii string
+ my $camel;
+ format STDOUT =
+ *non-ascii*@>>>>>>>
+ $camel
+ .
+ $camel = "*non-ascii*";
+ binmode(STDOUT=>':encoding(utf8)'); # bang!
+ write; # funny
+ print $camel, "\n"; # fine
+
+Without binmode this happens to work, but then print() fails
+instead of write().
+
+At any rate, the very use of format is questionable when it comes to
+unicode characters since you have to consider such things as character
+width (i.e. double-width for ideographs) and directions (i.e. BIDI for
+Arabic and Hebrew).
+
+=head2 tr/// with ranges
The B<encoding> pragma works by decoding string literals in
-C<q//,qq//,qr//,qw///, qx//> and so forth. As of perl 5.8.0, this
+C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
does not apply to C<tr///>. Therefore,
use encoding 'euc-jp';
=back
+This counterintuitive behavior has been fixed in perl 5.8.1 and up
+by INABA Hirohito.
+
=head3 workaround to tr///;
-You can, however, achieve the same as simply as follows;
+In perl 5.8.0, you can work around it as follows:
use encoding 'euc-jp';
- # ....
+ # ....
eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind
Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
C<tr///> not being decoded was obviously against the will of Perl5
-Porters. In future version of perl, this counter-intuitive behaviour
-of C<tr///> will be fixed so C<eval qq{}> trick will be unneccesary.
-
-=head1 Non-ASCII Identifiers and Filter option
-
-The magic of C<use encoding> is not applied to the names of
-identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
-is a single Han ideograph) work, you still need to write your script
-in UTF-8 or use a source filter.
-
-In other words, the same restriction as with Jperl applies.
-
-If you dare to experiment, however, you can try the Filter option.
-
-=over 4
-
-=item use encoding I<ENCNAME> Filter=E<gt>1;
-
-This turns the encoding pragma into a source filter. While the default
-approach just decodes interpolated literals (in qq() and qr()), this
-will apply a source filter to the entire source code. In this case,
-STDIN and STDOUT remain untouched.
-
-=back
-
-What does this mean? Your source code behaves as if it is written in
-UTF-8. So even if your editor only supports Shift_JIS, for example,
-you can still try examples in Chapter 15 of C<Programming Perl, 3rd
-Ed.>. For instance, you can use UTF-8 identifiers.
-
-This option is significantly slower and (as of this writing) non-ASCII
-identifiers are not very stable WITHOUT this option and with the
-source code written in UTF-8.
-
-To make your script in legacy encoding work with minimum effort,
-do not use Filter=E<gt>1.
+Porters so it has been fixed.
=head1 EXAMPLE - Greekperl
=head1 KNOWN PROBLEMS
+=over
+
+=item *
+
For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.
+=item *
+
The encoding pragma is not supported on EBCDIC platforms.
(Porters who are willing and able to remove this limitation are
welcome.)
+=back
+
=head1 SEE ALSO
L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
package Encode::Alias;
use strict;
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.34 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
use base qw(Exporter);
sub find_alias
{
my $class = shift;
- local $_ = shift;
- unless (exists $Alias{$_})
+ my $find = shift;
+ unless (exists $Alias{$find})
{
- $Alias{$_} = undef; # Recursion guard
+ $Alias{$find} = undef; # Recursion guard
for (my $i=0; $i < @Alias; $i += 2)
{
my $alias = $Alias[$i];
my $val = $Alias[$i+1];
my $new;
- if (ref($alias) eq 'Regexp' && $_ =~ $alias)
+ if (ref($alias) eq 'Regexp' && $find =~ $alias)
{
$DEBUG and warn "eval $val";
$new = eval $val;
- # $@ and warn "$val, $@";
+ $DEBUG and $@ and warn "$val, $@";
}
elsif (ref($alias) eq 'CODE')
{
- $DEBUG and warn "$alias", "->", "($val)";
- $new = $alias->($val);
+ $DEBUG and warn "$alias", "->", "($find)";
+ $new = $alias->($find);
}
- elsif (lc($_) eq lc($alias))
+ elsif (lc($find) eq lc($alias))
{
$new = $val;
}
if (defined($new))
{
- next if $new eq $_; # avoid (direct) recursion on bugs
+ next if $new eq $find; # avoid (direct) recursion on bugs
$DEBUG and warn "$alias, $new";
my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
if ($enc)
{
- $Alias{$_} = $enc;
+ $Alias{$find} = $enc;
last;
}
}
}
if ($DEBUG){
my $name;
- if (my $e = $Alias{$_}){
+ if (my $e = $Alias{$find}){
$name = $e->name;
}else{
$name = "";
}
- warn "find_alias($class, $_)->name = $name";
+ warn "find_alias($class, $find)->name = $name";
}
- return $Alias{$_};
+ return $Alias{$find};
}
sub define_alias
way to alias names as used in X11 fonts to the MIME names for the
iso-8859-* family. Note the double quotes inside the single quotes.
+(Or rather, you don't have to do this yourself, because this example is predefined.)
+
If you are using a regex here, you have to use the quotes as shown or
it won't work. Also note that regex handling is tricky even for the
-experienced. Use it with caution.
+experienced. Use this feature with caution.
=item As a code reference, e.g.:
- define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
+ define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
-In this case, C<$_> will be set to the name that is being looked up and
-I<ENCODING> is passed to the sub as its first argument. The example
-is another way to alias names as used in X11 fonts to the MIME names
-for the iso-8859-* family.
+This has the same effect as the example above, in a different way. The coderef
+takes the alias name as an argument and returns a canonical name on
+success or undef if not. Note the second argument is not required.
+Use this with even more caution than the regex version.
=back
+=head3 Changes in code reference aliasing
+
+As of Encode 1.87, the older form
+
+ define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
+
+no longer works.
+
+Encode up to 1.86 internally used "local $_" to implement this older
+form. But consider the code below:
+
+ use Encode;
+ $_ = "eeeee" ;
+ while (/(e)/g) {
+ my $utf = decode('aliased-encoding-name', $1);
+ print "position:",pos,"\n";
+ }
+
+With Encode 1.86 and earlier this fails because of "local $_".
+
=head2 Alias overloading
You can override predefined aliases by simply applying define_alias().
#
-# $Id: CJKConstants.pm,v 1.0 2002/03/28 23:26:28 dankogai Exp $
+# $Id: CJKConstants.pm,v 1.1 2003/02/06 01:52:11 dankogai Exp dankogai $
#
package Encode::CJKConstants;
use strict;
-our $RCSID = q$Id: CJKConstants.pm,v 1.0 2002/03/28 23:26:28 dankogai Exp $;
-our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $RCSID = q$Id: CJKConstants.pm,v 1.1 2003/02/06 01:52:11 dankogai Exp dankogai $;
+our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Carp;
=head1 NAME
- Encode::CJKConstants.pm -- Internally used by Encode::??::ISO_2022_*
+Encode::CJKConstants.pm -- Internally used by Encode::??::ISO_2022_*
=cut
#
-# $Id: Encoder.pm,v 0.5 2002/04/22 02:45:50 dankogai Exp $
+# $Id: Encoder.pm,v 0.6 2003/02/06 01:52:11 dankogai Exp dankogai $
#
package Encode::Encoder;
use strict;
use warnings;
-our $VERSION = do { my @r = (q$Revision: 0.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
require Exporter;
our @ISA = qw(Exporter);
Encode::Encoder -- Object Oriented Encoder
=head1 SYNOPSIS
-
+
use Encode::Encoder;
# Encode::encode("ISO-8859-1", $data);
Encode::Encoder->new($data)->iso_8859_1; # OOP way
package Encode::Encoding;
# Base class for classes which implement encodings
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
require Encode;
I<$offset> will be modified to the last $octets position at end of decode.
Returns true if $terminator appears in the output, else returns false.
+=back
+
=head2 Other methods defined in Encode::Encoding
You do not have to override methods shown below unless you have to.
package Encode::JP::JIS7;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.10 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode qw(:fallbacks);
open $src, "<$src_enc" or die "$src_enc : $!";
- binmode($src); # needed if UTF-8 locales enabled!
+ # binmode($src); # not needed!
$txt = join('',<$src>);
close($src);
-# $Id: enc_eucjp.t,v 1.1 2003/01/21 22:19:14 dankogai Exp $
+# $Id: enc_eucjp.t,v 1.2 2003/02/06 01:52:11 dankogai Exp dankogai $
# This is the twin of enc_utf8.t, the only difference is that
# this has "use encoding 'euc-jp'".
BEGIN {
- if ($] <= 5.008){
- print "1..0 # Skip: Perl 5.8.1 or later required\n";
- exit 0;
- }
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
print "1..0 # encoding pragma does not support EBCDIC platforms\n";
exit(0);
}
+ if ($] <= 5.008 and !$Config{perl_patchlevel}){
+ print "1..0 # Skip: Perl 5.8.1 or later required\n";
+ exit 0;
+ }
}
use encoding 'euc-jp';