ext/Encode/Encode.pm Mother of all Encode extensions
ext/Encode/Encode.xs Encode extension
ext/Encode/Encode/Changes.e2x Skeleton file for enc2xs
+ext/Encode/Encode/ConfigLocal_PM.e2x Skeleton file for enc2xs
ext/Encode/Encode/encode.h Encode extension header file
ext/Encode/Encode/Makefile_PL.e2x Skeleton file for enc2xs
ext/Encode/Encode/README.e2x Skeleton file for enc2xs
ext/Encode/lib/Encode/Config.pm Encode configuration module
ext/Encode/lib/Encode/Encoder.pm OO Encoder
ext/Encode/lib/Encode/Encoding.pm Encode extension
-ext/Encode/lib/Encode/JP/2022_JP.pm Encode extension
-ext/Encode/lib/Encode/JP/2022_JP1.pm Encode extension
ext/Encode/lib/Encode/JP/H2Z.pm Encode extension
-ext/Encode/lib/Encode/JP/JIS.pm Encode extension
+ext/Encode/lib/Encode/JP/JIS7.pm Encode extension
ext/Encode/lib/Encode/KR/2022_KR.pm Encode extension
ext/Encode/lib/Encode/Supported.pod Documents supported encodings
ext/Encode/lib/Encode/Unicode.pm Encode extension
Andreas J. Koenig <andreas.koenig@anima.de>
Anton Tagunov <tagunov@motor.ru>
Autrijus Tang <autrijus@autrijus.org>
+Benjamin Goldberg <goldbb2@earthlink.net>
Craig A. Berry <craigberry@mac.com>
Dan Kogai <dankogai@dan.co.jp>
Gerrit P. Haase <gp@familiehaase.de>
die "Encode::CN not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use Encode::CN::HZ;
# Relocated from Encode.pm
+use Encode::CN::HZ;
# use Encode::CN::2022_CN;
1;
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 1.34 2002/04/12 20:23:05 dankogai Exp dankogai $
+# $Id: Changes,v 1.40 2002/04/14 22:27:14 dankogai Exp $
#
-1.34 $Date: 2002/04/12 20:23:05 $ (Unreleased)
+1.40 $Date: 2002/04/14 22:27:14 $
++ Encode/ConfigLocal_PM.e2x
+! lib/Encode/Config.pm
+! bin/enc2xs
+ "enc2xs -C" now generates/updates Encode::ConfigLocal.
+ ConfigLocal_PM.e2x is a skelton thereof.
+! lib/Encode/Config.pm
+! CN/CN.pm
+ "use Encode::CN::HZ;" was missing.
+! t/Unicode.t
+! t/unibench.t
+ More rigorous tests added to test XS, especially on memory allocation.
+! Encode.xs
+! lib/Encode/Unicode.pm
+ NI-S implemented an XS version -- merged
+ Message-Id: <20020414154857.2066.4@bactrian.ni-s.u-net.com>
+! encoding.pm
+! t/jperl.t
+ Source filter option added. With this option on, you can write
+ perl 5.8-savvy scripts (such as UTF-8 identifiers) in legacy
+ encodings. t/jperl.t enhanced to test this feature.
+! t/Unicode.t
+ ok() gotcha addressed by Benjamin fixed. Though I didn't exactly
+ apply his suggestion, this degree of nitting is enough to add him
+ to AUTHORS list.
+ Message-Id: <3CB93223.291E5E2E@earthlink.net>
+! JP/JP.pm
++ lib/Encode/JP/JIS7.pm
+- lib/Encode/JP/JIS.pm
+- lib/Encode/JP/2022_JP.pm
+- lib/Encode/JP/2022_JP1.pm
+ 7bit-jis, iso-2022-jp and iso-2022-jp1 are all aggregated to
+ JIS7.pm for better maintainability and performance
+! encoding.pm
+ Added caveat for non-ascii identifiers.
+! encoding.pm
+ fixes by jhi, the original author of this pragramtic module.
+ Message-Id: <20020413231527.V1826@alpha.hut.fi>
+
+1.34 2002/04/12 20:23:05 (Unreleased)
! Encode.pm
! t/Unicode.t
EBCDIC fixes addressed by jhi.
! AUTHORS
! t/Encoder.t
! lib/Encode/Encoder.pm
- s/= shift;/= @_/g # trivial but a common idiomatic typo :)
+ s/ = shift;/ = @_;/ # trivial but a common idiomatic typo :)
This adds Miyagawa-kun to AUTHORS.
* encoding() no longer exported by default but on demand
* t/Encoder.t updated to test all these
Typo fixes and improvements by jhi
Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al.
-1.11 $Date: 2002/04/12 20:23:05 $
+1.11 $Date: 2002/04/14 22:27:14 $
+ t/encoding.t
+ t/jperl.t
! MANIFEST
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.34 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
require DynaLoader;
# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
-use Encode::Config;
+our %ExtModule;
+require Encode::Config;
+eval { require Encode::ConfigLocal };
sub encodings
{
$Encode::Encoding{utf8} =
bless {Name => "utf8"} => "Encode::utf8";
}
- # do externals if necessary
- require File::Basename;
- require File::Spec;
- for my $ext (qw()){
- my $pm =
- File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
- "Encode", "$ext.pm");
- do $pm;
- }
}
require Encode::Encoding;
L<utf8>,
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
+head2 MAINTAINER
+
+This project was originated by Nick Ing-Simmons and later maintained
+by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for full list
+of people involved. For any questions, use
+E<lt>perl-unicode@perl.orgE<gt> so others can share.
+
=cut
#include "encode.h"
#include "def_t.h"
+#define FBCHAR 0xFFFd
+#define BOM_BE 0xFeFF
+#define BOM16LE 0xFFFe
+#define BOM32LE 0xFFFe0000
+
+#define valid_ucs2(x) ((0 <= (x) && (x) < 0xD800) || (0xDFFF < (x) && (x) <= 0xFFFF))
+
+#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
+#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
+#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
+
+static UV
+enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
+{
+ U8 *s = *sp;
+ UV v = 0;
+ if (s+size > e) {
+ croak("Partial character %c",(char) endian);
+ }
+ switch(endian) {
+ case 'N':
+ v = *s++;
+ v = (v << 8) | *s++;
+ case 'n':
+ v = (v << 8) | *s++;
+ v = (v << 8) | *s++;
+ break;
+ case 'V':
+ case 'v':
+ v |= *s++;
+ v |= (*s++ << 8);
+ if (endian == 'v')
+ break;
+ v |= (*s++ << 16);
+ v |= (*s++ << 24);
+ break;
+ default:
+ croak("Unknown endian %c",(char) endian);
+ break;
+ }
+ *sp = s;
+ return v;
+}
+
+void
+enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
+{
+ U8 *d = SvGROW(result,SvCUR(result)+size);
+ switch(endian) {
+ case 'v':
+ case 'V':
+ d += SvCUR(result);
+ SvCUR_set(result,SvCUR(result)+size);
+ while (size--) {
+ *d++ = value & 0xFF;
+ value >>= 8;
+ }
+ break;
+ case 'n':
+ case 'N':
+ SvCUR_set(result,SvCUR(result)+size);
+ d += SvCUR(result);
+ while (size--) {
+ *--d = value & 0xFF;
+ value >>= 8;
+ }
+ break;
+ default:
+ croak("Unknown endian %c",(char) endian);
+ break;
+ }
+}
+
#define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
t/encoding.t dumps core because of
Perl_warner and PerlIO don't work well */
XSRETURN(1);
}
+MODULE = Encode PACKAGE = Encode::Unicode
+
+void
+decode_xs(obj, str, chk = &PL_sv_undef)
+SV * obj
+SV * str
+SV * chk
+CODE:
+{
+ int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
+ U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
+ int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
+ SV *result = newSVpvn("",0);
+ STRLEN ulen;
+ U8 *s = SvPVbyte(str,ulen);
+ U8 *e = SvEND(str);
+ ST(0) = sv_2mortal(result);
+ SvUTF8_on(result);
+
+ if (!endian && s+size <= e) {
+ UV bom;
+ endian = (size == 4) ? 'N' : 'n';
+ bom = enc_unpack(aTHX_ &s,e,size,endian);
+ if (bom != BOM_BE) {
+ if (bom == BOM16LE) {
+ endian = 'v';
+ }
+ else if (bom == BOM32LE) {
+ endian = 'V';
+ }
+ else {
+ croak("%s:Unregognised BOM %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom);
+ }
+ }
+#if 0
+ /* Update endian for this sequence */
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#endif
+ }
+ while (s < e && s+size <= e) {
+ UV ord = enc_unpack(aTHX_ &s,e,size,endian);
+ U8 *d;
+ if (size != 4 && !valid_ucs2(ord)) {
+ if (ucs2) {
+ if (SvTRUE(chk)) {
+ croak("%s:no surrogates allowed %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+ }
+ if (s+size <= e) {
+ enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */
+ }
+ ord = FBCHAR;
+ }
+ else {
+ UV lo;
+ if (!isHiSurrogate(ord)) {
+ croak("%s:Malformed HI surrogate %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+ }
+ if (s+size > e) {
+ /* Partial character */
+ s -= size; /* back up to 1st half */
+ break; /* And exit loop */
+ }
+ lo = enc_unpack(aTHX_ &s,e,size,endian);
+ if (!isLoSurrogate(lo)){
+ croak("%s:Malformed LO surrogate %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+ }
+ ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
+ }
+ }
+ d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
+ d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
+ SvCUR_set(result,d - (U8 *)SvPVX(result));
+ }
+ if (SvTRUE(chk)) {
+ if (s < e) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
+ Move(s,SvPVX(str),e-s,U8);
+ SvCUR_set(str,(e-s));
+ }
+ else {
+ SvCUR_set(str,0);
+ }
+ *SvEND(str) = '\0';
+ }
+ XSRETURN(1);
+}
+
+void
+encode_xs(obj, utf8, chk = &PL_sv_undef)
+SV * obj
+SV * utf8
+SV * chk
+CODE:
+{
+ int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
+ U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
+ int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
+ SV *result = newSVpvn("",0);
+ STRLEN ulen;
+ U8 *s = SvPVutf8(utf8,ulen);
+ U8 *e = SvEND(utf8);
+ ST(0) = sv_2mortal(result);
+ if (!endian) {
+ endian = (size == 4) ? 'N' : 'n';
+ enc_pack(aTHX_ result,size,endian,BOM_BE);
+#if 0
+ /* Update endian for this sequence */
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#endif
+ }
+ while (s < e && s+UTF8SKIP(s) <= e) {
+ STRLEN len;
+ UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
+ s += len;
+ if (size != 4 && !valid_ucs2(ord)) {
+ if (!issurrogate(ord)){
+ if (ucs2) {
+ if (SvTRUE(chk)) {
+ croak("%s:code point \"\\x{"UVxf"}\" too high",
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+ }
+ enc_pack(aTHX_ result,size,endian,FBCHAR);
+ }else{
+ UV hi = ((ord - 0x10000) >> 10) + 0xD800;
+ UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
+ enc_pack(aTHX_ result,size,endian,hi);
+ enc_pack(aTHX_ result,size,endian,lo);
+ }
+ }
+ else {
+ /* not supposed to happen */
+ enc_pack(aTHX_ result,size,endian,FBCHAR);
+ }
+ }
+ else {
+ enc_pack(aTHX_ result,size,endian,ord);
+ }
+ }
+ if (SvTRUE(chk)) {
+ if (s < e) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
+ Move(s,SvPVX(utf8),e-s,U8);
+ SvCUR_set(utf8,(e-s));
+ }
+ else {
+ SvCUR_set(utf8,0);
+ }
+ *SvEND(utf8) = '\0';
+ }
+ XSRETURN(1);
+}
+
MODULE = Encode PACKAGE = Encode
PROTOTYPES: ENABLE
--- /dev/null
+#
+# Local demand-load module list
+#
+# You should not edit this file by hand! use "enc2xs -C"
+#
+package Encode::ConfigLocal;
+our $VERSION = $_LocalVer_;
+
+use strict;
+
+$_ModLines_
+
+1;
}
}
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load('Encode::JP',$VERSION);
-use Encode::JP::JIS;
-use Encode::JP::2022_JP;
-use Encode::JP::2022_JP1;
+use Encode::JP::JIS7;
1;
__END__
Encode.pm Mother of all Encode extensions
Encode.xs Encode extension
Encode/Changes.e2x Skeleton file for enc2xs
+Encode/ConfigLocal_PM.e2x Skeleton file for enc2xs
Encode/Makefile_PL.e2x Skeleton file for enc2xs
Encode/README.e2x Skeleton file for enc2xs
Encode/_PM.e2x Skeleton file for enc2xs
lib/Encode/Config.pm Encode configuration module
lib/Encode/Encoder.pm OO Encoder
lib/Encode/Encoding.pm Encode extension
-lib/Encode/JP/2022_JP.pm Encode extension
-lib/Encode/JP/2022_JP1.pm Encode extension
lib/Encode/JP/H2Z.pm Encode extension
-lib/Encode/JP/JIS.pm Encode extension
+lib/Encode/JP/JIS7.pm Encode extension
lib/Encode/KR/2022_KR.pm Encode extension
lib/Encode/Supported.pod Documents supported encodings
lib/Encode/Unicode.pm Encode extension
This module requires perl5.7.3 or later.
+MAINTAINER
+
+This project was originated by Nick Ing-Simmons and later maintained by
+Dan Kogai <dankogai@dan.co.jp>. See AUTHORS for full list of people
+involved.
+
QUESTIONS?
If you have any questions "perldoc Encode" does not answer, please
use strict;
use Getopt::Std;
my @orig_ARGV = @ARGV;
-our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
# -o <output> to specify the output file name (else it's the first arg)
# -f <inlist> to give a file with a list of input files (else use the args)
# -n <name> to name the encoding (else use the basename of the input file.
-getopts('M:SQqOo:f:n:',\%opt);
+getopts('CM:SQqOo:f:n:',\%opt);
$opt{M} and make_makefile_pl($opt{M}, @ARGV);
+$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
# This really should go first, else the die here causes empty (non-erroneous)
# output files to be written.
$_Name = shift;
$_TableFiles = join(",", map {qq('$_')} @_);
$_Now = scalar localtime();
+ eval { require File::Spec; };
warn "Generating Makefile.PL\n";
- _print_expand("$_Inc/Makefile_PL.e2x", "Makefile.PL");
+ _print_expand(File::Spec->catfile($_Inc,"Makefile_PL.e2x"),"Makefile.PL");
warn "Generating $_Name.pm\n";
- _print_expand("$_Inc/_PM.e2x", "$_Name.pm");
+ _print_expand(File::Spec->catfile($_Inc,"_PM.e2x"), "$_Name.pm");
warn "Generating t/$_Name.t\n";
- _print_expand("$_Inc/_T.e2x", "t/$_Name.t");
+ _print_expand(File::Spec->catfile($_Inc,"_T.e2x"), "t/$_Name.t");
warn "Generating README\n";
- _print_expand("$_Inc/README.e2x", "README");
+ _print_expand(File::Spec->catfile($_Inc,"README.e2x"), "README");
warn "Generating t/$_Name.t\n";
- _print_expand("$_Inc/Changes.e2x", "Changes");
+ _print_expand(File::Spec->catfile($_Inc,"Changes.e2x"), "Changes");
exit;
}
+use vars qw(
+ $_ModLines
+ $_LocalVer
+ );
+
+sub make_configlocal_pm
+{
+ eval { require Encode; };
+ $@ and die "Unable to require Encode: $@\n";
+ eval { require File::Spec; };
+ # our used for variable expanstion
+ my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
+ my %LocalMod = ();
+ for my $d (@INC){
+ my $inc = File::Spec->catfile($d, "Encode");
+ -d $inc or next;
+ opendir my $dh, $inc or die "$inc:$!";
+ warn "Checking $inc...\n";
+ for my $f (grep /\.pm$/o, readdir($dh)){
+ -f File::Spec->catfile($inc, "$f") or next;
+ $INC{"Encode/$f"} and next;
+ warn "require Encode/$f;\n";
+ eval { require "Encode/$f"; };
+ $@ and die "Can't require Encode/$f: $@\n";
+ for my $enc (Encode->encodings()){
+ $in_core{$enc} and next;
+ $Encode::Config::ExtModule{$enc} and next;
+ my $mod = "Encode/$f";
+ $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
+ warn "$enc => $mod\n";
+ $LocalMod{$enc} = $mod;
+ }
+ }
+ }
+ $_ModLines = "";
+ for my $enc (sort keys %LocalMod){
+ $_ModLines .=
+ qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
+ }
+ $_LocalVer = _mkversion();
+ $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
+ warn "Writing Encode::ConfigLocal\n";
+ _print_expand(File::Spec->catfile($_Inc,"ConfigLocal_PM.e2x"),
+ File::Spec->catfile($_Inc,"ConfigLocal.pm"));
+ exit;
+}
+
+sub _mkversion{
+ my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
+ $yyyy += 1900, $mo +=1;
+ return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
+}
+
sub _print_expand{
eval { require File::Basename; };
$@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
=head1 SYNOPSIS
- enc2xs -M ModName mapfiles...
enc2xs -[options]
+ enc2xs -M ModName mapfiles...
+ enc2xs -C
=head1 DESCRIPTION
If you are content with the test result, just "make install"
+=item 7.
+
+If you want to add your encoding to Encode demand-loading list
+(so you don't have to "use Encode::YourEncoding"), run
+
+ enc2xs -C
+
+to update Encode::ConfigLocal, a module that controls local settings.
+After that, "use Encode;" is enough to load your encodings on demand.
+
=back
=head1 The Unicode Character Map
package encoding;
-our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use strict;
require Carp;
Carp::croak "Unknown encoding '$name'";
}
- ${^ENCODING} = $enc; # this is all you need, actually.
-
- # $_OPEN_ORIG = ${^OPEN};
- for my $h (qw(STDIN STDOUT STDERR)){
- if ($arg{$h}){
- unless (defined find_encoding($name)) {
+ unless ($arg{Filter}){
+ ${^ENCODING} = $enc; # this is all you need, actually.
+ for my $h (qw(STDIN STDOUT)){
+ if ($arg{$h}){
+ unless (defined find_encoding($arg{h})) {
+ require Carp;
+ Carp::croak "Unknown encoding for $h, '$arg{$h}'";
+ }
+ eval qq{ binmode($h, ":encoding($arg{$h})") };
+ }else{
+ unless (exists $arg{$h}){
+ eval qq{ binmode($h, ":encoding($name)") };
+ }
+ }
+ if ($@){
require Carp;
- Carp::croak "Unknown encoding for $h, '$arg{$h}'";
+ Carp::croak($@);
}
- eval qq{ binmode($h, ":encoding($arg{$h})") };
- }else{
- eval qq{ binmode($h, ":encoding($name)") };
- }
- if ($@){
- require Carp;
- Carp::croak($@);
}
+ }else{
+ defined(${^ENCODING}) and undef ${^ENCODING};
+ eval {
+ require Filter::Util::Call ;
+ Filter::Util::Call->import ;
+ binmode(STDIN, ":raw");
+ binmode(STDOUT, ":raw");
+ filter_add(sub{
+ my $status;
+ if (($status = filter_read()) > 0){
+ $_ = $enc->decode($_, 1);
+ # warn $_;
+ }
+ $status ;
+ });
+ };
+ # warn "Filter installed";
}
return 1; # I doubt if we need it, though
}
undef ${^ENCODING};
binmode(STDIN, ":raw");
binmode(STDOUT, ":raw");
- # Leaves STDERR alone.
- # binmode(STDERR, ":raw");
+ if ($INC{"Filter/Util/Call.pm"}){
+ eval { filter_del() };
+ }
}
1;
# "no encoding;" supported (but not scoped!)
no encoding;
+ # an alternate way, Filter
+ use encoding "euc-jp", Filter=>1;
+ use utf8;
+ # now you can use kanji identifiers -- in euc-jp!
+
=head1 ABSTRACT
Perl 5.6.0 has introduced Unicode support. You could apply
Note that non-STD file handles remain unaffected. Use C<use open> or
C<binmode> to change disciplines of those.
-=item use encoding I<ENCNAME> [ STDIN => I<ENCNAME_IN> ...] ;
+=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
-You can also individually set encodings of STDIN, STDOUT, and STDERR
-via STDI<FH> => I<ENCNAME_FH> form. In this case, you cannot omit the
-first I<ENCNAME>.
+You can also individually set encodings of STDIN and STDOUT via
+STDI<FH> =E<gt> I<ENCNAME_FH> form. In this case, you cannot omit the
+first I<ENCNAME>. C<STDI<FH> =E<gt> undef> turns IO transcoding
+completely off.
=item no encoding;
resort to \x... just to spell your name in native encoding. So feel
free to put your strings in your encoding in quotes and regexes.
+=head1 NON-ASCII Identifiers and Filter option
+
+The magic of C<use encoding> is not applied to the names of identifiers.
+In order to make C<${"4eba"}++> ($man++, where man is a single ideograph)
+work, you still need to write your script in UTF-8 or use a source filter.
+
+In other words, the same restriction as Jperl applies.
+
+If you dare experiment, however, you can try Fitlter option.
+
+=over 4
+
+=item use encoding I<ENCNAME> Filter=E<gt>1;
+
+This turns encoding pragma into source filter. While the default
+approach just decodes interpolated literals (in qq() and qr()), this
+will apply source filter to entire source code. In this case, STDIN
+and STDOUT remain untouched.
+
+=back
+
+What does this mean? Your source code behaves as if it is written
+in UTF-8. So even if your editor only supports Shift_JIS, for
+example. You can still try examples in Chapter 15 of
+C<Programming Perl, 3rd Ed.> For instance, you can use UTF-8
+identifiers.
+
+This option is significantly slower and (as of this writing) non-ASCII
+identifiers are not very stable WITHOUT this option and with the
+source code written in UTF-8.
+
+To make your script in legacy encoding work with minimum effort, do
+not use Filter=E<gt>1
+
+
=head1 EXAMPLE - Greekperl
use encoding "iso 8859-7";
=head1 SEE ALSO
-L<perlunicode>, L<Encode>, L<open>
+L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
+
+Ch. 15 of C<Programming Perl (3rd Edition)>
+by Larry Wall, Tom Christiansen, Jon Orwant;
+O'Reilly & Associates; ISBN 0-596-00027-8
=cut
# Demand-load module list
#
package Encode::Config;
-our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use strict;
-require Exporter;
our %ExtModule =
(
'euc-cn' => 'Encode::CN',
'gb12345-raw' => 'Encode::CN',
'gb2312-raw' => 'Encode::CN',
+ 'hz' => 'Encode::CN',
'iso-ir-165' => 'Encode::CN',
'cp936' => 'Encode::CN',
'MacChineseSimp' => 'Encode::CN',
'cp950' => 'Encode::TW',
'MacChineseTrad' => 'Encode::TW',
- 'big5plus' => 'Encode::HanExtra',
- 'euc-tw' => 'Encode::HanExtra',
- 'gb18030' => 'Encode::HanExtra',
+ #'big5plus' => 'Encode::HanExtra',
+ #'euc-tw' => 'Encode::HanExtra',
+ #'gb18030' => 'Encode::HanExtra',
);
}
-*Encode::ExtModule = \%ExtModule;
+#
+# Why not export ? to keep ConfigLocal Happy!
+#
+while (my ($enc,$mod) = each %ExtModule){
+ $Encode::ExtModule{$enc} = $mod;
+}
1;
#
-# $Id: Encoder.pm,v 0.4 2002/04/12 20:23:05 dankogai Exp dankogai $
+# $Id: Encoder.pm,v 0.4 2002/04/12 20:23:05 dankogai Exp $
#
package Encode::Encoder;
use strict;
+++ /dev/null
-package Encode::JP::2022_JP;
-use Encode::JP;
-use Encode::JP::JIS;
-use Encode::JP::H2Z;
-use base 'Encode::Encoding';
-
-use vars qw($VERSION);
-$VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
-
-my $canon = 'iso-2022-jp';
-my $obj = bless {name => $canon}, __PACKAGE__;
-$obj->Define($canon);
-
-sub name { return $_[0]->{name}; }
-
-#
-# decode is identical to 7bit-jis
-#
-
-sub decode
-{
- my ($obj,$str,$chk) = @_;
- return Encode::decode('7bit-jis', $str, $chk);
-}
-
-# iso-2022-jp = 7bit-jis with all x201 (Hankaku) converted to
-# x208 equivalent (Zenkaku)
-
-sub encode
-{
- my ($obj,$str,$chk) = @_;
- my $euc = Encode::encode('euc-jp', $str, $chk);
- &Encode::JP::H2Z::h2z(\$euc);
- return &Encode::JP::JIS::euc_jis_nox0212(\$euc);
-}
-
-1;
-__END__
-
-=head1 NAME
-
-Encode::JP::2022_JP -- internally used by Encode::JP
-
-=cut
+++ /dev/null
-package Encode::JP::2022_JP1;
-use Encode::JP;
-use Encode::JP::JIS;
-use Encode::JP::H2Z;
-use base 'Encode::Encoding';
-
-use vars qw($VERSION);
-$VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
-
-my $canon = 'iso-2022-jp-1';
-my $obj = bless {name => $canon}, __PACKAGE__;
-$obj->Define($canon);
-
-sub name { return $_[0]->{name}; }
-
-#
-# decode is identical to 7bit-jis
-#
-
-sub decode
-{
- my ($obj,$str,$chk) = @_;
- return Encode::decode('7bit-jis', $str, $chk);
-}
-
-# iso-2022-jp = 7bit-jis with all x201 (Hankaku) converted to
-# x208 equivalent (Zenkaku)
-
-sub encode
-{
- my ($obj,$str,$chk) = @_;
- my $euc = Encode::encode('euc-jp', $str, $chk);
- &Encode::JP::H2Z::h2z(\$euc);
- return &Encode::JP::JIS::euc_jis(\$euc);
-}
-
-1;
-__END__
-
-
-=head1 NAME
-
-Encode::JP::2022_JP1 -- internally used by Encode::JP
-
-=cut
+++ /dev/null
-package Encode::JP::JIS;
-use Encode::JP;
-use base 'Encode::Encoding';
-
-use strict;
-
-our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
-
-# Just for the time being, we implement jis-7bit
-# encoding via EUC
-
-my $canon = '7bit-jis';
-my $obj = bless {name => $canon}, __PACKAGE__;
-$obj->Define($canon);
-
-sub name { return $_[0]->{name}; }
-
-sub decode
-{
- my ($obj,$str,$chk) = @_;
- my $res = $str;
- jis_euc(\$res);
- return Encode::decode('euc-jp', $res, $chk);
-}
-
-sub encode
-{
- my ($obj,$str,$chk) = @_;
- my $res = Encode::encode('euc-jp', $str, $chk);
- euc_jis(\$res);
- return $res;
-}
-
-use Encode::CJKConstants qw(:all);
-
-# JIS<->EUC
-
-sub jis_euc {
- my $r_str = shift;
- $$r_str =~ s(
- ($RE{JIS_0212}|$RE{JIS_0208}|$RE{ISO_ASC}|$RE{JIS_KANA})
- ([^\e]*)
- )
- {
- my ($esc, $str) = ($1, $2);
- if ($esc !~ /$RE{ISO_ASC}/o) {
- $str =~ tr/\x21-\x7e/\xa1-\xfe/;
- if ($esc =~ /$RE{JIS_KANA}/o) {
- $str =~ s/([\xa1-\xdf])/\x8e$1/og;
- }
- elsif ($esc =~ /$RE{JIS_0212}/o) {
- $str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
- }
- }
- $str;
- }geox;
- $$r_str;
-}
-
-sub euc_jis{
- my $r_str = shift;
- $$r_str =~ s{
- ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+)
- }{
- my $str = $1;
- my $esc =
- ( $str =~ tr/\x8E//d ) ? $ESC{KANA} :
- ( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} :
- $ESC{JIS_0208};
- $str =~ tr/\xA1-\xFE/\x21-\x7E/;
- $esc . $str . $ESC{ASC};
- }geox;
- $$r_str =~
- s/\Q$ESC{ASC}\E
- (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox;
- $$r_str;
-}
-
-sub euc_jis_nox0212{
- my $r_str = shift;
- $$r_str =~ s/$RE{EUC_0212}/$CHARCODE{UNDEF_EUC}/go;
- euc_jis($r_str);
-}
-
-1;
-__END__
-
-
-=head1 NAME
-
-Encode::JP::JIS -- internally used by Encode::JP
-
-=cut
--- /dev/null
+package Encode::JP::JIS7;
+use strict;
+
+our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+
+require Encode;
+for my $name ('7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1'){
+ my $h2z = ($name eq '7bit-jis') ? 0 : 1;
+ my $jis0212 = ($name eq 'iso-2022-jp') ? 0 : 1;
+
+ $Encode::Encoding{$name} =
+ bless {
+ Name => $name,
+ h2z => $h2z,
+ jis0212 => $jis0212,
+ } => __PACKAGE__;
+}
+
+sub name { shift->{'Name'} }
+sub new_sequence { $_[0] };
+
+use Encode::CJKConstants qw(:all);
+
+#
+# decode is identical for all 2022 variants
+#
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ jis_euc(\$str);
+ return Encode::decode('euc-jp', $str, $chk);
+}
+
+#
+# encode is different
+#
+
+sub encode
+{
+ require Encode::JP::H2Z;
+ my ($obj,$str,$chk) = @_;
+ my ($h2z, $jis0212) = @$obj{qw(h2z jis0212)};
+ my $result = Encode::encode('euc-jp', $str, $chk);
+ $h2z and &Encode::JP::H2Z::h2z(\$result);
+ euc_jis(\$result, $jis0212);
+ return $result;
+}
+
+
+# JIS<->EUC
+
+sub jis_euc {
+ my $r_str = shift;
+ $$r_str =~ s(
+ ($RE{JIS_0212}|$RE{JIS_0208}|$RE{ISO_ASC}|$RE{JIS_KANA})
+ ([^\e]*)
+ )
+ {
+ my ($esc, $str) = ($1, $2);
+ if ($esc !~ /$RE{ISO_ASC}/o) {
+ $str =~ tr/\x21-\x7e/\xa1-\xfe/;
+ if ($esc =~ /$RE{JIS_KANA}/o) {
+ $str =~ s/([\xa1-\xdf])/\x8e$1/og;
+ }
+ elsif ($esc =~ /$RE{JIS_0212}/o) {
+ $str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
+ }
+ }
+ $str;
+ }geox;
+ $$r_str;
+}
+
+sub euc_jis{
+ my $r_str = shift;
+ my $jis0212 = shift;
+ $$r_str =~ s{
+ ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+)
+ }{
+ my $str = $1;
+ my $esc =
+ ( $str =~ tr/\x8E//d ) ? $ESC{KANA} :
+ ( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} :
+ $ESC{JIS_0208};
+ if ($esc eq $ESC{JIS_0212} && !$jis0212){
+ # fallback to '?'
+ $str =~ tr/\xA1-\xFE/\x3F/;
+ }else{
+ $str =~ tr/\xA1-\xFE/\x21-\x7E/;
+ }
+ $esc . $str . $ESC{ASC};
+ }geox;
+ $$r_str =~
+ s/\Q$ESC{ASC}\E
+ (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox;
+ $$r_str;
+}
+
+1;
+__END__
+
+
+=head1 NAME
+
+Encode::JP::JIS7 -- internally used by Encode::JP
+
+=cut
use strict;
use warnings;
-our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
#
# Aux. subs & constants
}
sub name { shift->{'Name'} }
-sub new_sequence { $_[0] };
+sub new_sequence
+{
+ my $self = shift;
+ # Return the original if endian known
+ return $self if ($self->{endian});
+ # Return a clone
+ return bless {%$self},ref($self);
+}
+
#
-# two implementation of (en|de)code exist. *_modern use
-# an array and *_classic stick with substr. *_classic is much
-# slower but more memory conservative. *_modern is default.
+# three implementation of (en|de)code exist. XS version is the fastest.
+# *_modern use # an array and *_classic stick with substr. *_classic is
+# much slower but more memory conservative. *_xs is default.
sub set_transcoder{
no warnings qw(redefine);
my $type = shift;
- if ($type eq "modern"){
+ if ($type eq "xs"){
+ *decode = \&decode_xs;
+ *encode = \&encode_xs;
+ }elsif($type eq "modern"){
*decode = \&decode_modern;
*encode = \&encode_modern;
}elsif($type eq "classic"){
*encode = \&encode_classic;
}else{
require Carp;
- Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
+ Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
}
}
-set_transcoder("modern");
+set_transcoder("xs");
#
# *_modern are much faster but guzzle more memory
#
-sub decode_modern
+sub decode_modern($$;$)
{
my ($obj, $str, $chk ) = @_;
my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
return $utf8;
}
-sub encode_modern
+sub encode_modern($$;$)
{
my ($obj, $utf8, $chk) = @_;
my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
# *_classic are slower but more memory conservative
#
-sub decode_classic
+sub decode_classic($$;$)
{
my ($obj, $str, $chk ) = @_;
my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
return $utf8;
}
-sub encode_classic
+sub encode_classic($$;$)
{
my ($obj, $utf8, $chk) = @_;
my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
#
-# $Id: Unicode.t,v 1.6 2002/04/12 20:23:05 dankogai Exp dankogai $
+# $Id: Unicode.t,v 1.7 2002/04/14 22:05:20 dankogai Exp $
#
# This script is written entirely in ASCII, even though quoted literals
# do include non-BMP unicode characters -- Are you happy, jhi?
#
+our $ON_EBCDIC;
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
# print "1..0 # Skip: PerlIO was not built\n";
# exit 0;
# }
+
# should work on EBCDIC
# if (ord("A") == 193) {
# print "1..0 # Skip: EBCDIC\n";
# exit 0;
# }
+ $ON_EBCDIC = (ord("A") == 193) || $ARGV[0];
$| = 1;
}
use strict;
#use Test::More 'no_plan';
-use Test::More tests => 22;
+use Test::More tests => 30;
use Encode qw(encode decode);
#
is(decode('UCS-2BE', $n_16be), $fallback, "decode UCS-2BE: fallback");
is(decode('UCS-2LE', $n_16le), $fallback, "decode UCS-2LE: fallback");
eval { decode('UCS-2BE', $n_16be, 1) };
-ok($@=~/^UCS-2BE:/, "decode UCS-2BE: exception");
-eval { decode('UCS-2LE', $n_16le, 1) };
-ok($@=~/^UCS-2LE:/, "decode UCS-2LE: exception");
+is (index($@,'UCS-2BE:'), 0, "decode UCS-2BE: exception");
+eval { decode('UCS-2LE', $n_16le, 1) };
+is (index($@,'UCS-2LE:'), 0, "decode UCS-2LE: exception");
is(encode('UCS-2BE', $nasty), $f_16be, "encode UCS-2BE: fallback");
is(encode('UCS-2LE', $nasty), $f_16le, "encode UCS-2LE: fallback");
eval { encode('UCS-2BE', $nasty, 1) };
-ok($@=~/^UCS-2BE:/, "encode UCS-2BE: exception");
+is(index($@, 'UCS-2BE'), 0, "encode UCS-2BE: exception");
eval { encode('UCS-2LE', $nasty, 1) };
-ok($@=~/^UCS-2LE:/, "encode UCS-2LE: exception");
+is(index($@, 'UCS-2LE'), 0, "encode UCS-2LE: exception");
+
+#
+# SvGROW test for (en|de)code_xs
+#
+SKIP: {
+ skip "Not on EBCDIC", 8 if $ON_EBCDIC;
+ my $utf8 = '';
+ for my $j (0,0x10){
+ for my $i (0..0xffff){
+ $j == 0 and (0xD800 <= $i && $i <= 0xDFFF) and next;
+ $utf8 .= ord($j+$i);
+ }
+ my $len = length($utf8);
+ for my $major ('UTF-16', 'UTF-32'){
+ for my $minor ('BE', 'LE'){
+ my $enc = $major.$minor;
+ is(decode($enc, encode($enc, $utf8)), $utf8, "$enc RT ($len)");
+ }
+ }
+ }
+};
+
1;
__END__
#
-# $Id: jperl.t,v 1.20 2002/04/04 19:50:52 dankogai Exp $
+# $Id: jperl.t,v 1.21 2002/04/14 22:05:20 dankogai Exp $
#
# This script is written in euc-jp
}
use strict;
-use Test::More tests => 15;
+use Test::More tests => 18;
my $Debug = shift;
no encoding; # ensure
ok(! defined(${^ENCODING}), q{out of black magic});
use bytes;
is (length($Namae), 10);
+
+#
+# now something completely different!
+#
+{
+ use encoding "euc-jp", Filter=>1;
+ ok(1, "Filter on");
+ use utf8;
+ no strict 'vars'; # fools
+ # doesn't work w/ "my" as of this writing.
+ # because of buggy strict.pm and utf8.pm
+ our $¿Í = 2;
+ # ^^U+4eba, "human" in CJK ideograph
+ $¿Í++; # a child is born
+ *people = \$¿Í;
+ is ($people, 3, "Filter:utf8 identifier");
+ no encoding;
+ ok(1, "Filter off");
+}
+
1;
__END__
for my $op (qw(encode decode)){
my ($meth, $from, $to) = ($op eq 'encode') ?
(\&encode, 'utf8', 'utf16') : (\&decode, 'utf16', 'utf8');
+ my $XS = sub {
+ Encode::Unicode::set_transcoder("xs");
+ $meth->('UTF-16BE', $S{$from}{$sz}{$cp})
+ eq $S{$to}{$sz}{$cp}
+ or die "$op,$from,$to,$sz,$cp";
+ };
my $modern = sub {
Encode::Unicode::set_transcoder("modern");
$meth->('UTF-16BE', $S{$from}{$sz}{$cp})
print "---- $op length=$sz/range=$cp ----\n";
my $r = timethese($count,
{
- "Modern" => $modern,
+ "XS" => $XS,
+ "Modern" => $modern,
"Classic" => $classic,
},
'none',