[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;
use strict;
our $VERSION = do { my @r = (q$Revision: 1.42 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw (
  decode
  decode_utf8
  encode
  encode_utf8
  encodings
  find_encoding
);

our @EXPORT_OK =
    qw(
       _utf8_off
       _utf8_on
       define_encoding
       from_to
       is_16bit
       is_8bit
       is_utf8
       resolve_alias
       utf8_downgrade
       utf8_upgrade
      );

bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

our $ON_EBCDIC = (ord("A") == 193);

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
our %ExtModule;
require Encode::Config;
eval { require Encode::ConfigLocal };

sub encodings
{
    my $class = shift;
    my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
    for my $mod (@modules){
	$mod =~ s,::,/,g or $mod = "Encode/$mod";
	$mod .= '.pm'; 
	$DEBUG and warn "about to require $mod;";
	eval { require $mod; };
    }
    my %modules = map {$_ => 1} @modules;
    return
	sort { lc $a cmp lc $b }
             grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
}

sub define_encoding
{
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias($lc => $obj) unless $lc eq $name;
    while (@_)
    {
	my $alias = shift;
	define_alias($alias,$obj);
    }
    return $obj;
}

sub getEncoding
{
    my ($class,$name,$skip_external) = @_;
    my $enc;
    if (ref($name) && $name->can('new_sequence'))
    {
	return $name;
    }
    my $lc = lc $name;
    if (exists $Encoding{$name})
    {
	return $Encoding{$name};
    }
    if (exists $Encoding{$lc})
    {
	return $Encoding{$lc};
    }

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
	if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
	    $mod =~ s,::,/,g ; $mod .= '.pm';
	    eval{ require $mod; };
	    return $Encoding{$name} if exists $Encoding{$name};
	}
    }
    return;
}

sub find_encoding
{
    my ($name,$skip_external) = @_;
    return __PACKAGE__->getEncoding($name,$skip_external);
}

sub resolve_alias {
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}

sub encode($$;$)
{
    my ($name,$string,$check) = @_;
    $check ||=0;
    my $enc = find_encoding($name);
    croak("Unknown encoding '$name'") unless defined $enc;
    my $octets = $enc->encode($string,$check);
    return undef if ($check && length($string));
    return $octets;
}

sub decode($$;$)
{
    my ($name,$octets,$check) = @_;
    $check ||=0;
    my $enc = find_encoding($name);
    croak("Unknown encoding '$name'") unless defined $enc;
    my $string = $enc->decode($octets,$check);
    $_[1] = $octets if $check;
    return $string;
}

sub from_to($$$;$)
{
    my ($string,$from,$to,$check) = @_;
    $check ||=0;
    my $f = find_encoding($from);
    croak("Unknown encoding '$from'") unless defined $f;
    my $t = find_encoding($to);
    croak("Unknown encoding '$to'") unless defined $t;
    my $uni = $f->decode($string,$check);
    return undef if ($check && length($string));
    $string =  $t->encode($uni,$check);
    return undef if ($check && length($uni));
    return defined($_[0] = $string) ? length($string) : undef ;
}

sub encode_utf8($)
{
    my ($str) = @_;
    utf8::encode($str);
    return $str;
}

sub decode_utf8($)
{
    my ($str) = @_;
    return undef unless utf8::decode($str);
    return $str;
}

predefine_encodings();

#
# This is to restore %Encoding if really needed;
#
sub predefine_encodings{
    if ($ON_EBCDIC) { 
	# was in Encode::UTF_EBCDIC
	package Encode::UTF_EBCDIC;
	*name         = sub{ shift->{'Name'} };
	*new_sequence = sub{ return $_[0] };
	*decode = sub{
	    my ($obj,$str,$chk) = @_;
	    my $res = '';
	    for (my $i = 0; $i < length($str); $i++) {
		$res .= 
		    chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
	    }
	    $_[1] = '' if $chk;
	    return $res;
	};
	*encode = sub{
	    my ($obj,$str,$chk) = @_;
	    my $res = '';
	    for (my $i = 0; $i < length($str); $i++) {
		$res .= 
		    chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
	    }
	    $_[1] = '' if $chk;
	    return $res;
	};
	$Encode::Encoding{Unicode} = 
	    bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {  
	# was in Encode::UTF_EBCDIC
	package Encode::Internal;
	*name         = sub{ shift->{'Name'} };
	*new_sequence = sub{ return $_[0] };
	*decode = sub{
	    my ($obj,$str,$chk) = @_;
	    utf8::upgrade($str);
	    $_[1] = '' if $chk;
	    return $str;
	};
	*encode = \&decode;
	$Encode::Encoding{Unicode} = 
	    bless {Name => "Internal"} => "Encode::Internal";
    }

    {
	# was in Encode::utf8
	package Encode::utf8;
	*name         = sub{ shift->{'Name'} };
	*new_sequence = sub{ return $_[0] };
	*decode = sub{
	    my ($obj,$octets,$chk) = @_;
	    my $str = Encode::decode_utf8($octets);
	    if (defined $str) {
		$_[1] = '' if $chk;
		return $str;
	    }
	    return undef;
	};
	*encode = sub {
	    my ($obj,$string,$chk) = @_;
	    my $octets = Encode::encode_utf8($string);
	    $_[1] = '' if $chk;
	    return $octets;
	};
	$Encode::Encoding{utf8} = 
	    bless {Name => "utf8"} => "Encode::utf8";
    }
}

require Encode::Encoding;

eval { require PerlIO::encoding };

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;


=head2 Table of Contents

Encode consists of a collection of modules which details are too big 
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details, 
see the PODs below;

  Name			        Description
  --------------------------------------------------------
  Encode::Alias         Alias defintions to encodings
  Encode::Encoding      Encode Implementation Base Class
  Encode::Supported     List of Supported Encodings
  Encode::CN            Simplified Chinese Encodings
  Encode::JP            Japanese Encodings
  Encode::KR            Korean Encodings
  Encode::TW            Traditional Chinese Encodings
  --------------------------------------------------------

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of
B<characters>.

The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
values of the characters (as returned by C<ord(ch)>) is the "Unicode
codepoint" for the character (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).

Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machines representation of
numbers, pixels in an image - or just about anything.

When Perl is processing "binary data" the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
byte has 256 possible values it easily fits in Perl's much larger
"logical character".

=head2 TERMINOLOGY

=over 4

=item *

I<character>: a character in the range 0..(2**32-1) (or more).
(What Perl's strings are made of.)

=item *

I<byte>: a character in the range 0..255
(A special case of a Perl character.)

=item *

I<octet>: 8 bits of data, with ordinal values 0..255
(Term for bytes passed to or from a non-Perl context, e.g. disk file.)

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head1 PERL ENCODING API

=over 4

=item $octets  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  ENCODING can be either a canonical name or
alias.  For encoding names and aliases, see L</"Defining Aliases">.
For CHECK see L</"Handling Malformed Data">.

For example to convert (internally UTF-8 encoded) Unicode string to
iso-8859-1 (also known as Latin1), 

  $octets = encode("iso-8859-1", $unicode);

=item $string = decode(ENCODING, $octets[, CHECK])

Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  as in encode(),
ENCODING can be either a canonical name or alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

  $utf8 = decode("iso-8859-1", $latin1);

=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])

Convert B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.
For encoding names and aliases, see L</"Defining Aliases">. 
For CHECK see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

from_to() return the length of the converted string on success, undef
otherwise.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item $octets = encode_utf8($string);

The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item $string = decode_utf8($octets [, CHECK]);

The sequence of octets represented by $octets is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of specific module.

  @with_jp = Encode->encodings("Encode::JP");

When "::" is not in the name, "Encode::" is assumed.

  @ebcdic = Encode->encodings("EBCDIC");

To find which encodings are supported by this package in details, 
see L<Encode::Supported>.

=head2 Defining Aliases

To add new alias to a given encoding,  Use;

  use Encode;
  use Encode::Alias;
  define_alias(newName => ENCODING);

After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>

But before you do so, make sure the alias is nonexistent with
C<resolve_alias()>, which returns the canonical name thereof.
i.e.

  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
  Encode::resolve_alias($name) eq $name  # true if $name is canonical

This resolve_alias() does not need C<use Encode::Alias> and is 
exported via C<use encode qw(resolve_alias)>.

See L<Encode::Alias> on details.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($illiad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open is layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the programs responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)")
    open(my $g, ">:encoding(iso-8859-2)")
    open(my $h, ">:encoding(latin9)")       # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Handling Malformed Data

If I<CHECK> is not set, (en|de)code will put I<substitution character> in
place of the malformed character.  for UCM-based encodings,
E<lt>subcharE<gt> will be used.  For Unicode, \xFFFD is used.  If the
data is supposed to be UTF-8, an optional lexical warning (category
utf8) is given. 

If I<CHECK> is true but not a code reference, dies with an error message.

In future you will be able to use a code reference to a callback
function for the value of I<CHECK> but its API is still undecided.

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_alias);
    define_encoding($object, 'canonicalName' [, alias...]);

I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

See L<Encode::Encoding> for more details.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<PerlIO>, 
L<encoding>,
L<perlebcdic>, 
L<perlfunc/open>, 
L<perlunicode>, 
L<utf8>, 
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

head2 MAINTAINER

This project was originated by Nick Ing-Simmons and later maintained
by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for full list
of people involved.  For any questions, use
E<lt>perl-unicode@perl.orgE<gt> so others can share.

=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
b2704119	3	our $VERSION = do { my @r = (q$Revision: 1.42 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c	4	our $DEBUG = 0;
2c674647	5
	6	require DynaLoader;
	7	require Exporter;
	8
51ef4e11	9	our @ISA = qw(Exporter DynaLoader);
2c674647	10
4411f3b6	11	# Public, encouraged API is exported by default
51ef4e11	12	our @EXPORT = qw (
4411f3b6	13	decode
4411f3b6	14	decode_utf8
fcb875d4	15	encode
fcb875d4	16	encode_utf8
51ef4e11	17	encodings
fcb875d4	18	find_encoding
4411f3b6	19	);
4411f3b6	20
51ef4e11	21	our @EXPORT_OK =
2c674647	22	qw(
fcb875d4	23	_utf8_off
fcb875d4	24	_utf8_on
51ef4e11	25	define_encoding
2c674647	26	from_to
4411f3b6	27	is_16bit
fcb875d4	28	is_8bit
	29	is_utf8
	30	resolve_alias
a12c0f56	31	utf8_downgrade
fcb875d4	32	utf8_upgrade
2c674647	33	);
	34
	35	bootstrap Encode ();
	36
4411f3b6	37	# Documentation moved after __END__ for speed - NI-S
2c674647	38
bf230f3d	39	use Carp;
bf230f3d	40
a63c962f	41	our $ON_EBCDIC = (ord("A") == 193);
f2a2953c	42
5d030b67	43	use Encode::Alias;
5d030b67	44
5129552c	45	# Make a %Encoding package variable to allow a certain amount of cheating
5129552c	46	our %Encoding;
aae85ceb	47	our %ExtModule;
	48	require Encode::Config;
	49	eval { require Encode::ConfigLocal };
5129552c	50
656753f8	51	sub encodings
656753f8	52	{
5129552c	53	my $class = shift;
071db25d	54	my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
c731e18e	55	for my $mod (@modules){
	56	$mod =~ s,::,/,g or $mod = "Encode/$mod";
	57	$mod .= '.pm';
	58	$DEBUG and warn "about to require $mod;";
	59	eval { require $mod; };
5129552c	60	}
c731e18e	61	my %modules = map {$_ => 1} @modules;
5129552c	62	return
ce912cd4	63	sort { lc $a cmp lc $b }
ce912cd4	64	grep {!/^(?:Internal\|Unicode)$/o} keys %Encoding;
51ef4e11	65	}
51ef4e11	66
51ef4e11	67	sub define_encoding
51ef4e11	68	{
18586f54	69	my $obj = shift;
18586f54	70	my $name = shift;
5129552c	71	$Encoding{$name} = $obj;
18586f54	72	my $lc = lc($name);
	73	define_alias($lc => $obj) unless $lc eq $name;
	74	while (@_)
	75	{
	76	my $alias = shift;
	77	define_alias($alias,$obj);
	78	}
	79	return $obj;
656753f8	80	}
656753f8	81
656753f8	82	sub getEncoding
656753f8	83	{
dd9703c9	84	my ($class,$name,$skip_external) = @_;
18586f54	85	my $enc;
	86	if (ref($name) && $name->can('new_sequence'))
	87	{
	88	return $name;
	89	}
	90	my $lc = lc $name;
5129552c	91	if (exists $Encoding{$name})
18586f54	92	{
5129552c	93	return $Encoding{$name};
18586f54	94	}
5129552c	95	if (exists $Encoding{$lc})
18586f54	96	{
5129552c	97	return $Encoding{$lc};
18586f54	98	}
c50d192e	99
5129552c	100	my $oc = $class->find_alias($name);
c50d192e	101	return $oc if defined $oc;
c50d192e	102
5129552c	103	$oc = $class->find_alias($lc) if $lc ne $name;
c50d192e	104	return $oc if defined $oc;
c50d192e	105
c731e18e	106	unless ($skip_external)
d1ed7747	107	{
c731e18e	108	if (my $mod = $ExtModule{$name} \|\| $ExtModule{$lc}){
	109	$mod =~ s,::,/,g ; $mod .= '.pm';
	110	eval{ require $mod; };
	111	return $Encoding{$name} if exists $Encoding{$name};
	112	}
d1ed7747	113	}
18586f54	114	return;
656753f8	115	}
656753f8	116
4411f3b6	117	sub find_encoding
4411f3b6	118	{
dd9703c9	119	my ($name,$skip_external) = @_;
dd9703c9	120	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	121	}
4411f3b6	122
fcb875d4	123	sub resolve_alias {
	124	my $obj = find_encoding(shift);
	125	defined $obj and return $obj->name;
	126	return;
	127	}
	128
b2704119	129	sub encode($$;$)
4411f3b6	130	{
18586f54	131	my ($name,$string,$check) = @_;
b2704119	132	$check \|\|=0;
18586f54	133	my $enc = find_encoding($name);
	134	croak("Unknown encoding '$name'") unless defined $enc;
	135	my $octets = $enc->encode($string,$check);
	136	return undef if ($check && length($string));
	137	return $octets;
4411f3b6	138	}
4411f3b6	139
b2704119	140	sub decode($$;$)
4411f3b6	141	{
18586f54	142	my ($name,$octets,$check) = @_;
b2704119	143	$check \|\|=0;
18586f54	144	my $enc = find_encoding($name);
	145	croak("Unknown encoding '$name'") unless defined $enc;
	146	my $string = $enc->decode($octets,$check);
	147	$_[1] = $octets if $check;
	148	return $string;
4411f3b6	149	}
4411f3b6	150
b2704119	151	sub from_to($$$;$)
4411f3b6	152	{
18586f54	153	my ($string,$from,$to,$check) = @_;
b2704119	154	$check \|\|=0;
18586f54	155	my $f = find_encoding($from);
	156	croak("Unknown encoding '$from'") unless defined $f;
	157	my $t = find_encoding($to);
	158	croak("Unknown encoding '$to'") unless defined $t;
	159	my $uni = $f->decode($string,$check);
	160	return undef if ($check && length($string));
a999c27c	161	$string = $t->encode($uni,$check);
18586f54	162	return undef if ($check && length($uni));
3ef515df	163	return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6	164	}
4411f3b6	165
b2704119	166	sub encode_utf8($)
4411f3b6	167	{
18586f54	168	my ($str) = @_;
c731e18e	169	utf8::encode($str);
18586f54	170	return $str;
4411f3b6	171	}
4411f3b6	172
b2704119	173	sub decode_utf8($)
4411f3b6	174	{
18586f54	175	my ($str) = @_;
	176	return undef unless utf8::decode($str);
	177	return $str;
5ad8ef52	178	}
5ad8ef52	179
f2a2953c	180	predefine_encodings();
	181
	182	#
	183	# This is to restore %Encoding if really needed;
	184	#
	185	sub predefine_encodings{
	186	if ($ON_EBCDIC) {
	187	# was in Encode::UTF_EBCDIC
	188	package Encode::UTF_EBCDIC;
	189	*name = sub{ shift->{'Name'} };
	190	*new_sequence = sub{ return $_[0] };
	191	*decode = sub{
	192	my ($obj,$str,$chk) = @_;
	193	my $res = '';
	194	for (my $i = 0; $i < length($str); $i++) {
	195	$res .=
	196	chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
	197	}
	198	$_[1] = '' if $chk;
	199	return $res;
	200	};
	201	*encode = sub{
	202	my ($obj,$str,$chk) = @_;
	203	my $res = '';
	204	for (my $i = 0; $i < length($str); $i++) {
	205	$res .=
	206	chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
	207	}
	208	$_[1] = '' if $chk;
	209	return $res;
	210	};
77ea6967	211	$Encode::Encoding{Unicode} =
c731e18e	212	bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c	213	} else {
	214	# was in Encode::UTF_EBCDIC
	215	package Encode::Internal;
	216	*name = sub{ shift->{'Name'} };
	217	*new_sequence = sub{ return $_[0] };
	218	*decode = sub{
	219	my ($obj,$str,$chk) = @_;
	220	utf8::upgrade($str);
	221	$_[1] = '' if $chk;
	222	return $str;
	223	};
	224	*encode = \&decode;
	225	$Encode::Encoding{Unicode} =
c731e18e	226	bless {Name => "Internal"} => "Encode::Internal";
f2a2953c	227	}
	228
	229	{
	230	# was in Encode::utf8
	231	package Encode::utf8;
	232	*name = sub{ shift->{'Name'} };
	233	*new_sequence = sub{ return $_[0] };
	234	*decode = sub{
	235	my ($obj,$octets,$chk) = @_;
	236	my $str = Encode::decode_utf8($octets);
	237	if (defined $str) {
	238	$_[1] = '' if $chk;
	239	return $str;
	240	}
	241	return undef;
	242	};
	243	*encode = sub {
	244	my ($obj,$string,$chk) = @_;
	245	my $octets = Encode::encode_utf8($string);
	246	$_[1] = '' if $chk;
	247	return $octets;
	248	};
	249	$Encode::Encoding{utf8} =
c731e18e	250	bless {Name => "utf8"} => "Encode::utf8";
f2a2953c	251	}
f2a2953c	252	}
f2a2953c	253
18586f54	254	require Encode::Encoding;
b2704119	255
b2704119	256	eval { require PerlIO::encoding };
4411f3b6	257
656753f8	258	1;
656753f8	259
2a936312	260	__END__
2a936312	261
4411f3b6	262	=head1 NAME
	263
	264	Encode - character encodings
	265
	266	=head1 SYNOPSIS
	267
	268	use Encode;
	269
67d7b5ef	270
	271	=head2 Table of Contents
	272
	273	Encode consists of a collection of modules which details are too big
	274	to fit in one document. This POD itself explains the top-level APIs
	275	and general topics at a glance. For other topics and more details,
	276	see the PODs below;
	277
	278	Name Description
	279	--------------------------------------------------------
	280	Encode::Alias Alias defintions to encodings
	281	Encode::Encoding Encode Implementation Base Class
	282	Encode::Supported List of Supported Encodings
	283	Encode::CN Simplified Chinese Encodings
	284	Encode::JP Japanese Encodings
	285	Encode::KR Korean Encodings
	286	Encode::TW Traditional Chinese Encodings
	287	--------------------------------------------------------
	288
4411f3b6	289	=head1 DESCRIPTION
4411f3b6	290
47bfe92f	291	The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef	292	and the rest of the system. Perl strings are sequences of
	293	B<characters>.
	294
	295	The repertoire of characters that Perl can represent is at least that
	296	defined by the Unicode Consortium. On most platforms the ordinal
	297	values of the characters (as returned by C<ord(ch)>) is the "Unicode
	298	codepoint" for the character (the exceptions are those platforms where
	299	the legacy encoding is some variant of EBCDIC rather than a super-set
	300	of ASCII - see L<perlebcdic>).
	301
	302	Traditionally computer data has been moved around in 8-bit chunks
	303	often called "bytes". These chunks are also known as "octets" in
	304	networking standards. Perl is widely used to manipulate data of many
	305	types - not only strings of characters representing human or computer
	306	languages but also "binary" data being the machines representation of
	307	numbers, pixels in an image - or just about anything.
	308
	309	When Perl is processing "binary data" the programmer wants Perl to
	310	process "sequences of bytes". This is not a problem for Perl - as a
	311	byte has 256 possible values it easily fits in Perl's much larger
	312	"logical character".
	313
	314	=head2 TERMINOLOGY
4411f3b6	315
67d7b5ef	316	=over 4
21938dfa	317
67d7b5ef	318	=item *
	319
	320	I<character>: a character in the range 0..(2**32-1) (or more).
	321	(What Perl's strings are made of.)
	322
	323	=item *
	324
	325	I<byte>: a character in the range 0..255
	326	(A special case of a Perl character.)
	327
	328	=item *
	329
	330	I<octet>: 8 bits of data, with ordinal values 0..255
	331	(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
	332
	333	=back
4411f3b6	334
67d7b5ef	335	The marker [INTERNAL] marks Internal Implementation Details, in
	336	general meant only for those who think they know what they are doing,
	337	and such details may change in future releases.
	338
	339	=head1 PERL ENCODING API
4411f3b6	340
	341	=over 4
	342
f2a2953c	343	=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6	344
47bfe92f	345	Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef	346	a sequence of octets. ENCODING can be either a canonical name or
	347	alias. For encoding names and aliases, see L</"Defining Aliases">.
	348	For CHECK see L</"Handling Malformed Data">.
4411f3b6	349
67d7b5ef	350	For example to convert (internally UTF-8 encoded) Unicode string to
67d7b5ef	351	iso-8859-1 (also known as Latin1),
681a7c68	352
67d7b5ef	353	$octets = encode("iso-8859-1", $unicode);
681a7c68	354
f2a2953c	355	=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6	356
47bfe92f	357	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef	358	internal form and returns the resulting string. as in encode(),
	359	ENCODING can be either a canonical name or alias. For encoding names
	360	and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f	361	L</"Handling Malformed Data">.
47bfe92f	362
1b2c56c8	363	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	364
67d7b5ef	365	$utf8 = decode("iso-8859-1", $latin1);
681a7c68	366
f2a2953c	367	=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f	368
2b106fbe	369	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	370	in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef	371	encode() or through PerlIO: See L</"Encoding and IO">.
	372	For encoding names and aliases, see L</"Defining Aliases">.
	373	For CHECK see L</"Handling Malformed Data">.
2b106fbe	374
1b2c56c8	375	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	376
	377	from_to($data, "iso-8859-1", "utf-8");
	378
	379	and to convert it back:
	380
	381	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	382
ab97ca19	383	Note that because the conversion happens in place, the data to be
	384	converted cannot be a string constant, it must be a scalar variable.
	385
3ef515df	386	from_to() return the length of the converted string on success, undef
	387	otherwise.
	388
4411f3b6	389	=back
4411f3b6	390
f2a2953c	391	=head2 UTF-8 / utf8
	392
	393	The Unicode consortium defines the UTF-8 standard as a way of encoding
	394	the entire Unicode repertoire as sequences of octets. This encoding is
	395	expected to become very widespread. Perl can use this form internally
	396	to represent strings, so conversions to and from this form are
	397	particularly efficient (as octets in memory do not have to change,
	398	just the meta-data that tells Perl how to treat them).
	399
	400	=over 4
	401
	402	=item $octets = encode_utf8($string);
	403
	404	The characters that comprise string are encoded in Perl's superset of UTF-8
	405	and the resulting octets returned as a sequence of bytes. All possible
	406	characters have a UTF-8 representation so this function cannot fail.
	407
	408	=item $string = decode_utf8($octets [, CHECK]);
	409
	410	The sequence of octets represented by $octets is decoded from UTF-8
	411	into a sequence of logical characters. Not all sequences of octets
	412	form valid UTF-8 encodings, so it is possible for this call to fail.
	413	For CHECK see L</"Handling Malformed Data">.
	414
	415	=back
	416
51ef4e11	417	=head2 Listing available encodings
51ef4e11	418
5129552c	419	use Encode;
	420	@list = Encode->encodings();
	421
	422	Returns a list of the canonical names of the available encodings that
	423	are loaded. To get a list of all available encodings including the
	424	ones that are not loaded yet, say
	425
	426	@all_encodings = Encode->encodings(":all");
	427
	428	Or you can give the name of specific module.
	429
c731e18e	430	@with_jp = Encode->encodings("Encode::JP");
	431
	432	When "::" is not in the name, "Encode::" is assumed.
51ef4e11	433
c731e18e	434	@ebcdic = Encode->encodings("EBCDIC");
5d030b67	435
a63c962f	436	To find which encodings are supported by this package in details,
5d030b67	437	see L<Encode::Supported>.
51ef4e11	438
	439	=head2 Defining Aliases
	440
67d7b5ef	441	To add new alias to a given encoding, Use;
67d7b5ef	442
5129552c	443	use Encode;
5129552c	444	use Encode::Alias;
a63c962f	445	define_alias(newName => ENCODING);
51ef4e11	446
3ef515df	447	After that, newName can be used as an alias for ENCODING.
f2a2953c	448	ENCODING may be either the name of an encoding or an
f2a2953c	449	I<encoding object>
51ef4e11	450
fcb875d4	451	But before you do so, make sure the alias is nonexistent with
	452	C<resolve_alias()>, which returns the canonical name thereof.
	453	i.e.
	454
	455	Encode::resolve_alias("latin1") eq "iso-8859-1" # true
	456	Encode::resolve_alias("iso-8859-12") # false; nonexistent
	457	Encode::resolve_alias($name) eq $name # true if $name is canonical
	458
	459	This resolve_alias() does not need C<use Encode::Alias> and is
	460	exported via C<use encode qw(resolve_alias)>.
	461
5d030b67	462	See L<Encode::Alias> on details.
51ef4e11	463
4411f3b6	464	=head1 Encoding and IO
	465
	466	It is very common to want to do encoding transformations when
	467	reading or writing files, network connections, pipes etc.
47bfe92f	468	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	469	C<Encode> provides a "layer" (See L<perliol>) which can transform
	470	data as it is read or written.
	471
8e86646e	472	Here is how the blind poet would modernise the encoding:
8e86646e	473
42234700	474	use Encode;
8e86646e	475	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	476	open(my $utf8,'>:utf8','iliad.utf8');
	477	my @epic = <$iliad>;
	478	print $utf8 @epic;
	479	close($utf8);
	480	close($illiad);
4411f3b6	481
	482	In addition the new IO system can also be configured to read/write
	483	UTF-8 encoded characters (as noted above this is efficient):
	484
e9692b5b	485	open(my $fh,'>:utf8','anything');
e9692b5b	486	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	487
	488	Either of the above forms of "layer" specifications can be made the default
	489	for a lexical scope with the C<use open ...> pragma. See L<open>.
	490
	491	Once a handle is open is layers can be altered using C<binmode>.
	492
47bfe92f	493	Without any such configuration, or if Perl itself is built using
4411f3b6	494	system's own IO, then write operations assume that file handle accepts
	495	only I<bytes> and will C<die> if a character larger than 255 is
	496	written to the handle. When reading, each octet from the handle
	497	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	498	behaviour as bytes-only languages (including Perl before v5.6) would
	499	have, and is sufficient to handle native 8-bit encodings
	500	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	501	other encodings and binary data.
	502
	503	In other cases it is the programs responsibility to transform
	504	characters into bytes using the API above before doing writes, and to
	505	transform the bytes read from a handle into characters before doing
	506	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	507
47bfe92f	508	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	509	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	510	(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
47bfe92f	511
e9692b5b	512	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	513	open(G, ">:utf8", "data.utf") or die $!;
	514	while (<F>) { print G }
	515
	516	# Could also do "print G <F>" but that would pull
	517	# the whole file into memory just to write it out again.
	518
	519	More examples:
47bfe92f	520
e9692b5b	521	open(my $f, "<:encoding(cp1252)")
	522	open(my $g, ">:encoding(iso-8859-2)")
	523	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	524
47bfe92f	525	See L<PerlIO> for more information.
4411f3b6	526
1768d7eb	527	See also L<encoding> for how to change the default encoding of the
d521382b	528	data in your script.
1768d7eb	529
67d7b5ef	530	=head1 Handling Malformed Data
67d7b5ef	531
f2a2953c	532	If I<CHECK> is not set, (en\|de)code will put I<substitution character> in
	533	place of the malformed character. for UCM-based encodings,
	534	E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
	535	data is supposed to be UTF-8, an optional lexical warning (category
	536	utf8) is given.
67d7b5ef	537
f2a2953c	538	If I<CHECK> is true but not a code reference, dies with an error message.
67d7b5ef	539
f2a2953c	540	In future you will be able to use a code reference to a callback
f2a2953c	541	function for the value of I<CHECK> but its API is still undecided.
67d7b5ef	542
	543	=head1 Defining Encodings
	544
	545	To define a new encoding, use:
	546
	547	use Encode qw(define_alias);
	548	define_encoding($object, 'canonicalName' [, alias...]);
	549
	550	I<canonicalName> will be associated with I<$object>. The object
	551	should provide the interface described in L<Encode::Encoding>
	552	If more than two arguments are provided then additional
	553	arguments are taken as aliases for I<$object> as for C<define_alias>.
	554
f2a2953c	555	See L<Encode::Encoding> for more details.
f2a2953c	556
4411f3b6	557	=head1 Messing with Perl's Internals
4411f3b6	558
47bfe92f	559	The following API uses parts of Perl's internals in the current
47bfe92f	560	implementation. As such they are efficient, but may change.
4411f3b6	561
	562	=over 4
	563
a63c962f	564	=item is_utf8(STRING [, CHECK])
4411f3b6	565
4411f3b6	566	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	567	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	568	UTF-8. Returns true if successful, false otherwise.
4411f3b6	569
a63c962f	570	=item _utf8_on(STRING)
4411f3b6	571
	572	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	573	B<not> checked for being well-formed UTF-8. Do not use unless you
	574	B<know> that the STRING is well-formed UTF-8. Returns the previous
	575	state of the UTF-8 flag (so please don't test the return value as
	576	I<not> success or failure), or C<undef> if STRING is not a string.
	577
a63c962f	578	=item _utf8_off(STRING)
4411f3b6	579
	580	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	581	Returns the previous state of the UTF-8 flag (so please don't test the
	582	return value as I<not> success or failure), or C<undef> if STRING is
	583	not a string.
	584
	585	=back
	586
	587	=head1 SEE ALSO
	588
5d030b67	589	L<Encode::Encoding>,
	590	L<Encode::Supported>,
	591	L<PerlIO>,
	592	L<encoding>,
	593	L<perlebcdic>,
	594	L<perlfunc/open>,
	595	L<perlunicode>,
	596	L<utf8>,
	597	the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6	598
aae85ceb	599	head2 MAINTAINER
	600
	601	This project was originated by Nick Ing-Simmons and later maintained
	602	by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for full list
	603	of people involved. For any questions, use
	604	E<lt>perl-unicode@perl.orgE<gt> so others can share.
	605
4411f3b6	606	=cut