[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;
use strict;
# Derive the module version from the RCS "Revision" keyword: every run of
# digits is extracted, the first is printed as-is and each following one is
# zero-padded to two digits (so "Revision: 1.40" becomes "1.40").
our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# When true, encodings() warns before each module it is about to require.
our $DEBUG = 0;

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw (
  decode
  decode_utf8
  encode
  encode_utf8
  encodings
  find_encoding
);

our @EXPORT_OK =
    qw(
       _utf8_off
       _utf8_on
       define_encoding
       from_to
       is_16bit
       is_8bit
       is_utf8
       resolve_alias
       utf8_downgrade
       utf8_upgrade
      );

bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# True when the native character set is EBCDIC, where "A" has ordinal 193
# (0xC1) instead of the ASCII value 65.  predefine_encodings() uses this to
# decide which built-in "Unicode" encoding object to install.
our $ON_EBCDIC = (ord("A") == 193);

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
# %Encoding maps an encoding name to its encoding object (filled in by
# define_encoding() and predefine_encodings()).
our %Encoding;
# %ExtModule maps an encoding name to the module that is required on demand
# to define it (see getEncoding() and encodings()).
our %ExtModule;
# NOTE(review): presumably Encode::Config is what populates %ExtModule --
# confirm against that module.
require Encode::Config;
# Optional site-local configuration; a missing or failing module is ignored.
eval { require Encode::ConfigLocal };

# List the names of the encodings currently registered in %Encoding.
#
# Called as a class method:
#   Encode->encodings()          - only what is already loaded
#   Encode->encodings(":all")    - first load every module in %ExtModule
#   Encode->encodings(@modules)  - first load the named modules; a name
#                                  without "::" is taken to be "Encode::NAME"
#
# Returns the names sorted case-insensitively, with the built-in "Internal"
# and "Unicode" entries left out.
sub encodings
{
    my $class = shift;
    # @modules is a copy, so rewriting the names below touches neither the
    # caller's arguments nor the values stored in %ExtModule.
    my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
    for my $mod (@modules){
	# Turn a module name into the path form that require() expects.
	$mod =~ s,::,/,g or $mod = "Encode/$mod";
	$mod .= '.pm';
	$DEBUG and warn "about to require $mod;";
	# Best effort: a module that cannot be loaded simply adds no encodings.
	eval { require $mod; };
    }
    return
	sort { lc $a cmp lc $b }
             grep {!/^(?:Internal|Unicode)$/} keys %Encoding;
}

# Register an encoding object.
#
#   define_encoding($obj, $name [, @aliases])
#
# $obj is stored in %Encoding under $name.  If $name is not already all
# lower case, its lower-cased form is defined as an alias, and so is every
# additional argument.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;
    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ($lower ne $name) {
	define_alias($lower => $obj);
    }
    foreach my $alias (@aliases) {
	define_alias($alias, $obj);
    }
    return $obj;
}

# Look up an encoding object by name.
#
#   $class->getEncoding($name [, $skip_external])
#
# $name may already be an encoding object (anything that can
# 'new_sequence'), in which case it is returned unchanged.  Otherwise the
# lookup order is: %Encoding under the exact name, %Encoding under the
# lower-cased name, the alias table for the exact name, the alias table for
# the lower-cased name and finally -- unless $skip_external is true -- the
# module registered for the name in %ExtModule is loaded and %Encoding is
# consulted again.
#
# Returns the encoding object, or nothing (empty list / undef) if the name
# cannot be resolved.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;
    if (ref($name) && $name->can('new_sequence'))
    {
	return $name;
    }
    my $lc = lc $name;
    if (exists $Encoding{$name})
    {
	return $Encoding{$name};
    }
    if (exists $Encoding{$lc})
    {
	return $Encoding{$lc};
    }

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
	if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
	    $mod =~ s,::,/,g ; $mod .= '.pm';
	    # Best effort: if the module fails to load, nothing is found below.
	    eval{ require $mod; };
	    # The module may have been located through either spelling, so the
	    # freshly defined encoding has to be looked up under both.  Checking
	    # only $name made the first lookup of e.g. an upper-cased name fail
	    # although an identical second lookup succeeded.
	    return $Encoding{$name} if exists $Encoding{$name};
	    return $Encoding{$lc}   if exists $Encoding{$lc};
	}
    }
    return;
}

# Functional front end to getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Returns whatever Encode->getEncoding() returns for the same arguments.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}

# Map an encoding name or alias to the name reported by its encoding object.
# Returns nothing when find_encoding() does not know the name.
sub resolve_alias {
    my ($alias) = @_;
    my $enc = find_encoding($alias);
    return unless defined $enc;
    return $enc->name;
}

# Encode a string into octets.
#
#   $octets = encode($name, $string [, $check])
#
# Croaks if $name is not a known encoding.  The encoding object receives the
# local copy of $string and may modify it in place (the built-in encoders
# empty it when $check is true).  If $check is true and that copy is still
# non-empty afterwards, undef is returned instead of the octets.
sub encode
{
    my ($name, $string, $check) = @_;
    my $codec = find_encoding($name);
    defined $codec or croak("Unknown encoding '$name'");

    my $octets = $codec->encode($string, $check);
    if ($check && length($string)) {
	return undef;
    }
    return $octets;
}

# Decode octets into a string.
#
#   $string = decode($name, $octets [, $check])
#
# Croaks if $name is not a known encoding.  When $check is true, whatever the
# encoding object left in its octets argument is written back to the
# caller's second argument.
sub decode
{
    my ($name, $octets, $check) = @_;
    my $codec = find_encoding($name);
    defined $codec or croak("Unknown encoding '$name'");

    my $string = $codec->decode($octets, $check);
    if ($check) {
	# $_[1] aliases the caller's variable.
	$_[1] = $octets;
    }
    return $string;
}

# Convert the caller's string in place from one encoding to another.
#
#   $length = from_to($string, $from, $to [, $check])
#
# Croaks if either encoding is unknown.  With a true $check, undef is
# returned (and the caller's string is left alone) as soon as the decoding or
# the encoding step leaves part of its input behind.  Otherwise the result is
# stored in the caller's first argument and its length is returned, or undef
# if the result is undefined.
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $uni = $source->decode($string, $check);
    if ($check && length($string)) {
	return undef;
    }
    $string = $target->encode($uni, $check);
    if ($check && length($uni)) {
	return undef;
    }

    # $_[0] aliases the caller's variable: this is the in-place update.
    $_[0] = $string;
    return defined($_[0]) ? length($string) : undef;
}

# Return the UTF-8 encoded octets of the given string.  The conversion is
# done on a copy, so the caller's string is not changed.
sub encode_utf8
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}

# Decode UTF-8 octets into a character string.  The conversion is done on a
# copy; undef is returned when utf8::decode() reports failure.
sub decode_utf8
{
    my $chars = shift;
    utf8::decode($chars) or return undef;
    return $chars;
}

predefine_encodings();

#
# Install the encoding objects that are implemented in this file instead of
# in a module of their own.  Calling it again restores those %Encoding
# entries if really needed.
#
# Every object is a blessed hash holding its name and offers name(),
# new_sequence() (which returns the object itself), decode() and encode().
# decode()/encode() take ($obj, $data, $check) and, on success with a true
# $check, empty the caller's $data argument through the $_[1] alias.
#
sub predefine_encodings{
    if ($ON_EBCDIC) {
	# EBCDIC platforms: "Unicode" converts character by character between
	# Unicode and native ordinals.
	package Encode::UTF_EBCDIC;
	*name         = sub{ my $self = shift; $self->{'Name'} };
	*new_sequence = sub{ my $self = shift; return $self };
	*decode = sub{
	    my ($obj,$str,$chk) = @_;
	    my $res = '';
	    for my $i (0 .. length($str) - 1) {
		my $ord = ord(substr($str,$i,1));
		$res .= chr(utf8::unicode_to_native($ord));
	    }
	    $_[1] = '' if $chk;
	    return $res;
	};
	*encode = sub{
	    my ($obj,$str,$chk) = @_;
	    my $res = '';
	    for my $i (0 .. length($str) - 1) {
		my $ord = ord(substr($str,$i,1));
		$res .= chr(utf8::native_to_unicode($ord));
	    }
	    $_[1] = '' if $chk;
	    return $res;
	};
	$Encode::Encoding{Unicode} =
	    bless({Name => "UTF_EBCDIC"}, "Encode::UTF_EBCDIC");
    } else {
	# Everywhere else: "Unicode" is Perl's internal form, so decoding and
	# encoding are the same operation (upgrade a copy and return it).
	package Encode::Internal;
	*name         = sub{ my $self = shift; $self->{'Name'} };
	*new_sequence = sub{ my $self = shift; return $self };
	my $upgrade = sub{
	    my ($obj,$str,$chk) = @_;
	    utf8::upgrade($str);
	    $_[1] = '' if $chk;
	    return $str;
	};
	*decode = $upgrade;
	*encode = $upgrade;
	$Encode::Encoding{Unicode} =
	    bless({Name => "Internal"}, "Encode::Internal");
    }

    {
	# "utf8" is a thin object wrapper around Encode::decode_utf8() and
	# Encode::encode_utf8().
	package Encode::utf8;
	*name         = sub{ my $self = shift; $self->{'Name'} };
	*new_sequence = sub{ my $self = shift; return $self };
	*decode = sub{
	    my ($obj,$octets,$chk) = @_;
	    my $str = Encode::decode_utf8($octets);
	    # Malformed input: report failure and leave the caller's data alone.
	    return undef unless defined $str;
	    $_[1] = '' if $chk;
	    return $str;
	};
	*encode = sub {
	    my ($obj,$string,$chk) = @_;
	    my $octets = Encode::encode_utf8($string);
	    $_[1] = '' if $chk;
	    return $octets;
	};
	$Encode::Encoding{utf8} =
	    bless({Name => "utf8"}, "Encode::utf8");
    }
}

require Encode::Encoding;
require Encode::XS;

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;


=head2 Table of Contents

Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:

  Name			        Description
  --------------------------------------------------------
  Encode::Alias         Alias definitions for encodings
  Encode::Encoding      Encode Implementation Base Class
  Encode::Supported     List of Supported Encodings
  Encode::CN            Simplified Chinese Encodings
  Encode::JP            Japanese Encodings
  Encode::KR            Korean Encodings
  Encode::TW            Traditional Chinese Encodings
  --------------------------------------------------------

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of
B<characters>.

The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
values of the characters (as returned by C<ord(ch)>) is the "Unicode
codepoint" for the character (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).

Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.

When Perl is processing "binary data" the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
byte has 256 possible values it easily fits in Perl's much larger
"logical character".

=head2 TERMINOLOGY

=over 4

=item *

I<character>: a character in the range 0..(2**32-1) (or more).
(What Perl's strings are made of.)

=item *

I<byte>: a character in the range 0..255
(A special case of a Perl character.)

=item *

I<octet>: 8 bits of data, with ordinal values 0..255
(Term for bytes passed to or from a non-Perl context, e.g. disk file.)

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head1 PERL ENCODING API

=over 4

=item $octets  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  ENCODING can be either a canonical name or
alias.  For encoding names and aliases, see L</"Defining Aliases">.
For CHECK see L</"Handling Malformed Data">.

For example to convert (internally UTF-8 encoded) Unicode string to
iso-8859-1 (also known as Latin1), 

  $octets = encode("iso-8859-1", $unicode);

=item $string = decode(ENCODING, $octets[, CHECK])

Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

  $utf8 = decode("iso-8859-1", $latin1);

=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])

Convert B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.
For encoding names and aliases, see L</"Defining Aliases">. 
For CHECK see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

from_to() returns the length of the converted string on success, undef
otherwise.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item $octets = encode_utf8($string);

The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item $string = decode_utf8($octets [, CHECK]);

The sequence of octets represented by $octets is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of a specific module.

  @with_jp = Encode->encodings("Encode::JP");

When "::" is not in the name, "Encode::" is assumed.

  @ebcdic = Encode->encodings("EBCDIC");

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.

=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(newName => ENCODING);

After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.

But before you do so, make sure the alias is nonexistent with
C<resolve_alias()>, which returns the canonical name thereof.
i.e.

  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
  Encode::resolve_alias($name) eq $name  # true if $name is canonical

This resolve_alias() does not need C<use Encode::Alias> and is
exported via C<use Encode qw(resolve_alias)>.

See L<Encode::Alias> for details.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open, its layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)")
    open(my $g, ">:encoding(iso-8859-2)")
    open(my $h, ">:encoding(latin9)")       # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Handling Malformed Data

If I<CHECK> is not set, (en|de)code will put a I<substitution character> in
place of the malformed character.  For UCM-based encodings,
E<lt>subcharE<gt> will be used.  For Unicode, \xFFFD is used.  If the
data is supposed to be UTF-8, an optional lexical warning (category
utf8) is given. 

If I<CHECK> is true but not a code reference, dies with an error message.

In future you will be able to use a code reference to a callback
function for the value of I<CHECK> but its API is still undecided.

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, 'canonicalName' [, alias...]);

I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

See L<Encode::Encoding> for more details.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<PerlIO>, 
L<encoding>,
L<perlebcdic>, 
L<perlfunc/open>, 
L<perlunicode>, 
L<utf8>, 
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=head2 MAINTAINER

This project was originated by Nick Ing-Simmons and later maintained
by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for full list
of people involved.  For any questions, use
E<lt>perl-unicode@perl.orgE<gt> so others can share.

=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
aae85ceb	3	our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c	4	our $DEBUG = 0;
2c674647	5
	6	require DynaLoader;
	7	require Exporter;
	8
51ef4e11	9	our @ISA = qw(Exporter DynaLoader);
2c674647	10
4411f3b6	11	# Public, encouraged API is exported by default
51ef4e11	12	our @EXPORT = qw (
4411f3b6	13	decode
4411f3b6	14	decode_utf8
fcb875d4	15	encode
fcb875d4	16	encode_utf8
51ef4e11	17	encodings
fcb875d4	18	find_encoding
4411f3b6	19	);
4411f3b6	20
51ef4e11	21	our @EXPORT_OK =
2c674647	22	qw(
fcb875d4	23	_utf8_off
fcb875d4	24	_utf8_on
51ef4e11	25	define_encoding
2c674647	26	from_to
4411f3b6	27	is_16bit
fcb875d4	28	is_8bit
	29	is_utf8
	30	resolve_alias
a12c0f56	31	utf8_downgrade
fcb875d4	32	utf8_upgrade
2c674647	33	);
	34
	35	bootstrap Encode ();
	36
4411f3b6	37	# Documentation moved after __END__ for speed - NI-S
2c674647	38
bf230f3d	39	use Carp;
bf230f3d	40
a63c962f	41	our $ON_EBCDIC = (ord("A") == 193);
f2a2953c	42
5d030b67	43	use Encode::Alias;
5d030b67	44
5129552c	45	# Make a %Encoding package variable to allow a certain amount of cheating
5129552c	46	our %Encoding;
aae85ceb	47	our %ExtModule;
	48	require Encode::Config;
	49	eval { require Encode::ConfigLocal };
5129552c	50
656753f8	51	sub encodings
656753f8	52	{
5129552c	53	my $class = shift;
071db25d	54	my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
c731e18e	55	for my $mod (@modules){
	56	$mod =~ s,::,/,g or $mod = "Encode/$mod";
	57	$mod .= '.pm';
	58	$DEBUG and warn "about to require $mod;";
	59	eval { require $mod; };
5129552c	60	}
c731e18e	61	my %modules = map {$_ => 1} @modules;
5129552c	62	return
ce912cd4	63	sort { lc $a cmp lc $b }
ce912cd4	64	grep {!/^(?:Internal\|Unicode)$/o} keys %Encoding;
51ef4e11	65	}
51ef4e11	66
51ef4e11	67	sub define_encoding
51ef4e11	68	{
18586f54	69	my $obj = shift;
18586f54	70	my $name = shift;
5129552c	71	$Encoding{$name} = $obj;
18586f54	72	my $lc = lc($name);
	73	define_alias($lc => $obj) unless $lc eq $name;
	74	while (@_)
	75	{
	76	my $alias = shift;
	77	define_alias($alias,$obj);
	78	}
	79	return $obj;
656753f8	80	}
656753f8	81
656753f8	82	sub getEncoding
656753f8	83	{
dd9703c9	84	my ($class,$name,$skip_external) = @_;
18586f54	85	my $enc;
	86	if (ref($name) && $name->can('new_sequence'))
	87	{
	88	return $name;
	89	}
	90	my $lc = lc $name;
5129552c	91	if (exists $Encoding{$name})
18586f54	92	{
5129552c	93	return $Encoding{$name};
18586f54	94	}
5129552c	95	if (exists $Encoding{$lc})
18586f54	96	{
5129552c	97	return $Encoding{$lc};
18586f54	98	}
c50d192e	99
5129552c	100	my $oc = $class->find_alias($name);
c50d192e	101	return $oc if defined $oc;
c50d192e	102
5129552c	103	$oc = $class->find_alias($lc) if $lc ne $name;
c50d192e	104	return $oc if defined $oc;
c50d192e	105
c731e18e	106	unless ($skip_external)
d1ed7747	107	{
c731e18e	108	if (my $mod = $ExtModule{$name} \|\| $ExtModule{$lc}){
	109	$mod =~ s,::,/,g ; $mod .= '.pm';
	110	eval{ require $mod; };
	111	return $Encoding{$name} if exists $Encoding{$name};
	112	}
d1ed7747	113	}
18586f54	114	return;
656753f8	115	}
656753f8	116
4411f3b6	117	sub find_encoding
4411f3b6	118	{
dd9703c9	119	my ($name,$skip_external) = @_;
dd9703c9	120	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	121	}
4411f3b6	122
fcb875d4	123	sub resolve_alias {
	124	my $obj = find_encoding(shift);
	125	defined $obj and return $obj->name;
	126	return;
	127	}
	128
4411f3b6	129	sub encode
4411f3b6	130	{
18586f54	131	my ($name,$string,$check) = @_;
	132	my $enc = find_encoding($name);
	133	croak("Unknown encoding '$name'") unless defined $enc;
	134	my $octets = $enc->encode($string,$check);
	135	return undef if ($check && length($string));
	136	return $octets;
4411f3b6	137	}
	138
	139	sub decode
	140	{
18586f54	141	my ($name,$octets,$check) = @_;
	142	my $enc = find_encoding($name);
	143	croak("Unknown encoding '$name'") unless defined $enc;
	144	my $string = $enc->decode($octets,$check);
	145	$_[1] = $octets if $check;
	146	return $string;
4411f3b6	147	}
	148
	149	sub from_to
	150	{
18586f54	151	my ($string,$from,$to,$check) = @_;
	152	my $f = find_encoding($from);
	153	croak("Unknown encoding '$from'") unless defined $f;
	154	my $t = find_encoding($to);
	155	croak("Unknown encoding '$to'") unless defined $t;
	156	my $uni = $f->decode($string,$check);
	157	return undef if ($check && length($string));
a999c27c	158	$string = $t->encode($uni,$check);
18586f54	159	return undef if ($check && length($uni));
3ef515df	160	return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6	161	}
	162
	163	sub encode_utf8
	164	{
18586f54	165	my ($str) = @_;
c731e18e	166	utf8::encode($str);
18586f54	167	return $str;
4411f3b6	168	}
	169
	170	sub decode_utf8
	171	{
18586f54	172	my ($str) = @_;
	173	return undef unless utf8::decode($str);
	174	return $str;
5ad8ef52	175	}
5ad8ef52	176
f2a2953c	177	predefine_encodings();
	178
	179	#
	180	# This is to restore %Encoding if really needed;
	181	#
	182	sub predefine_encodings{
	183	if ($ON_EBCDIC) {
	184	# was in Encode::UTF_EBCDIC
	185	package Encode::UTF_EBCDIC;
	186	*name = sub{ shift->{'Name'} };
	187	*new_sequence = sub{ return $_[0] };
	188	*decode = sub{
	189	my ($obj,$str,$chk) = @_;
	190	my $res = '';
	191	for (my $i = 0; $i < length($str); $i++) {
	192	$res .=
	193	chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
	194	}
	195	$_[1] = '' if $chk;
	196	return $res;
	197	};
	198	*encode = sub{
	199	my ($obj,$str,$chk) = @_;
	200	my $res = '';
	201	for (my $i = 0; $i < length($str); $i++) {
	202	$res .=
	203	chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
	204	}
	205	$_[1] = '' if $chk;
	206	return $res;
	207	};
77ea6967	208	$Encode::Encoding{Unicode} =
c731e18e	209	bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c	210	} else {
	211	# was in Encode::UTF_EBCDIC
	212	package Encode::Internal;
	213	*name = sub{ shift->{'Name'} };
	214	*new_sequence = sub{ return $_[0] };
	215	*decode = sub{
	216	my ($obj,$str,$chk) = @_;
	217	utf8::upgrade($str);
	218	$_[1] = '' if $chk;
	219	return $str;
	220	};
	221	*encode = \&decode;
	222	$Encode::Encoding{Unicode} =
c731e18e	223	bless {Name => "Internal"} => "Encode::Internal";
f2a2953c	224	}
	225
	226	{
	227	# was in Encode::utf8
	228	package Encode::utf8;
	229	*name = sub{ shift->{'Name'} };
	230	*new_sequence = sub{ return $_[0] };
	231	*decode = sub{
	232	my ($obj,$octets,$chk) = @_;
	233	my $str = Encode::decode_utf8($octets);
	234	if (defined $str) {
	235	$_[1] = '' if $chk;
	236	return $str;
	237	}
	238	return undef;
	239	};
	240	*encode = sub {
	241	my ($obj,$string,$chk) = @_;
	242	my $octets = Encode::encode_utf8($string);
	243	$_[1] = '' if $chk;
	244	return $octets;
	245	};
	246	$Encode::Encoding{utf8} =
c731e18e	247	bless {Name => "utf8"} => "Encode::utf8";
f2a2953c	248	}
f2a2953c	249	}
f2a2953c	250
18586f54	251	require Encode::Encoding;
18586f54	252	require Encode::XS;
4411f3b6	253
656753f8	254	1;
656753f8	255
2a936312	256	__END__
2a936312	257
4411f3b6	258	=head1 NAME
	259
	260	Encode - character encodings
	261
	262	=head1 SYNOPSIS
	263
	264	use Encode;
	265
67d7b5ef	266
	267	=head2 Table of Contents
	268
	269	Encode consists of a collection of modules which details are too big
	270	to fit in one document. This POD itself explains the top-level APIs
	271	and general topics at a glance. For other topics and more details,
	272	see the PODs below;
	273
	274	Name Description
	275	--------------------------------------------------------
	276	Encode::Alias Alias defintions to encodings
	277	Encode::Encoding Encode Implementation Base Class
	278	Encode::Supported List of Supported Encodings
	279	Encode::CN Simplified Chinese Encodings
	280	Encode::JP Japanese Encodings
	281	Encode::KR Korean Encodings
	282	Encode::TW Traditional Chinese Encodings
	283	--------------------------------------------------------
	284
4411f3b6	285	=head1 DESCRIPTION
4411f3b6	286
47bfe92f	287	The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef	288	and the rest of the system. Perl strings are sequences of
	289	B<characters>.
	290
	291	The repertoire of characters that Perl can represent is at least that
	292	defined by the Unicode Consortium. On most platforms the ordinal
	293	values of the characters (as returned by C<ord(ch)>) is the "Unicode
	294	codepoint" for the character (the exceptions are those platforms where
	295	the legacy encoding is some variant of EBCDIC rather than a super-set
	296	of ASCII - see L<perlebcdic>).
	297
	298	Traditionally computer data has been moved around in 8-bit chunks
	299	often called "bytes". These chunks are also known as "octets" in
	300	networking standards. Perl is widely used to manipulate data of many
	301	types - not only strings of characters representing human or computer
	302	languages but also "binary" data being the machines representation of
	303	numbers, pixels in an image - or just about anything.
	304
	305	When Perl is processing "binary data" the programmer wants Perl to
	306	process "sequences of bytes". This is not a problem for Perl - as a
	307	byte has 256 possible values it easily fits in Perl's much larger
	308	"logical character".
	309
	310	=head2 TERMINOLOGY
4411f3b6	311
67d7b5ef	312	=over 4
21938dfa	313
67d7b5ef	314	=item *
	315
	316	I<character>: a character in the range 0..(2**32-1) (or more).
	317	(What Perl's strings are made of.)
	318
	319	=item *
	320
	321	I<byte>: a character in the range 0..255
	322	(A special case of a Perl character.)
	323
	324	=item *
	325
	326	I<octet>: 8 bits of data, with ordinal values 0..255
	327	(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
	328
	329	=back
4411f3b6	330
67d7b5ef	331	The marker [INTERNAL] marks Internal Implementation Details, in
	332	general meant only for those who think they know what they are doing,
	333	and such details may change in future releases.
	334
	335	=head1 PERL ENCODING API
4411f3b6	336
	337	=over 4
	338
f2a2953c	339	=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6	340
47bfe92f	341	Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef	342	a sequence of octets. ENCODING can be either a canonical name or
	343	alias. For encoding names and aliases, see L</"Defining Aliases">.
	344	For CHECK see L</"Handling Malformed Data">.
4411f3b6	345
67d7b5ef	346	For example to convert (internally UTF-8 encoded) Unicode string to
67d7b5ef	347	iso-8859-1 (also known as Latin1),
681a7c68	348
67d7b5ef	349	$octets = encode("iso-8859-1", $unicode);
681a7c68	350
f2a2953c	351	=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6	352
47bfe92f	353	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef	354	internal form and returns the resulting string. as in encode(),
	355	ENCODING can be either a canonical name or alias. For encoding names
	356	and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f	357	L</"Handling Malformed Data">.
47bfe92f	358
1b2c56c8	359	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	360
67d7b5ef	361	$utf8 = decode("iso-8859-1", $latin1);
681a7c68	362
f2a2953c	363	=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f	364
2b106fbe	365	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	366	in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef	367	encode() or through PerlIO: See L</"Encoding and IO">.
	368	For encoding names and aliases, see L</"Defining Aliases">.
	369	For CHECK see L</"Handling Malformed Data">.
2b106fbe	370
1b2c56c8	371	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	372
	373	from_to($data, "iso-8859-1", "utf-8");
	374
	375	and to convert it back:
	376
	377	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	378
ab97ca19	379	Note that because the conversion happens in place, the data to be
	380	converted cannot be a string constant, it must be a scalar variable.
	381
3ef515df	382	from_to() return the length of the converted string on success, undef
	383	otherwise.
	384
4411f3b6	385	=back
4411f3b6	386
f2a2953c	387	=head2 UTF-8 / utf8
	388
	389	The Unicode consortium defines the UTF-8 standard as a way of encoding
	390	the entire Unicode repertoire as sequences of octets. This encoding is
	391	expected to become very widespread. Perl can use this form internally
	392	to represent strings, so conversions to and from this form are
	393	particularly efficient (as octets in memory do not have to change,
	394	just the meta-data that tells Perl how to treat them).
	395
	396	=over 4
	397
	398	=item $octets = encode_utf8($string);
	399
	400	The characters that comprise string are encoded in Perl's superset of UTF-8
	401	and the resulting octets returned as a sequence of bytes. All possible
	402	characters have a UTF-8 representation so this function cannot fail.
	403
	404	=item $string = decode_utf8($octets [, CHECK]);
	405
	406	The sequence of octets represented by $octets is decoded from UTF-8
	407	into a sequence of logical characters. Not all sequences of octets
	408	form valid UTF-8 encodings, so it is possible for this call to fail.
	409	For CHECK see L</"Handling Malformed Data">.
	410
	411	=back
	412
51ef4e11	413	=head2 Listing available encodings
51ef4e11	414
5129552c	415	use Encode;
	416	@list = Encode->encodings();
	417
	418	Returns a list of the canonical names of the available encodings that
	419	are loaded. To get a list of all available encodings including the
	420	ones that are not loaded yet, say
	421
	422	@all_encodings = Encode->encodings(":all");
	423
	424	Or you can give the name of specific module.
	425
c731e18e	426	@with_jp = Encode->encodings("Encode::JP");
	427
	428	When "::" is not in the name, "Encode::" is assumed.
51ef4e11	429
c731e18e	430	@ebcdic = Encode->encodings("EBCDIC");
5d030b67	431
a63c962f	432	To find which encodings are supported by this package in details,
5d030b67	433	see L<Encode::Supported>.
51ef4e11	434
	435	=head2 Defining Aliases
	436
67d7b5ef	437	To add new alias to a given encoding, Use;
67d7b5ef	438
5129552c	439	use Encode;
5129552c	440	use Encode::Alias;
a63c962f	441	define_alias(newName => ENCODING);
51ef4e11	442
3ef515df	443	After that, newName can be used as an alias for ENCODING.
f2a2953c	444	ENCODING may be either the name of an encoding or an
f2a2953c	445	I<encoding object>
51ef4e11	446
fcb875d4	447	But before you do so, make sure the alias is nonexistent with
	448	C<resolve_alias()>, which returns the canonical name thereof.
	449	i.e.
	450
	451	Encode::resolve_alias("latin1") eq "iso-8859-1" # true
	452	Encode::resolve_alias("iso-8859-12") # false; nonexistent
	453	Encode::resolve_alias($name) eq $name # true if $name is canonical
	454
	455	This resolve_alias() does not need C<use Encode::Alias> and is
	456	exported via C<use encode qw(resolve_alias)>.
	457
5d030b67	458	See L<Encode::Alias> on details.
51ef4e11	459
4411f3b6	460	=head1 Encoding and IO
	461
	462	It is very common to want to do encoding transformations when
	463	reading or writing files, network connections, pipes etc.
47bfe92f	464	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	465	C<Encode> provides a "layer" (See L<perliol>) which can transform
	466	data as it is read or written.
	467
8e86646e	468	Here is how the blind poet would modernise the encoding:
8e86646e	469
42234700	470	use Encode;
8e86646e	471	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	472	open(my $utf8,'>:utf8','iliad.utf8');
	473	my @epic = <$iliad>;
	474	print $utf8 @epic;
	475	close($utf8);
	476	close($iliad);
4411f3b6	477
	478	In addition the new IO system can also be configured to read/write
	479	UTF-8 encoded characters (as noted above this is efficient):
	480
e9692b5b	481	open(my $fh,'>:utf8','anything');
e9692b5b	482	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	483
	484	Either of the above forms of "layer" specifications can be made the default
	485	for a lexical scope with the C<use open ...> pragma. See L<open>.
	486
	487	Once a handle is open, its layers can be altered using C<binmode>.
	488
47bfe92f	489	Without any such configuration, or if Perl itself is built using
4411f3b6	490	the system's own IO, then write operations assume that the file handle accepts
	491	only I<bytes> and will C<die> if a character larger than 255 is
	492	written to the handle. When reading, each octet from the handle
	493	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	494	behaviour as bytes-only languages (including Perl before v5.6) would
	495	have, and is sufficient to handle native 8-bit encodings
	496	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	497	other encodings and binary data.
	498
	499	In other cases it is the program's responsibility to transform
	500	characters into bytes using the API above before doing writes, and to
	501	transform the bytes read from a handle into characters before doing
	502	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	503
47bfe92f	504	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	505	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	506	(Latin 1) and UTF-8 (or UTF-EBCDIC on EBCDIC machines):
47bfe92f	507
e9692b5b	508	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	509	open(G, ">:utf8", "data.utf") or die $!;
	510	while (<F>) { print G }
	511
	512	# Could also do "print G <F>" but that would pull
	513	# the whole file into memory just to write it out again.
	514
	515	More examples:
47bfe92f	516
e9692b5b	517	open(my $f, "<:encoding(cp1252)")
	518	open(my $g, ">:encoding(iso-8859-2)")
	519	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	520
47bfe92f	521	See L<PerlIO> for more information.
4411f3b6	522
1768d7eb	523	See also L<encoding> for how to change the default encoding of the
d521382b	524	data in your script.
1768d7eb	525
67d7b5ef	526	=head1 Handling Malformed Data
67d7b5ef	527
f2a2953c	528	If I<CHECK> is not set, (en|de)code will put a I<substitution character> in
	529	place of the malformed character. For UCM-based encodings,
	530	E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
	531	data is supposed to be UTF-8, an optional lexical warning (category
	532	utf8) is given.
67d7b5ef	533
f2a2953c	534	If I<CHECK> is true but not a code reference, dies with an error message.
67d7b5ef	535
f2a2953c	536	In future you will be able to use a code reference to a callback
f2a2953c	537	function for the value of I<CHECK> but its API is still undecided.
67d7b5ef	538
	539	=head1 Defining Encodings
	540
	541	To define a new encoding, use:
	542
	543	use Encode qw(define_encoding);
	544	define_encoding($object, 'canonicalName' [, alias...]);
	545
	546	I<canonicalName> will be associated with I<$object>. The object
	547	should provide the interface described in L<Encode::Encoding>.
	548	If more than two arguments are provided then additional
	549	arguments are taken as aliases for I<$object> as for C<define_alias>.
	550
f2a2953c	551	See L<Encode::Encoding> for more details.
f2a2953c	552
4411f3b6	553	=head1 Messing with Perl's Internals
4411f3b6	554
47bfe92f	555	The following API uses parts of Perl's internals in the current
47bfe92f	556	implementation. As such they are efficient, but may change.
4411f3b6	557
	558	=over 4
	559
a63c962f	560	=item is_utf8(STRING [, CHECK])
4411f3b6	561
4411f3b6	562	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	563	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	564	UTF-8. Returns true if successful, false otherwise.
4411f3b6	565
a63c962f	566	=item _utf8_on(STRING)
4411f3b6	567
	568	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	569	B<not> checked for being well-formed UTF-8. Do not use unless you
	570	B<know> that the STRING is well-formed UTF-8. Returns the previous
	571	state of the UTF-8 flag (so please don't test the return value as
	572	I<not> success or failure), or C<undef> if STRING is not a string.
	573
a63c962f	574	=item _utf8_off(STRING)
4411f3b6	575
	576	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	577	Returns the previous state of the UTF-8 flag (so please don't test the
	578	return value as I<not> success or failure), or C<undef> if STRING is
	579	not a string.
	580
	581	=back
	582
	583	=head1 SEE ALSO
	584
5d030b67	585	L<Encode::Encoding>,
	586	L<Encode::Supported>,
	587	L<PerlIO>,
	588	L<encoding>,
	589	L<perlebcdic>,
	590	L<perlfunc/open>,
	591	L<perlunicode>,
	592	L<utf8>,
	593	the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6	594
aae85ceb	595	=head2 MAINTAINER
	596
	597	This project was originated by Nick Ing-Simmons and later maintained
	598	by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for the full list
	599	of people involved. For any questions, use
	600	E<lt>perl-unicode@perl.orgE<gt> so others can share.
	601
4411f3b6	602	=cut