[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

# Top-level package of the Encode module: declares the exported API, loads
# the compiled part of the module and declares the shared encoding registry.
package Encode;
use strict;
# Build the version string from the RCS keyword: the first group of digits
# is printed as is, each following group as two digits (giving "1.33" here).
our $VERSION = do { my @r = (q$Revision: 1.33 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# When true, encodings() warns before every module it tries to load.
our $DEBUG = 0;

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw (
  decode
  decode_utf8
  encode
  encode_utf8
  encodings
  find_encoding
);

# Exported on request only.
# NOTE(review): several of these names (is_utf8, _utf8_on, utf8_upgrade, ...)
# are not defined in this file; presumably they come from the XS code loaded
# by the bootstrap call below - confirm.
our @EXPORT_OK =
    qw(
       _utf8_off
       _utf8_on
       define_encoding
       from_to
       is_16bit
       is_8bit
       is_utf8
       resolve_alias
       utf8_downgrade
       utf8_upgrade
      );

# Load the compiled part of the module (bootstrap is inherited from
# DynaLoader through @ISA).
bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# True when "A" has ordinal 193, which is its code point in EBCDIC; selects
# which built-in encoding predefine_encodings() installs.
our $ON_EBCDIC = (ord("A") == 193);

# NOTE(review): define_alias and find_alias used below are presumably
# imported from Encode::Alias - confirm.
use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
# (it maps encoding names to encoding objects, see define_encoding).
our %Encoding;
# NOTE(review): %ExtModule used below is not declared in this file;
# presumably it is imported from Encode::Config - confirm.
use Encode::Config;

# Class method returning the names of the encodings that are loaded.
#
#   Encode->encodings()           names currently registered in %Encoding
#   Encode->encodings(":all")     first tries to load every module listed
#                                 in %ExtModule
#   Encode->encodings("JP", ...)  first tries to load the named modules; a
#                                 name without "::" gets "Encode::" prepended
#
# Loading is best effort: a module that cannot be required is skipped (the
# failure is reported only when $DEBUG is true).  The entries "Internal" and
# "Unicode" are never listed.  The result is sorted case-insensitively.
sub encodings
{
    my $class = shift;
    my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
    for my $mod (@modules){
        # Turn a module name into the relative path that require() expects.
        $mod =~ s,::,/,g or $mod = "Encode/$mod";
        $mod .= '.pm';
        $DEBUG and warn "about to require $mod;";
        next if eval { require $mod; 1 };
        $DEBUG and warn "could not load $mod: $@";
    }
    return
        sort { lc $a cmp lc $b }
             grep {!/^(?:Internal|Unicode)$/} keys %Encoding;
}

# Registers an encoding object under its canonical name.
#
#   define_encoding($obj, $name [, @aliases])
#
# $obj is stored in %Encoding under $name.  If $name contains upper-case
# characters its lower-cased form is registered as an alias, and so is every
# additional argument.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;
    $Encoding{$name} = $obj;

    my $lowered = lc($name);
    if ($lowered ne $name) {
        define_alias($lowered => $obj);
    }

    for my $alias (@aliases) {
        define_alias($alias, $obj);
    }
    return $obj;
}

# Class method that resolves an encoding name to an encoding object.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. $name is already an object that can('new_sequence'): returned as is;
#   2. %Encoding, under $name and then under lc($name);
#   3. $class->find_alias(), for $name and then for lc($name);
#   4. unless $skip_external is true, the module listed in %ExtModule for
#      $name or lc($name) is loaded (best effort) and %Encoding is consulted
#      again under both spellings.
#
# Returns the encoding object, or nothing (undef in scalar context, the
# empty list in list context) when the name cannot be resolved.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;
    if (ref($name) && $name->can('new_sequence'))
    {
        return $name;
    }
    my $lc = lc $name;
    for my $key ($name, $lc)
    {
        return $Encoding{$key} if exists $Encoding{$key};
    }

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            # Best effort: a module that fails to load simply registers
            # nothing, and the lookup below then finds nothing.
            eval{ require $mod; };
            # The module may have been found through the lower-cased name,
            # so accept a registration under either spelling, exactly as the
            # initial %Encoding lookup does.
            for my $key ($name, $lc)
            {
                return $Encoding{$key} if exists $Encoding{$key};
            }
        }
    }
    return;
}

# Function-style front end to Encode->getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Returns whatever getEncoding() returns for the same two arguments.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}

# Returns the name() reported by the encoding object that the given name or
# alias resolves to, or nothing when find_encoding() cannot resolve it.
sub resolve_alias {
    my ($alias) = @_;
    my $obj = find_encoding($alias);
    return unless defined $obj;
    return $obj->name;
}

# Encodes $string with the encoding called $name and returns the octets.
#
#   encode($name, $string [, $check])
#
# Croaks when $name cannot be resolved.  When $check is true and the local
# copy of $string is still non-empty after the encoding object has processed
# it, undef is returned instead of the octets.  The caller's variable is
# never modified.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string, $check);
    if ($check && length($string)) {
        return undef;
    }
    return $octets;
}

# Decodes $octets with the encoding called $name and returns the string.
#
#   decode($name, $octets [, $check])
#
# Croaks when $name cannot be resolved.  When $check is true, whatever the
# encoding object left in the local copy of $octets is written back to the
# caller's second argument.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check) {
        $_[1] = $octets;
    }
    return $string;
}

# Converts the caller's first argument in place from one encoding to another.
#
#   from_to($string, $from, $to [, $check])
#
# Croaks when either encoding name cannot be resolved.  The data is decoded
# with $from and the result encoded with $to.  When $check is true and
# either step leaves its input non-empty, undef is returned and the caller's
# variable is left untouched.  Otherwise the converted data is stored in the
# caller's variable and its length is returned (undef if the result is
# undefined).
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $uni = $source->decode($string, $check);
    if ($check && length($string)) {
        return undef;
    }

    $string = $target->encode($uni, $check);
    if ($check && length($uni)) {
        return undef;
    }

    # $_[0] is an alias of the caller's variable: this is the in-place update.
    $_[0] = $string;
    return defined($_[0]) ? length($string) : undef;
}

# Returns the UTF-8 encoded octets of its first argument.  The work is done
# on a copy, so the caller's variable is not modified.
sub encode_utf8
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}

# Decodes its first argument from UTF-8 and returns the resulting string, or
# undef when utf8::decode() reports failure.  The work is done on a copy, so
# the caller's variable is not modified; any further arguments are ignored.
sub decode_utf8
{
    my $str = $_[0];
    if (utf8::decode($str)) {
        return $str;
    }
    return undef;
}

predefine_encodings();

#
# Installs the encodings that are implemented directly in this file and
# registers their objects in %Encode::Encoding.  Called once when the module
# is loaded; it can be called again to restore those registry entries if
# really needed.
#
# Every encoding object provides name(), new_sequence(), decode() and
# encode().  decode()/encode() take ($obj, $data, $check); when $check is
# true they empty the caller's data argument on success.
#
sub predefine_encodings{
    if ($ON_EBCDIC) {
        # EBCDIC platforms (this code was in Encode::UTF_EBCDIC): the object
        # is named "UTF_EBCDIC" and registered under the key "Internal".
        package Encode::UTF_EBCDIC;
        *name         = sub { return $_[0]->{'Name'} };
        *new_sequence = sub { return $_[0] };
        *decode = sub {
            my ($obj, $str, $chk) = @_;
            # Map every character through utf8::unicode_to_native().
            my $res = join '',
                map { chr(utf8::unicode_to_native(ord($_))) } split //, $str;
            if ($chk) { $_[1] = ''; }
            return $res;
        };
        *encode = sub {
            my ($obj, $str, $chk) = @_;
            # Map every character through utf8::native_to_unicode().
            my $res = join '',
                map { chr(utf8::native_to_unicode(ord($_))) } split //, $str;
            if ($chk) { $_[1] = ''; }
            return $res;
        };
        $Encode::Encoding{Internal} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {
        # All other platforms: the object is named "Internal" and registered
        # under the key "Unicode".
        # NOTE(review): the comment here used to read "was in
        # Encode::UTF_EBCDIC", which looks like a copy-paste slip;
        # presumably this code came from Encode::Internal - confirm.
        package Encode::Internal;
        *name         = sub { return $_[0]->{'Name'} };
        *new_sequence = sub { return $_[0] };
        *decode = sub {
            my ($obj, $str, $chk) = @_;
            utf8::upgrade($str);
            if ($chk) { $_[1] = ''; }
            return $str;
        };
        # Encoding is the same operation as decoding here.  This must stay
        # after the assignment to *decode so that the reference points at
        # the sub installed above.
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    }

    {
        # The utf8 encoding (this code was in Encode::utf8), built on the
        # decode_utf8()/encode_utf8() functions of this module.
        package Encode::utf8;
        *name         = sub { return $_[0]->{'Name'} };
        *new_sequence = sub { return $_[0] };
        *decode = sub {
            my ($obj, $octets, $chk) = @_;
            my $str = Encode::decode_utf8($octets);
            # On failure the caller's data is left untouched.
            return undef unless defined $str;
            if ($chk) { $_[1] = ''; }
            return $str;
        };
        *encode = sub {
            my ($obj, $string, $chk) = @_;
            my $octets = Encode::encode_utf8($string);
            if ($chk) { $_[1] = ''; }
            return $octets;
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
    }

    # Do externals if necessary: each name listed here would be run from the
    # "Encode" directory next to Encode.pm.  The list is currently empty, so
    # the loop body never executes.
    require File::Basename;
    require File::Spec;
    for my $ext (qw()) {
        my $dir = File::Basename::dirname($INC{'Encode.pm'});
        my $pm  = File::Spec->catfile($dir, "Encode", "$ext.pm");
        do $pm;
    }
}

require Encode::Encoding;
require Encode::XS;

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;


=head2 Table of Contents

Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:

  Name                  Description
  --------------------------------------------------------
  Encode::Alias         Alias definitions to encodings
  Encode::Encoding      Encode Implementation Base Class
  Encode::Supported     List of Supported Encodings
  Encode::CN            Simplified Chinese Encodings
  Encode::JP            Japanese Encodings
  Encode::KR            Korean Encodings
  Encode::TW            Traditional Chinese Encodings
  --------------------------------------------------------

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of
B<characters>.

The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
values of the characters (as returned by C<ord(ch)>) are the "Unicode
codepoints" for the characters (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).

Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.

When Perl is processing "binary data" the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
byte has 256 possible values it easily fits in Perl's much larger
"logical character".

=head2 TERMINOLOGY

=over 4

=item *

I<character>: a character in the range 0..(2**32-1) (or more).
(What Perl's strings are made of.)

=item *

I<byte>: a character in the range 0..255
(A special case of a Perl character.)

=item *

I<octet>: 8 bits of data, with ordinal values 0..255
(Term for bytes passed to or from a non-Perl context, e.g. disk file.)

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head1 PERL ENCODING API

=over 4

=item $octets  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  ENCODING can be either a canonical name or
alias.  For encoding names and aliases, see L</"Defining Aliases">.
For CHECK see L</"Handling Malformed Data">.

For example to convert (internally UTF-8 encoded) Unicode string to
iso-8859-1 (also known as Latin1), 

  $octets = encode("iso-8859-1", $unicode);

=item $string = decode(ENCODING, $octets[, CHECK])

Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data into a string in Perl's
internal form:

  $utf8 = decode("iso-8859-1", $latin1);

=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])

Converts B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.
For encoding names and aliases, see L</"Defining Aliases">.
For CHECK see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

from_to() returns the length of the converted string on success, undef
otherwise.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item $octets = encode_utf8($string);

The characters that comprise $string are encoded in Perl's superset of UTF-8
and the resulting octets are returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item $string = decode_utf8($octets [, CHECK]);

The sequence of octets represented by $octets is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of specific module.

  @with_jp = Encode->encodings("Encode::JP");

When "::" is not in the name, "Encode::" is assumed.

  @ebcdic = Encode->encodings("EBCDIC");

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.

=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(newName => ENCODING);

After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.

But before you do so, make sure the alias is nonexistent with
C<resolve_alias()>, which returns the canonical name thereof.
i.e.

  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
  Encode::resolve_alias($name) eq $name  # true if $name is canonical

This resolve_alias() does not need C<use Encode::Alias> and is
exported via C<use Encode qw(resolve_alias)>.

See L<Encode::Alias> for details.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open, its layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)")
    open(my $g, ">:encoding(iso-8859-2)")
    open(my $h, ">:encoding(latin9)")       # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Handling Malformed Data

If I<CHECK> is not set, (en|de)code will put a I<substitution character> in
place of the malformed character.  For UCM-based encodings,
E<lt>subcharE<gt> will be used.  For Unicode, \xFFFD is used.  If the
data is supposed to be UTF-8, an optional lexical warning (category
utf8) is given.

If I<CHECK> is true but not a code reference, dies with an error message.

In future you will be able to use a code reference to a callback
function for the value of I<CHECK> but its API is still undecided.

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, 'canonicalName' [, alias...]);

I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

See L<Encode::Encoding> for more details.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<PerlIO>, 
L<encoding>,
L<perlebcdic>, 
L<perlfunc/open>, 
L<perlunicode>, 
L<utf8>, 
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
448e90bb	3	our $VERSION = do { my @r = (q$Revision: 1.33 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c	4	our $DEBUG = 0;
2c674647	5
	6	require DynaLoader;
	7	require Exporter;
	8
51ef4e11	9	our @ISA = qw(Exporter DynaLoader);
2c674647	10
4411f3b6	11	# Public, encouraged API is exported by default
51ef4e11	12	our @EXPORT = qw (
4411f3b6	13	decode
4411f3b6	14	decode_utf8
fcb875d4	15	encode
fcb875d4	16	encode_utf8
51ef4e11	17	encodings
fcb875d4	18	find_encoding
4411f3b6	19	);
4411f3b6	20
51ef4e11	21	our @EXPORT_OK =
2c674647	22	qw(
fcb875d4	23	_utf8_off
fcb875d4	24	_utf8_on
51ef4e11	25	define_encoding
2c674647	26	from_to
4411f3b6	27	is_16bit
fcb875d4	28	is_8bit
	29	is_utf8
	30	resolve_alias
a12c0f56	31	utf8_downgrade
fcb875d4	32	utf8_upgrade
2c674647	33	);
	34
	35	bootstrap Encode ();
	36
4411f3b6	37	# Documentation moved after __END__ for speed - NI-S
2c674647	38
bf230f3d	39	use Carp;
bf230f3d	40
a63c962f	41	our $ON_EBCDIC = (ord("A") == 193);
f2a2953c	42
5d030b67	43	use Encode::Alias;
5d030b67	44
5129552c	45	# Make a %Encoding package variable to allow a certain amount of cheating
5129552c	46	our %Encoding;
fdd579e2	47	use Encode::Config;
5129552c	48
656753f8	49	sub encodings
656753f8	50	{
5129552c	51	my $class = shift;
071db25d	52	my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
c731e18e	53	for my $mod (@modules){
	54	$mod =~ s,::,/,g or $mod = "Encode/$mod";
	55	$mod .= '.pm';
	56	$DEBUG and warn "about to require $mod;";
	57	eval { require $mod; };
5129552c	58	}
c731e18e	59	my %modules = map {$_ => 1} @modules;
5129552c	60	return
ce912cd4	61	sort { lc $a cmp lc $b }
ce912cd4	62	grep {!/^(?:Internal\|Unicode)$/o} keys %Encoding;
51ef4e11	63	}
51ef4e11	64
51ef4e11	65	sub define_encoding
51ef4e11	66	{
18586f54	67	my $obj = shift;
18586f54	68	my $name = shift;
5129552c	69	$Encoding{$name} = $obj;
18586f54	70	my $lc = lc($name);
	71	define_alias($lc => $obj) unless $lc eq $name;
	72	while (@_)
	73	{
	74	my $alias = shift;
	75	define_alias($alias,$obj);
	76	}
	77	return $obj;
656753f8	78	}
656753f8	79
656753f8	80	sub getEncoding
656753f8	81	{
dd9703c9	82	my ($class,$name,$skip_external) = @_;
18586f54	83	my $enc;
	84	if (ref($name) && $name->can('new_sequence'))
	85	{
	86	return $name;
	87	}
	88	my $lc = lc $name;
5129552c	89	if (exists $Encoding{$name})
18586f54	90	{
5129552c	91	return $Encoding{$name};
18586f54	92	}
5129552c	93	if (exists $Encoding{$lc})
18586f54	94	{
5129552c	95	return $Encoding{$lc};
18586f54	96	}
c50d192e	97
5129552c	98	my $oc = $class->find_alias($name);
c50d192e	99	return $oc if defined $oc;
c50d192e	100
5129552c	101	$oc = $class->find_alias($lc) if $lc ne $name;
c50d192e	102	return $oc if defined $oc;
c50d192e	103
c731e18e	104	unless ($skip_external)
d1ed7747	105	{
c731e18e	106	if (my $mod = $ExtModule{$name} \|\| $ExtModule{$lc}){
	107	$mod =~ s,::,/,g ; $mod .= '.pm';
	108	eval{ require $mod; };
	109	return $Encoding{$name} if exists $Encoding{$name};
	110	}
d1ed7747	111	}
18586f54	112	return;
656753f8	113	}
656753f8	114
4411f3b6	115	sub find_encoding
4411f3b6	116	{
dd9703c9	117	my ($name,$skip_external) = @_;
dd9703c9	118	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	119	}
4411f3b6	120
fcb875d4	121	sub resolve_alias {
	122	my $obj = find_encoding(shift);
	123	defined $obj and return $obj->name;
	124	return;
	125	}
	126
4411f3b6	127	sub encode
4411f3b6	128	{
18586f54	129	my ($name,$string,$check) = @_;
	130	my $enc = find_encoding($name);
	131	croak("Unknown encoding '$name'") unless defined $enc;
	132	my $octets = $enc->encode($string,$check);
	133	return undef if ($check && length($string));
	134	return $octets;
4411f3b6	135	}
	136
	137	sub decode
	138	{
18586f54	139	my ($name,$octets,$check) = @_;
	140	my $enc = find_encoding($name);
	141	croak("Unknown encoding '$name'") unless defined $enc;
	142	my $string = $enc->decode($octets,$check);
	143	$_[1] = $octets if $check;
	144	return $string;
4411f3b6	145	}
	146
	147	sub from_to
	148	{
18586f54	149	my ($string,$from,$to,$check) = @_;
	150	my $f = find_encoding($from);
	151	croak("Unknown encoding '$from'") unless defined $f;
	152	my $t = find_encoding($to);
	153	croak("Unknown encoding '$to'") unless defined $t;
	154	my $uni = $f->decode($string,$check);
	155	return undef if ($check && length($string));
a999c27c	156	$string = $t->encode($uni,$check);
18586f54	157	return undef if ($check && length($uni));
3ef515df	158	return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6	159	}
	160
	161	sub encode_utf8
	162	{
18586f54	163	my ($str) = @_;
c731e18e	164	utf8::encode($str);
18586f54	165	return $str;
4411f3b6	166	}
	167
	168	sub decode_utf8
	169	{
18586f54	170	my ($str) = @_;
	171	return undef unless utf8::decode($str);
	172	return $str;
5ad8ef52	173	}
5ad8ef52	174
f2a2953c	175	predefine_encodings();
	176
	177	#
	178	# This is to restore %Encoding if really needed;
	179	#
	180	sub predefine_encodings{
	181	if ($ON_EBCDIC) {
	182	# was in Encode::UTF_EBCDIC
	183	package Encode::UTF_EBCDIC;
	184	*name = sub{ shift->{'Name'} };
	185	*new_sequence = sub{ return $_[0] };
	186	*decode = sub{
	187	my ($obj,$str,$chk) = @_;
	188	my $res = '';
	189	for (my $i = 0; $i < length($str); $i++) {
	190	$res .=
	191	chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
	192	}
	193	$_[1] = '' if $chk;
	194	return $res;
	195	};
	196	*encode = sub{
	197	my ($obj,$str,$chk) = @_;
	198	my $res = '';
	199	for (my $i = 0; $i < length($str); $i++) {
	200	$res .=
	201	chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
	202	}
	203	$_[1] = '' if $chk;
	204	return $res;
	205	};
c731e18e	206	$Encode::Encoding{Internal} =
c731e18e	207	bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c	208	} else {
	209	# was in Encode::UTF_EBCDIC
	210	package Encode::Internal;
	211	*name = sub{ shift->{'Name'} };
	212	*new_sequence = sub{ return $_[0] };
	213	*decode = sub{
	214	my ($obj,$str,$chk) = @_;
	215	utf8::upgrade($str);
	216	$_[1] = '' if $chk;
	217	return $str;
	218	};
	219	*encode = \&decode;
	220	$Encode::Encoding{Unicode} =
c731e18e	221	bless {Name => "Internal"} => "Encode::Internal";
f2a2953c	222	}
	223
	224	{
	225	# was in Encode::utf8
	226	package Encode::utf8;
	227	*name = sub{ shift->{'Name'} };
	228	*new_sequence = sub{ return $_[0] };
	229	*decode = sub{
	230	my ($obj,$octets,$chk) = @_;
	231	my $str = Encode::decode_utf8($octets);
	232	if (defined $str) {
	233	$_[1] = '' if $chk;
	234	return $str;
	235	}
	236	return undef;
	237	};
	238	*encode = sub {
	239	my ($obj,$string,$chk) = @_;
	240	my $octets = Encode::encode_utf8($string);
	241	$_[1] = '' if $chk;
	242	return $octets;
	243	};
	244	$Encode::Encoding{utf8} =
c731e18e	245	bless {Name => "utf8"} => "Encode::utf8";
f2a2953c	246	}
	247	# do externals if necessary
	248	require File::Basename;
	249	require File::Spec;
c731e18e	250	for my $ext (qw()){
f2a2953c	251	my $pm =
	252	File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
	253	"Encode", "$ext.pm");
	254	do $pm;
	255	}
	256	}
	257
18586f54	258	require Encode::Encoding;
18586f54	259	require Encode::XS;
4411f3b6	260
656753f8	261	1;
656753f8	262
2a936312	263	__END__
2a936312	264
4411f3b6	265	=head1 NAME
	266
	267	Encode - character encodings
	268
	269	=head1 SYNOPSIS
	270
	271	use Encode;
	272
67d7b5ef	273
	274	=head2 Table of Contents
	275
	276	Encode consists of a collection of modules which details are too big
	277	to fit in one document. This POD itself explains the top-level APIs
	278	and general topics at a glance. For other topics and more details,
	279	see the PODs below;
	280
	281	Name Description
	282	--------------------------------------------------------
	283	Encode::Alias Alias defintions to encodings
	284	Encode::Encoding Encode Implementation Base Class
	285	Encode::Supported List of Supported Encodings
	286	Encode::CN Simplified Chinese Encodings
	287	Encode::JP Japanese Encodings
	288	Encode::KR Korean Encodings
	289	Encode::TW Traditional Chinese Encodings
	290	--------------------------------------------------------
	291
4411f3b6	292	=head1 DESCRIPTION
4411f3b6	293
47bfe92f	294	The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef	295	and the rest of the system. Perl strings are sequences of
	296	B<characters>.
	297
	298	The repertoire of characters that Perl can represent is at least that
	299	defined by the Unicode Consortium. On most platforms the ordinal
	300	values of the characters (as returned by C<ord(ch)>) is the "Unicode
	301	codepoint" for the character (the exceptions are those platforms where
	302	the legacy encoding is some variant of EBCDIC rather than a super-set
	303	of ASCII - see L<perlebcdic>).
	304
	305	Traditionally computer data has been moved around in 8-bit chunks
	306	often called "bytes". These chunks are also known as "octets" in
	307	networking standards. Perl is widely used to manipulate data of many
	308	types - not only strings of characters representing human or computer
	309	languages but also "binary" data being the machines representation of
	310	numbers, pixels in an image - or just about anything.
	311
	312	When Perl is processing "binary data" the programmer wants Perl to
	313	process "sequences of bytes". This is not a problem for Perl - as a
	314	byte has 256 possible values it easily fits in Perl's much larger
	315	"logical character".
	316
	317	=head2 TERMINOLOGY
4411f3b6	318
67d7b5ef	319	=over 4
21938dfa	320
67d7b5ef	321	=item *
	322
	323	I<character>: a character in the range 0..(2**32-1) (or more).
	324	(What Perl's strings are made of.)
	325
	326	=item *
	327
	328	I<byte>: a character in the range 0..255
	329	(A special case of a Perl character.)
	330
	331	=item *
	332
	333	I<octet>: 8 bits of data, with ordinal values 0..255
	334	(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
	335
	336	=back
4411f3b6	337
67d7b5ef	338	The marker [INTERNAL] marks Internal Implementation Details, in
	339	general meant only for those who think they know what they are doing,
	340	and such details may change in future releases.
	341
	342	=head1 PERL ENCODING API
4411f3b6	343
	344	=over 4
	345
f2a2953c	346	=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6	347
47bfe92f	348	Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef	349	a sequence of octets. ENCODING can be either a canonical name or
	350	alias. For encoding names and aliases, see L</"Defining Aliases">.
	351	For CHECK see L</"Handling Malformed Data">.
4411f3b6	352
67d7b5ef	353	For example to convert (internally UTF-8 encoded) Unicode string to
67d7b5ef	354	iso-8859-1 (also known as Latin1),
681a7c68	355
67d7b5ef	356	$octets = encode("iso-8859-1", $unicode);
681a7c68	357
f2a2953c	358	=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6	359
47bfe92f	360	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef	361	internal form and returns the resulting string. as in encode(),
	362	ENCODING can be either a canonical name or alias. For encoding names
	363	and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f	364	L</"Handling Malformed Data">.
47bfe92f	365
1b2c56c8	366	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	367
67d7b5ef	368	$utf8 = decode("iso-8859-1", $latin1);
681a7c68	369
f2a2953c	370	=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f	371
2b106fbe	372	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	373	in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef	374	encode() or through PerlIO: See L</"Encoding and IO">.
	375	For encoding names and aliases, see L</"Defining Aliases">.
	376	For CHECK see L</"Handling Malformed Data">.
2b106fbe	377
1b2c56c8	378	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	379
	380	from_to($data, "iso-8859-1", "utf-8");
	381
	382	and to convert it back:
	383
	384	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	385
ab97ca19	386	Note that because the conversion happens in place, the data to be
	387	converted cannot be a string constant, it must be a scalar variable.
	388
3ef515df	389	from_to() return the length of the converted string on success, undef
	390	otherwise.
	391
4411f3b6	392	=back
4411f3b6	393
f2a2953c	394	=head2 UTF-8 / utf8
	395
	396	The Unicode consortium defines the UTF-8 standard as a way of encoding
	397	the entire Unicode repertoire as sequences of octets. This encoding is
	398	expected to become very widespread. Perl can use this form internally
	399	to represent strings, so conversions to and from this form are
	400	particularly efficient (as octets in memory do not have to change,
	401	just the meta-data that tells Perl how to treat them).
	402
	403	=over 4
	404
	405	=item $octets = encode_utf8($string);
	406
	407	The characters that comprise string are encoded in Perl's superset of UTF-8
	408	and the resulting octets returned as a sequence of bytes. All possible
	409	characters have a UTF-8 representation so this function cannot fail.
	410
	411	=item $string = decode_utf8($octets [, CHECK]);
	412
	413	The sequence of octets represented by $octets is decoded from UTF-8
	414	into a sequence of logical characters. Not all sequences of octets
	415	form valid UTF-8 encodings, so it is possible for this call to fail.
	416	For CHECK see L</"Handling Malformed Data">.
	417
	418	=back
	419
51ef4e11	420	=head2 Listing available encodings
51ef4e11	421
5129552c	422	use Encode;
	423	@list = Encode->encodings();
	424
	425	Returns a list of the canonical names of the available encodings that
	426	are loaded. To get a list of all available encodings including the
	427	ones that are not loaded yet, say
	428
	429	@all_encodings = Encode->encodings(":all");
	430
	431	Or you can give the name of specific module.
	432
c731e18e	433	@with_jp = Encode->encodings("Encode::JP");
	434
	435	When "::" is not in the name, "Encode::" is assumed.
51ef4e11	436
c731e18e	437	@ebcdic = Encode->encodings("EBCDIC");
5d030b67	438
a63c962f	439	To find which encodings are supported by this package in details,
5d030b67	440	see L<Encode::Supported>.
51ef4e11	441
	442	=head2 Defining Aliases
	443
67d7b5ef	444	To add new alias to a given encoding, Use;
67d7b5ef	445
5129552c	446	use Encode;
5129552c	447	use Encode::Alias;
a63c962f	448	define_alias(newName => ENCODING);
51ef4e11	449
3ef515df	450	After that, newName can be used as an alias for ENCODING.
f2a2953c	451	ENCODING may be either the name of an encoding or an
f2a2953c	452	I<encoding object>
51ef4e11	453
fcb875d4	454	But before you do so, make sure the alias is nonexistent with
	455	C<resolve_alias()>, which returns the canonical name thereof.
	456	i.e.
	457
	458	Encode::resolve_alias("latin1") eq "iso-8859-1" # true
	459	Encode::resolve_alias("iso-8859-12") # false; nonexistent
	460	Encode::resolve_alias($name) eq $name # true if $name is canonical
	461
	462	This resolve_alias() does not need C<use Encode::Alias> and is
	463	exported via C<use Encode qw(resolve_alias)>.
	464
5d030b67	465	See L<Encode::Alias> for details.
51ef4e11	466
4411f3b6	467	=head1 Encoding and IO
	468
	469	It is very common to want to do encoding transformations when
	470	reading or writing files, network connections, pipes etc.
47bfe92f	471	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	472	C<Encode> provides a "layer" (See L<perliol>) which can transform
	473	data as it is read or written.
	474
8e86646e	475	Here is how the blind poet would modernise the encoding:
8e86646e	476
42234700	477	use Encode;
8e86646e	478	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	479	open(my $utf8,'>:utf8','iliad.utf8');
	480	my @epic = <$iliad>;
	481	print $utf8 @epic;
	482	close($utf8);
	483	close($iliad);
4411f3b6	484
	485	In addition the new IO system can also be configured to read/write
	486	UTF-8 encoded characters (as noted above this is efficient):
	487
e9692b5b	488	open(my $fh,'>:utf8','anything');
e9692b5b	489	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	490
	491	Either of the above forms of "layer" specifications can be made the default
	492	for a lexical scope with the C<use open ...> pragma. See L<open>.
	493
	494	Once a handle is open, its layers can be altered using C<binmode>.
	495
47bfe92f	496	Without any such configuration, or if Perl itself is built using the
4411f3b6	497	system's own IO, then write operations assume that the file handle accepts
	498	only I<bytes> and will C<die> if a character larger than 255 is
	499	written to the handle. When reading, each octet from the handle
	500	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	501	behaviour as bytes-only languages (including Perl before v5.6) would
	502	have, and is sufficient to handle native 8-bit encodings
	503	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	504	other encodings and binary data.
	505
	506	In other cases it is the program's responsibility to transform
	507	characters into bytes using the API above before doing writes, and to
	508	transform the bytes read from a handle into characters before doing
	509	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	510
47bfe92f	511	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	512	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	513	(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
47bfe92f	514
e9692b5b	515	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	516	open(G, ">:utf8", "data.utf") or die $!;
	517	while (<F>) { print G }
	518
	519	# Could also do "print G <F>" but that would pull
	520	# the whole file into memory just to write it out again.
	521
	522	More examples:
47bfe92f	523
e9692b5b	524	open(my $f, "<:encoding(cp1252)")
	525	open(my $g, ">:encoding(iso-8859-2)")
	526	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	527
47bfe92f	528	See L<PerlIO> for more information.
4411f3b6	529
1768d7eb	530	See also L<encoding> for how to change the default encoding of the
d521382b	531	data in your script.
1768d7eb	532
67d7b5ef	533	=head1 Handling Malformed Data
67d7b5ef	534
f2a2953c	535	If I<CHECK> is not set, (en|de)code will put I<substitution character> in
	536	place of the malformed character. For UCM-based encodings,
	537	E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
	538	data is supposed to be UTF-8, an optional lexical warning (category
	539	utf8) is given.
67d7b5ef	540
f2a2953c	541	If I<CHECK> is true but not a code reference, dies with an error message.
67d7b5ef	542
f2a2953c	543	In future you will be able to use a code reference to a callback
f2a2953c	544	function for the value of I<CHECK> but its API is still undecided.
67d7b5ef	545
	546	=head1 Defining Encodings
	547
	548	To define a new encoding, use:
	549
	550	use Encode qw(define_encoding);
	551	define_encoding($object, 'canonicalName' [, alias...]);
	552
	553	I<canonicalName> will be associated with I<$object>. The object
	554	should provide the interface described in L<Encode::Encoding>.
	555	If more than two arguments are provided then additional
	556	arguments are taken as aliases for I<$object> as for C<define_alias>.
	557
f2a2953c	558	See L<Encode::Encoding> for more details.
f2a2953c	559
4411f3b6	560	=head1 Messing with Perl's Internals
4411f3b6	561
47bfe92f	562	The following API uses parts of Perl's internals in the current
47bfe92f	563	implementation. As such they are efficient, but may change.
4411f3b6	564
	565	=over 4
	566
a63c962f	567	=item is_utf8(STRING [, CHECK])
4411f3b6	568
4411f3b6	569	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	570	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	571	UTF-8. Returns true if successful, false otherwise.
4411f3b6	572
a63c962f	573	=item _utf8_on(STRING)
4411f3b6	574
	575	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	576	B<not> checked for being well-formed UTF-8. Do not use unless you
	577	B<know> that the STRING is well-formed UTF-8. Returns the previous
	578	state of the UTF-8 flag (so please don't test the return value as
	579	I<not> success or failure), or C<undef> if STRING is not a string.
	580
a63c962f	581	=item _utf8_off(STRING)
4411f3b6	582
	583	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	584	Returns the previous state of the UTF-8 flag (so please don't test the
	585	return value as I<not> success or failure), or C<undef> if STRING is
	586	not a string.
	587
	588	=back
	589
	590	=head1 SEE ALSO
	591
5d030b67	592	L<Encode::Encoding>,
	593	L<Encode::Supported>,
	594	L<PerlIO>,
	595	L<encoding>,
	596	L<perlebcdic>,
	597	L<perlfunc/open>,
	598	L<perlunicode>,
	599	L<utf8>,
	600	the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6	601
4411f3b6	602	=cut