[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;
use strict;
# Derive the version number from the RCS "Revision" keyword: the digit groups
# are extracted and formatted as "major.minor" with two digits per minor part.
our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# When true, encodings() warns before each module it tries to load.
our $DEBUG = 0;

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw (
  encode
  decode
  encode_utf8
  decode_utf8
  find_encoding
  encodings
);

# Names that are exported only on request.
our @EXPORT_OK =
    qw(
       define_encoding
       from_to
       is_utf8
       is_8bit
       is_16bit
       utf8_upgrade
       utf8_downgrade
       _utf8_on
       _utf8_off
      );

# Load the compiled part of the module through DynaLoader.
bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# True when "A" has ordinal 193, i.e. on an EBCDIC platform (it is 65 in ASCII).
our $ON_EBCDIC = (ord("A") == 193);

# NOTE(review): presumably the source of define_alias/find_alias used by the
# subs below - confirm against Encode::Alias.
use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
# (it maps encoding names to the objects that implement them).
our %Encoding;
# NOTE(review): presumably the source of %ExtModule (encoding name => module
# that provides it) - confirm against Encode::Config.
use Encode::Config;

# Return the sorted names of the encodings currently registered in
# %Encoding, optionally loading additional encoding modules first.
#
# Intended to be called as a class method, Encode->encodings(LIST); the
# first argument (the invocant) is discarded.  LIST may contain:
#   ":all"        as the first item: try to load every module named in
#                 %ExtModule
#   module names  either "Encode::JP" style names, or bare names such as
#                 "JP", to which "Encode/" is prefixed
# A module that fails to load is skipped silently (best effort).  The
# registry keys "Internal" and "Unicode" are never reported.
sub encodings
{
    my $class = shift;
    my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
    # The same module may be requested more than once (many encodings can
    # live in one module); attempt each distinct module only once.
    my %seen;
    for my $mod (grep { !$seen{$_}++ } @modules){
	# Turn the module name into the relative file name require() expects.
	$mod =~ s,::,/,g or $mod = "Encode/$mod";
	$mod .= '.pm';
	$DEBUG and warn "about to require $mod;";
	eval { require $mod; };
    }
    return
	sort grep {!/^(?:Internal|Unicode)$/} keys %Encoding;
}

# Register an encoding object under its canonical name.
#
#   define_encoding($obj, $name [, @aliases])
#
# $obj is stored in %Encoding under $name.  When $name is not already all
# lower case, its lower-cased form is defined as an alias, and so is every
# further argument.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;
    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ($lower ne $name)
    {
	define_alias($lower => $obj);
    }

    for my $alias (@aliases)
    {
	define_alias($alias,$obj);
    }
    return $obj;
}

# Resolve an encoding name (or encoding object) to an encoding object.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. $name itself, when it is a reference that can('new_sequence')
#   2. %Encoding under $name, then under lc($name)
#   3. $class->find_alias for $name, then for lc($name)
#   4. unless $skip_external is true: load the module that %ExtModule
#      lists for $name or lc($name) and look in %Encoding again
# Returns the encoding object, or nothing (empty list / undef) when the
# name is undefined or cannot be resolved.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;
    # Nothing to look up.
    return unless defined $name;
    if (ref($name) && $name->can('new_sequence'))
    {
	return $name;
    }
    my $lc = lc $name;
    if (exists $Encoding{$name})
    {
	return $Encoding{$name};
    }
    if (exists $Encoding{$lc})
    {
	return $Encoding{$lc};
    }

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
	if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
	    $mod =~ s,::,/,g ; $mod .= '.pm';
	    # Best effort: a module that fails to load just leaves the
	    # encoding unresolved.
	    eval{ require $mod; };
	    # The module may have been found through the lower-cased key,
	    # so accept a registration under either spelling.
	    for my $key ($name, $lc)
	    {
		return $Encoding{$key} if exists $Encoding{$key};
	    }
	}
    }
    return;
}

# Functional front end to getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Passes both arguments to getEncoding(), using this package as the
# invocant, and returns whatever it returns.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}

# Encode a string with the named encoding.
#
#   $octets = encode($name, $string [, $check])
#
# Croaks when $name cannot be resolved by find_encoding().  When $check is
# true and the local copy of $string is not empty after the encoder has
# run, undef is returned; otherwise the encoder's result is returned.
sub encode
{
    my ($name,$string,$check) = @_;
    my $encoder = find_encoding($name);
    defined $encoder or croak("Unknown encoding '$name'");

    my $octets = $encoder->encode($string,$check);
    if ($check)
    {
	# Anything still left in $string was not consumed by the encoder.
	return undef if length($string);
    }
    return $octets;
}

# Decode octets with the named encoding.
#
#   $string = decode($name, $octets [, $check])
#
# Croaks when $name cannot be resolved by find_encoding().  When $check is
# true, the caller's second argument is overwritten with whatever the
# decoder left in $octets.  Returns the decoder's result.
sub decode
{
    my ($name,$octets,$check) = @_;
    my $decoder = find_encoding($name);
    defined $decoder or croak("Unknown encoding '$name'");

    my $string = $decoder->decode($octets,$check);
    if ($check)
    {
	# Hand the remaining octets back through the aliased argument.
	$_[1] = $octets;
    }
    return $string;
}

# Convert data from one encoding to another, in place.
#
#   $length = from_to($string, $from, $to [, $check])
#
# Croaks when either encoding name cannot be resolved.  The data is decoded
# with $from and the result encoded with $to; the caller's first argument
# is then overwritten with the converted data.  Returns the length of the
# converted data, or undef when it is undefined.  When $check is true and
# either step leaves its input non-empty, undef is returned and the
# caller's variable is left untouched.
sub from_to
{
    my ($string,$from,$to,$check) = @_;
    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    # Step 1: source encoding -> characters.
    my $chars = $source->decode($string,$check);
    return undef if $check and length($string);

    # Step 2: characters -> target encoding.
    my $converted = $target->encode($chars,$check);
    return undef if $check and length($chars);

    # Write the result back through the aliased first argument.
    return defined($_[0] = $converted) ? length($converted) : undef;
}

# Return the UTF-8 encoded octets of the given string.
#
#   $octets = encode_utf8($string)
#
# Works on a copy; the caller's variable is not modified.
sub encode_utf8
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}

# Decode UTF-8 octets into a character string.
#
#   $string = decode_utf8($octets)
#
# Works on a copy; the caller's variable is not modified.  Returns undef
# when utf8::decode() reports failure.
sub decode_utf8
{
    my $chars = $_[0];
    utf8::decode($chars) or return undef;
    return $chars;
}

predefine_encodings();

#
# This is to restore %Encoding if really needed;
#
# Installs the encodings that are implemented directly in this file and
# registers one object for each in %Encode::Encoding:
#   - on EBCDIC platforms ($ON_EBCDIC true): Encode::UTF_EBCDIC, stored
#     under the key "Internal"
#   - otherwise: Encode::Internal, stored under the key "Unicode"
#   - always: Encode::utf8, stored under the key "utf8"
# Each class gets name(), new_sequence(), decode() and encode() methods,
# created at run time by assigning anonymous subs to the package's globs.
# In every decode()/encode() a true third argument makes the method empty
# the caller's second argument after a successful conversion.
# NOTE(review): the two platform branches register under different keys
# ("Internal" vs "Unicode") - confirm that this is intended.
#
sub predefine_encodings{
    if ($ON_EBCDIC) { 
	# was in Encode::UTF_EBCDIC
	package Encode::UTF_EBCDIC;
	*name         = sub{ shift->{'Name'} };
	# The object itself is returned; no new object is created.
	*new_sequence = sub{ return $_[0] };
	# Map every character through utf8::unicode_to_native().
	*decode = sub{
	    my ($obj,$str,$chk) = @_;
	    my $res = '';
	    for (my $i = 0; $i < length($str); $i++) {
		$res .= 
		    chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
	    }
	    # $_[1] aliases the caller's variable: mark the input as consumed.
	    $_[1] = '' if $chk;
	    return $res;
	};
	# The reverse mapping, through utf8::native_to_unicode().
	*encode = sub{
	    my ($obj,$str,$chk) = @_;
	    my $res = '';
	    for (my $i = 0; $i < length($str); $i++) {
		$res .= 
		    chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
	    }
	    $_[1] = '' if $chk;
	    return $res;
	};
	$Encode::Encoding{Internal} = 
	    bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {  
	# NOTE(review): the original comment here read "was in
	# Encode::UTF_EBCDIC", apparently copied from the branch above;
	# presumably this code was in Encode::Internal.
	package Encode::Internal;
	*name         = sub{ shift->{'Name'} };
	*new_sequence = sub{ return $_[0] };
	# Returns a copy of the input that has been through utf8::upgrade().
	*decode = sub{
	    my ($obj,$str,$chk) = @_;
	    utf8::upgrade($str);
	    $_[1] = '' if $chk;
	    return $str;
	};
	# Encoding uses the very same sub as decoding.
	*encode = \&decode;
	$Encode::Encoding{Unicode} = 
	    bless {Name => "Internal"} => "Encode::Internal";
    }

    {
	# was in Encode::utf8
	package Encode::utf8;
	*name         = sub{ shift->{'Name'} };
	*new_sequence = sub{ return $_[0] };
	# Returns undef, leaving the caller's octets untouched, when
	# Encode::decode_utf8() fails.
	*decode = sub{
	    my ($obj,$octets,$chk) = @_;
	    my $str = Encode::decode_utf8($octets);
	    if (defined $str) {
		$_[1] = '' if $chk;
		return $str;
	    }
	    return undef;
	};
	*encode = sub {
	    my ($obj,$string,$chk) = @_;
	    my $octets = Encode::encode_utf8($string);
	    $_[1] = '' if $chk;
	    return $octets;
	};
	$Encode::Encoding{utf8} = 
	    bless {Name => "utf8"} => "Encode::utf8";
    }
    # do externals if necessary 
    require File::Basename;
    require File::Spec;
    # The list is currently empty, so this loop does nothing.  A name added
    # to it would make Encode/<name>.pm, located next to the loaded
    # Encode.pm, be executed with do().
    for my $ext (qw()){
	my $pm =
	    File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
				"Encode", "$ext.pm");
	do $pm;
    }
}

require Encode::Encoding;
require Encode::XS;

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;


=head2 Table of Contents

Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details, 
see the PODs below;

  Name			        Description
  --------------------------------------------------------
  Encode::Alias         Alias definitions to encodings
  Encode::Encoding      Encode Implementation Base Class
  Encode::Supported     List of Supported Encodings
  Encode::CN            Simplified Chinese Encodings
  Encode::JP            Japanese Encodings
  Encode::KR            Korean Encodings
  Encode::TW            Traditional Chinese Encodings
  --------------------------------------------------------

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of
B<characters>.

The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
values of the characters (as returned by C<ord(ch)>) is the "Unicode
codepoint" for the character (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).

Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.

When Perl is processing "binary data" the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
byte has 256 possible values it easily fits in Perl's much larger
"logical character".

=head2 TERMINOLOGY

=over 4

=item *

I<character>: a character in the range 0..(2**32-1) (or more).
(What Perl's strings are made of.)

=item *

I<byte>: a character in the range 0..255
(A special case of a Perl character.)

=item *

I<octet>: 8 bits of data, with ordinal values 0..255
(Term for bytes passed to or from a non-Perl context, e.g. disk file.)

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head1 PERL ENCODING API

=over 4

=item $octets  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  ENCODING can be either a canonical name or
alias.  For encoding names and aliases, see L</"Defining Aliases">.
For CHECK see L</"Handling Malformed Data">.

For example to convert (internally UTF-8 encoded) Unicode string to
iso-8859-1 (also known as Latin1), 

  $octets = encode("iso-8859-1", $unicode);

=item $string = decode(ENCODING, $octets[, CHECK])

Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

  $utf8 = decode("iso-8859-1", $latin1);

=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])

Convert B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.
For encoding names and aliases, see L</"Defining Aliases">. 
For CHECK see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

from_to() returns the length of the converted string on success, undef
otherwise.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item $octets = encode_utf8($string);

The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item $string = decode_utf8($octets [, CHECK]);

The sequence of octets represented by $octets is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of a specific module.

  @with_jp = Encode->encodings("Encode::JP");

When "::" is not in the name, "Encode::" is assumed.

  @ebcdic = Encode->encodings("EBCDIC");

To find which encodings are supported by this package in detail,
see L<Encode::Supported>.

=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(newName => ENCODING);

After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.

See L<Encode::Alias> for details.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open, its layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)")
    open(my $g, ">:encoding(iso-8859-2)")
    open(my $h, ">:encoding(latin9)")       # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Handling Malformed Data

If I<CHECK> is not set, (en|de)code will put I<substitution character> in
place of the malformed character.  For UCM-based encodings,
E<lt>subcharE<gt> will be used.  For Unicode, \xFFFD is used.  If the
data is supposed to be UTF-8, an optional lexical warning (category
utf8) is given. 

If I<CHECK> is true but not a code reference, dies with an error message.

In future you will be able to use a code reference to a callback
function for the value of I<CHECK> but its API is still undecided.

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, 'canonicalName' [, alias...]);

I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

See L<Encode::Encoding> for more details.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<PerlIO>, 
L<encoding>,
L<perlebcdic>, 
L<perlfunc/open>, 
L<perlunicode>, 
L<utf8>, 
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
fdd579e2	3	our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c	4	our $DEBUG = 0;
2c674647	5
	6	require DynaLoader;
	7	require Exporter;
	8
51ef4e11	9	our @ISA = qw(Exporter DynaLoader);
2c674647	10
4411f3b6	11	# Public, encouraged API is exported by default
51ef4e11	12	our @EXPORT = qw (
4411f3b6	13	encode
	14	decode
	15	encode_utf8
	16	decode_utf8
	17	find_encoding
51ef4e11	18	encodings
4411f3b6	19	);
4411f3b6	20
51ef4e11	21	our @EXPORT_OK =
2c674647	22	qw(
51ef4e11	23	define_encoding
2c674647	24	from_to
2c674647	25	is_utf8
4411f3b6	26	is_8bit
4411f3b6	27	is_16bit
a12c0f56	28	utf8_upgrade
a12c0f56	29	utf8_downgrade
4411f3b6	30	_utf8_on
4411f3b6	31	_utf8_off
2c674647	32	);
	33
	34	bootstrap Encode ();
	35
4411f3b6	36	# Documentation moved after __END__ for speed - NI-S
2c674647	37
bf230f3d	38	use Carp;
bf230f3d	39
a63c962f	40	our $ON_EBCDIC = (ord("A") == 193);
f2a2953c	41
5d030b67	42	use Encode::Alias;
5d030b67	43
5129552c	44	# Make a %Encoding package variable to allow a certain amount of cheating
5129552c	45	our %Encoding;
fdd579e2	46	use Encode::Config;
5129552c	47
656753f8	48	sub encodings
656753f8	49	{
5129552c	50	my $class = shift;
071db25d	51	my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
c731e18e	52	for my $mod (@modules){
	53	$mod =~ s,::,/,g or $mod = "Encode/$mod";
	54	$mod .= '.pm';
	55	$DEBUG and warn "about to require $mod;";
	56	eval { require $mod; };
5129552c	57	}
c731e18e	58	my %modules = map {$_ => 1} @modules;
5129552c	59	return
c731e18e	60	sort grep {!/^(?:Internal\|Unicode)$/o} keys %Encoding;
51ef4e11	61	}
51ef4e11	62
51ef4e11	63	sub define_encoding
51ef4e11	64	{
18586f54	65	my $obj = shift;
18586f54	66	my $name = shift;
5129552c	67	$Encoding{$name} = $obj;
18586f54	68	my $lc = lc($name);
	69	define_alias($lc => $obj) unless $lc eq $name;
	70	while (@_)
	71	{
	72	my $alias = shift;
	73	define_alias($alias,$obj);
	74	}
	75	return $obj;
656753f8	76	}
656753f8	77
656753f8	78	sub getEncoding
656753f8	79	{
dd9703c9	80	my ($class,$name,$skip_external) = @_;
18586f54	81	my $enc;
	82	if (ref($name) && $name->can('new_sequence'))
	83	{
	84	return $name;
	85	}
	86	my $lc = lc $name;
5129552c	87	if (exists $Encoding{$name})
18586f54	88	{
5129552c	89	return $Encoding{$name};
18586f54	90	}
5129552c	91	if (exists $Encoding{$lc})
18586f54	92	{
5129552c	93	return $Encoding{$lc};
18586f54	94	}
c50d192e	95
5129552c	96	my $oc = $class->find_alias($name);
c50d192e	97	return $oc if defined $oc;
c50d192e	98
5129552c	99	$oc = $class->find_alias($lc) if $lc ne $name;
c50d192e	100	return $oc if defined $oc;
c50d192e	101
c731e18e	102	unless ($skip_external)
d1ed7747	103	{
c731e18e	104	if (my $mod = $ExtModule{$name} \|\| $ExtModule{$lc}){
	105	$mod =~ s,::,/,g ; $mod .= '.pm';
	106	eval{ require $mod; };
	107	return $Encoding{$name} if exists $Encoding{$name};
	108	}
d1ed7747	109	}
18586f54	110	return;
656753f8	111	}
656753f8	112
4411f3b6	113	sub find_encoding
4411f3b6	114	{
dd9703c9	115	my ($name,$skip_external) = @_;
dd9703c9	116	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	117	}
	118
	119	sub encode
	120	{
18586f54	121	my ($name,$string,$check) = @_;
	122	my $enc = find_encoding($name);
	123	croak("Unknown encoding '$name'") unless defined $enc;
	124	my $octets = $enc->encode($string,$check);
	125	return undef if ($check && length($string));
	126	return $octets;
4411f3b6	127	}
	128
	129	sub decode
	130	{
18586f54	131	my ($name,$octets,$check) = @_;
	132	my $enc = find_encoding($name);
	133	croak("Unknown encoding '$name'") unless defined $enc;
	134	my $string = $enc->decode($octets,$check);
	135	$_[1] = $octets if $check;
	136	return $string;
4411f3b6	137	}
	138
	139	sub from_to
	140	{
18586f54	141	my ($string,$from,$to,$check) = @_;
	142	my $f = find_encoding($from);
	143	croak("Unknown encoding '$from'") unless defined $f;
	144	my $t = find_encoding($to);
	145	croak("Unknown encoding '$to'") unless defined $t;
	146	my $uni = $f->decode($string,$check);
	147	return undef if ($check && length($string));
a999c27c	148	$string = $t->encode($uni,$check);
18586f54	149	return undef if ($check && length($uni));
3ef515df	150	return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6	151	}
	152
	153	sub encode_utf8
	154	{
18586f54	155	my ($str) = @_;
c731e18e	156	utf8::encode($str);
18586f54	157	return $str;
4411f3b6	158	}
	159
	160	sub decode_utf8
	161	{
18586f54	162	my ($str) = @_;
	163	return undef unless utf8::decode($str);
	164	return $str;
5ad8ef52	165	}
5ad8ef52	166
f2a2953c	167	predefine_encodings();
	168
	169	#
	170	# This is to restore %Encoding if really needed;
	171	#
	172	sub predefine_encodings{
	173	if ($ON_EBCDIC) {
	174	# was in Encode::UTF_EBCDIC
	175	package Encode::UTF_EBCDIC;
	176	*name = sub{ shift->{'Name'} };
	177	*new_sequence = sub{ return $_[0] };
	178	*decode = sub{
	179	my ($obj,$str,$chk) = @_;
	180	my $res = '';
	181	for (my $i = 0; $i < length($str); $i++) {
	182	$res .=
	183	chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
	184	}
	185	$_[1] = '' if $chk;
	186	return $res;
	187	};
	188	*encode = sub{
	189	my ($obj,$str,$chk) = @_;
	190	my $res = '';
	191	for (my $i = 0; $i < length($str); $i++) {
	192	$res .=
	193	chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
	194	}
	195	$_[1] = '' if $chk;
	196	return $res;
	197	};
c731e18e	198	$Encode::Encoding{Internal} =
c731e18e	199	bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c	200	} else {
	201	# was in Encode::UTF_EBCDIC
	202	package Encode::Internal;
	203	*name = sub{ shift->{'Name'} };
	204	*new_sequence = sub{ return $_[0] };
	205	*decode = sub{
	206	my ($obj,$str,$chk) = @_;
	207	utf8::upgrade($str);
	208	$_[1] = '' if $chk;
	209	return $str;
	210	};
	211	*encode = \&decode;
	212	$Encode::Encoding{Unicode} =
c731e18e	213	bless {Name => "Internal"} => "Encode::Internal";
f2a2953c	214	}
	215
	216	{
	217	# was in Encode::utf8
	218	package Encode::utf8;
	219	*name = sub{ shift->{'Name'} };
	220	*new_sequence = sub{ return $_[0] };
	221	*decode = sub{
	222	my ($obj,$octets,$chk) = @_;
	223	my $str = Encode::decode_utf8($octets);
	224	if (defined $str) {
	225	$_[1] = '' if $chk;
	226	return $str;
	227	}
	228	return undef;
	229	};
	230	*encode = sub {
	231	my ($obj,$string,$chk) = @_;
	232	my $octets = Encode::encode_utf8($string);
	233	$_[1] = '' if $chk;
	234	return $octets;
	235	};
	236	$Encode::Encoding{utf8} =
c731e18e	237	bless {Name => "utf8"} => "Encode::utf8";
f2a2953c	238	}
	239	# do externals if necessary
	240	require File::Basename;
	241	require File::Spec;
c731e18e	242	for my $ext (qw()){
f2a2953c	243	my $pm =
	244	File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
	245	"Encode", "$ext.pm");
	246	do $pm;
	247	}
	248	}
	249
18586f54	250	require Encode::Encoding;
18586f54	251	require Encode::XS;
4411f3b6	252
656753f8	253	1;
656753f8	254
2a936312	255	__END__
2a936312	256
4411f3b6	257	=head1 NAME
	258
	259	Encode - character encodings
	260
	261	=head1 SYNOPSIS
	262
	263	use Encode;
	264
67d7b5ef	265
	266	=head2 Table of Contents
	267
	268	Encode consists of a collection of modules which details are too big
	269	to fit in one document. This POD itself explains the top-level APIs
	270	and general topics at a glance. For other topics and more details,
	271	see the PODs below;
	272
	273	Name Description
	274	--------------------------------------------------------
	275	Encode::Alias Alias defintions to encodings
	276	Encode::Encoding Encode Implementation Base Class
	277	Encode::Supported List of Supported Encodings
	278	Encode::CN Simplified Chinese Encodings
	279	Encode::JP Japanese Encodings
	280	Encode::KR Korean Encodings
	281	Encode::TW Traditional Chinese Encodings
	282	--------------------------------------------------------
	283
4411f3b6	284	=head1 DESCRIPTION
4411f3b6	285
47bfe92f	286	The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef	287	and the rest of the system. Perl strings are sequences of
	288	B<characters>.
	289
	290	The repertoire of characters that Perl can represent is at least that
	291	defined by the Unicode Consortium. On most platforms the ordinal
	292	values of the characters (as returned by C<ord(ch)>) is the "Unicode
	293	codepoint" for the character (the exceptions are those platforms where
	294	the legacy encoding is some variant of EBCDIC rather than a super-set
	295	of ASCII - see L<perlebcdic>).
	296
	297	Traditionally computer data has been moved around in 8-bit chunks
	298	often called "bytes". These chunks are also known as "octets" in
	299	networking standards. Perl is widely used to manipulate data of many
	300	types - not only strings of characters representing human or computer
	301	languages but also "binary" data being the machines representation of
	302	numbers, pixels in an image - or just about anything.
	303
	304	When Perl is processing "binary data" the programmer wants Perl to
	305	process "sequences of bytes". This is not a problem for Perl - as a
	306	byte has 256 possible values it easily fits in Perl's much larger
	307	"logical character".
	308
	309	=head2 TERMINOLOGY
4411f3b6	310
67d7b5ef	311	=over 4
21938dfa	312
67d7b5ef	313	=item *
	314
	315	I<character>: a character in the range 0..(2**32-1) (or more).
	316	(What Perl's strings are made of.)
	317
	318	=item *
	319
	320	I<byte>: a character in the range 0..255
	321	(A special case of a Perl character.)
	322
	323	=item *
	324
	325	I<octet>: 8 bits of data, with ordinal values 0..255
	326	(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
	327
	328	=back
4411f3b6	329
67d7b5ef	330	The marker [INTERNAL] marks Internal Implementation Details, in
	331	general meant only for those who think they know what they are doing,
	332	and such details may change in future releases.
	333
	334	=head1 PERL ENCODING API
4411f3b6	335
	336	=over 4
	337
f2a2953c	338	=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6	339
47bfe92f	340	Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef	341	a sequence of octets. ENCODING can be either a canonical name or
	342	alias. For encoding names and aliases, see L</"Defining Aliases">.
	343	For CHECK see L</"Handling Malformed Data">.
4411f3b6	344
67d7b5ef	345	For example to convert (internally UTF-8 encoded) Unicode string to
67d7b5ef	346	iso-8859-1 (also known as Latin1),
681a7c68	347
67d7b5ef	348	$octets = encode("iso-8859-1", $unicode);
681a7c68	349
f2a2953c	350	=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6	351
47bfe92f	352	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef	353	internal form and returns the resulting string. as in encode(),
	354	ENCODING can be either a canonical name or alias. For encoding names
	355	and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f	356	L</"Handling Malformed Data">.
47bfe92f	357
1b2c56c8	358	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	359
67d7b5ef	360	$utf8 = decode("iso-8859-1", $latin1);
681a7c68	361
f2a2953c	362	=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f	363
2b106fbe	364	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	365	in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef	366	encode() or through PerlIO: See L</"Encoding and IO">.
	367	For encoding names and aliases, see L</"Defining Aliases">.
	368	For CHECK see L</"Handling Malformed Data">.
2b106fbe	369
1b2c56c8	370	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	371
	372	from_to($data, "iso-8859-1", "utf-8");
	373
	374	and to convert it back:
	375
	376	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	377
ab97ca19	378	Note that because the conversion happens in place, the data to be
	379	converted cannot be a string constant, it must be a scalar variable.
	380
3ef515df	381	from_to() return the length of the converted string on success, undef
	382	otherwise.
	383
4411f3b6	384	=back
4411f3b6	385
f2a2953c	386	=head2 UTF-8 / utf8
	387
	388	The Unicode consortium defines the UTF-8 standard as a way of encoding
	389	the entire Unicode repertoire as sequences of octets. This encoding is
	390	expected to become very widespread. Perl can use this form internally
	391	to represent strings, so conversions to and from this form are
	392	particularly efficient (as octets in memory do not have to change,
	393	just the meta-data that tells Perl how to treat them).
	394
	395	=over 4
	396
	397	=item $octets = encode_utf8($string);
	398
	399	The characters that comprise string are encoded in Perl's superset of UTF-8
	400	and the resulting octets returned as a sequence of bytes. All possible
	401	characters have a UTF-8 representation so this function cannot fail.
	402
	403	=item $string = decode_utf8($octets [, CHECK]);
	404
	405	The sequence of octets represented by $octets is decoded from UTF-8
	406	into a sequence of logical characters. Not all sequences of octets
	407	form valid UTF-8 encodings, so it is possible for this call to fail.
	408	For CHECK see L</"Handling Malformed Data">.
	409
	410	=back
	411
51ef4e11	412	=head2 Listing available encodings
51ef4e11	413
5129552c	414	use Encode;
	415	@list = Encode->encodings();
	416
	417	Returns a list of the canonical names of the available encodings that
	418	are loaded. To get a list of all available encodings including the
	419	ones that are not loaded yet, say
	420
	421	@all_encodings = Encode->encodings(":all");
	422
	423	Or you can give the name of specific module.
	424
c731e18e	425	@with_jp = Encode->encodings("Encode::JP");
	426
	427	When "::" is not in the name, "Encode::" is assumed.
51ef4e11	428
c731e18e	429	@ebcdic = Encode->encodings("EBCDIC");
5d030b67	430
a63c962f	431	To find which encodings are supported by this package in details,
5d030b67	432	see L<Encode::Supported>.
51ef4e11	433
	434	=head2 Defining Aliases
	435
67d7b5ef	436	To add new alias to a given encoding, Use;
67d7b5ef	437
5129552c	438	use Encode;
5129552c	439	use Encode::Alias;
a63c962f	440	define_alias(newName => ENCODING);
51ef4e11	441
3ef515df	442	After that, newName can be used as an alias for ENCODING.
f2a2953c	443	ENCODING may be either the name of an encoding or an
f2a2953c	444	I<encoding object>
51ef4e11	445
5d030b67	446	See L<Encode::Alias> on details.
51ef4e11	447
4411f3b6	448	=head1 Encoding and IO
	449
	450	It is very common to want to do encoding transformations when
	451	reading or writing files, network connections, pipes etc.
47bfe92f	452	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	453	C<Encode> provides a "layer" (See L<perliol>) which can transform
	454	data as it is read or written.
	455
8e86646e	456	Here is how the blind poet would modernise the encoding:
8e86646e	457
42234700	458	use Encode;
8e86646e	459	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	460	open(my $utf8,'>:utf8','iliad.utf8');
	461	my @epic = <$iliad>;
	462	print $utf8 @epic;
	463	close($utf8);
	464	close($illiad);
4411f3b6	465
	466	In addition the new IO system can also be configured to read/write
	467	UTF-8 encoded characters (as noted above this is efficient):
	468
e9692b5b	469	open(my $fh,'>:utf8','anything');
e9692b5b	470	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	471
	472	Either of the above forms of "layer" specifications can be made the default
	473	for a lexical scope with the C<use open ...> pragma. See L<open>.
	474
	475	Once a handle is open is layers can be altered using C<binmode>.
	476
47bfe92f	477	Without any such configuration, or if Perl itself is built using
4411f3b6	478	the system's own IO, then write operations assume that the file handle accepts
	479	only I<bytes> and will C<die> if a character larger than 255 is
	480	written to the handle. When reading, each octet from the handle
	481	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	482	behaviour as bytes-only languages (including Perl before v5.6) would
	483	have, and is sufficient to handle native 8-bit encodings
	484	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	485	other encodings and binary data.
	486
	487	In other cases it is the program's responsibility to transform
	488	characters into bytes using the API above before doing writes, and to
	489	transform the bytes read from a handle into characters before doing
	490	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	491
47bfe92f	492	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	493	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	494	(Latin 1) and UTF-8 (or UTF-EBCDIC on EBCDIC machines):
47bfe92f	495
e9692b5b	496	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	497	open(G, ">:utf8", "data.utf") or die $!;
	498	while (<F>) { print G }
	499
	500	# Could also do "print G <F>" but that would pull
	501	# the whole file into memory just to write it out again.
	502
	503	More examples:
47bfe92f	504
e9692b5b	505	open(my $f, "<:encoding(cp1252)")
	506	open(my $g, ">:encoding(iso-8859-2)")
	507	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	508
47bfe92f	509	See L<PerlIO> for more information.
4411f3b6	510
1768d7eb	511	See also L<encoding> for how to change the default encoding of the
d521382b	512	data in your script.
1768d7eb	513
67d7b5ef	514	=head1 Handling Malformed Data
67d7b5ef	515
f2a2953c	516	If I<CHECK> is not set, (en|de)code will put a I<substitution character> in
	517	place of the malformed character. For UCM-based encodings,
	518	E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
	519	data is supposed to be UTF-8, an optional lexical warning (category
	520	utf8) is given.
67d7b5ef	521
f2a2953c	522	If I<CHECK> is true but not a code reference, (en|de)code dies with an error message.
67d7b5ef	523
f2a2953c	524	In future you will be able to use a code reference to a callback
f2a2953c	525	function for the value of I<CHECK> but its API is still undecided.
67d7b5ef	526
	527	=head1 Defining Encodings
	528
	529	To define a new encoding, use:
	530
	531	use Encode qw(define_encoding);
	532	define_encoding($object, 'canonicalName' [, alias...]);
	533
	534	I<canonicalName> will be associated with I<$object>. The object
	535	should provide the interface described in L<Encode::Encoding>.
	536	If more than two arguments are provided then additional
	537	arguments are taken as aliases for I<$object> as for C<define_alias>.
	538
f2a2953c	539	See L<Encode::Encoding> for more details.
f2a2953c	540
4411f3b6	541	=head1 Messing with Perl's Internals
4411f3b6	542
47bfe92f	543	The following API uses parts of Perl's internals in the current
47bfe92f	544	implementation. As such they are efficient, but may change.
4411f3b6	545
	546	=over 4
	547
a63c962f	548	=item is_utf8(STRING [, CHECK])
4411f3b6	549
4411f3b6	550	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	551	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	552	UTF-8. Returns true if successful, false otherwise.
4411f3b6	553
a63c962f	554	=item _utf8_on(STRING)
4411f3b6	555
	556	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	557	B<not> checked for being well-formed UTF-8. Do not use unless you
	558	B<know> that the STRING is well-formed UTF-8. Returns the previous
	559	state of the UTF-8 flag (so please don't treat the return value as
	560	indicating success or failure), or C<undef> if STRING is not a string.
	561
a63c962f	562	=item _utf8_off(STRING)
4411f3b6	563
	564	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	565	Returns the previous state of the UTF-8 flag (so please don't treat the
	566	return value as indicating success or failure), or C<undef> if STRING is
	567	not a string.
	568
	569	=back
	570
	571	=head1 SEE ALSO
	572
5d030b67	573	L<Encode::Encoding>,
	574	L<Encode::Supported>,
	575	L<PerlIO>,
	576	L<encoding>,
	577	L<perlebcdic>,
	578	L<perlfunc/open>,
	579	L<perlunicode>,
	580	L<utf8>,
	581	the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6	582
4411f3b6	583	=cut