[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;
use strict;
# Derive the version from the RCS "Revision" keyword: every run of digits
# is extracted, the first is printed as-is and each following one as two
# zero-padded digits ("Revision: 1.11" yields "1.11").
our $VERSION = do { my @r = (q$Revision: 1.11 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# When true, encodings() warns about each file it is about to require.
our $DEBUG = 0;

require DynaLoader;
require Exporter;

# Exporter supplies import(); DynaLoader supplies the bootstrap() used below.
our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw (
  encode
  decode
  encode_utf8
  decode_utf8
  find_encoding
  encodings
);

# Available on request only.  Several of these names (is_utf8, is_8bit,
# is_16bit, utf8_upgrade, utf8_downgrade, _utf8_on, _utf8_off) have no Perl
# definition in this file - presumably they come from the compiled part
# loaded by bootstrap below; TODO confirm.
our @EXPORT_OK =
    qw(
       define_encoding
       from_to
       is_utf8
       is_8bit
       is_16bit
       utf8_upgrade
       utf8_downgrade
       _utf8_on
       _utf8_off
      );

# Load the compiled half of the module via DynaLoader.
bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# True when the native character set is EBCDIC, where "A" has ordinal 193
# (it is 65 on ASCII-based platforms).
our $ON_EBCDIC = (ord("A") == 193);
use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
# Registry of known encodings: name => encoding object.  It is filled by
# define_encoding() and consulted by getEncoding() and encodings().
our %Encoding;

# Autoload table: encoding name => file to require() in order to make that
# encoding available.  It is consulted by getEncoding() when a name is not
# (yet) registered, and by encodings(":all").
our %ExtModule = ();
{
    # Flat list of (file => [names served by that file]) pairs.
    my @autoload = (
        'Encode/Byte.pm'   => [
            'viscii', 'koi8-r',
            (map { "iso-8859-$_" } 2..11, 13..16),
            (map { "cp$_" } 1250..1258),
            (map { "mac$_" } qw(
                CentralEurRoman Croatian  Cyrillic  Greek
                Iceland         Roman     Rumanian  Sami
                Thai            Turkish   Ukrainian
            )),
        ],
        'Encode/EBCDIC.pm' => [ qw(cp1047 cp37 posix-bc) ],
        'Encode/Symbol.pm' => [ qw(symbol dingbats) ],
    );

    # CJK added to autoload unless EBCDIC env
    unless ($ON_EBCDIC) {
        push @autoload, (
            'Encode/CN.pm'       => [ qw(euc-cn gb2312 gb12345 gbk cp936
                                         iso-ir-165) ],
            'Encode/JP.pm'       => [ qw(euc-jp iso-2022-jp iso-2022-jp-1
                                         7bit-jis shiftjis macjapan cp932) ],
            'Encode/KR.pm'       => [ qw(euc-kr ksc5601 cp949) ],
            'Encode/TW.pm'       => [ qw(big5 big5-hkscs cp950) ],
            'Encode/HanExtra.pm' => [ qw(gb18030 big5plus euc-tw) ],
        );
    }

    # Expand every (file, names) pair into one table entry per name.
    while (my ($file, $names) = splice(@autoload, 0, 2)) {
        $ExtModule{$_} = $file for @$names;
    }
}

# Class method: Encode->encodings([":all" | LIST of module files])
#
# First tries to load the requested files - every file named in %ExtModule
# when the first argument is ":all", otherwise the arguments themselves
# (e.g. "Encode/JP.pm").  A file that fails to load is silently ignored.
# Returns the names currently registered in %Encoding, except 'Internal',
# sorted case-insensitively.
sub encodings
{
    my ($class, @files) = @_;
    if (@files and $files[0] eq ":all")
    {
        @files = values %ExtModule;
    }
    foreach my $file (@files)
    {
        warn "about to require $file;" if $DEBUG;
        eval { require $file; };
    }
    my @names  = grep { $_ ne 'Internal' } keys %Encoding;
    my @sorted = sort { lc($a) cmp lc($b) } @names;
    return @sorted;
}

# define_encoding($obj, $name [, @aliases])
#
# Registers $obj in %Encoding under $name.  If $name is not already all
# lower case, its lower-cased form is registered as an alias; every further
# argument is registered as an alias as well.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ($lower ne $name)
    {
        define_alias($lower => $obj);
    }

    foreach my $alias (@aliases)
    {
        define_alias($alias, $obj);
    }

    return $obj;
}

# Class method: $class->getEncoding($name [, $skip_external])
#
# Resolves $name to an encoding object.
#
#   $name           an encoding name or alias, or an object that can
#                   'new_sequence' (returned unchanged)
#   $skip_external  when true, never load a file listed in %ExtModule
#
# Lookup order:
#   1. %Encoding by exact name, then by lower-cased name
#   2. $class->find_alias by exact name, then by lower-cased name
#   3. unless $skip_external: require the file that %ExtModule lists for
#      the name and repeat steps 1 and 2
#
# Returns the encoding object, or nothing (undef in scalar context, the
# empty list in list context) when the name cannot be resolved.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    # Nothing can be resolved from an undefined name.
    return unless defined $name;

    # Already an encoding object: hand it straight back.
    return $name if ref($name) && $name->can('new_sequence');

    my $lc = lc $name;

    return $Encoding{$name} if exists $Encoding{$name};
    return $Encoding{$lc}   if exists $Encoding{$lc};

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    return if $skip_external;

    # %ExtModule holds mixed-case keys too (the "mac..." entries), so try
    # the name as given before falling back to its lower-cased form.
    my $module = exists $ExtModule{$name} ? $ExtModule{$name}
               : exists $ExtModule{$lc}   ? $ExtModule{$lc}
               :                            undef;
    return unless defined $module;

    # Best effort: a file that is missing or fails to load simply leaves
    # the name unresolved.
    eval { require $module; };

    # The file may have registered the encoding under a different case
    # than the one requested, so look under both spellings ...
    for my $key ($name, $lc)
    {
        return $Encoding{$key} if exists $Encoding{$key};
    }

    # ... and through the aliases again: define_encoding() registers the
    # lower-cased form of a mixed-case canonical name as an alias.
    # NOTE(review): assumes find_alias does not remember an earlier miss -
    # confirm against Encode::Alias.
    for my $key ($name, ($lc ne $name ? $lc : ()))
    {
        $oc = $class->find_alias($key);
        return $oc if defined $oc;
    }

    return;
}

# find_encoding($name [, $skip_external])
#
# Function-style front end to the getEncoding() class method of this
# package; both arguments are passed through unchanged and its result is
# returned as-is.
sub find_encoding
{
    my $name          = $_[0];
    my $skip_external = $_[1];
    my $class         = __PACKAGE__;
    return $class->getEncoding($name, $skip_external);
}

# $octets = encode($name, $string [, $check])
#
# Looks up the encoding called $name and asks it to encode $string.
# Croaks with "Unknown encoding" if the name cannot be resolved.
# With a true $check, returns undef when the local copy of $string still
# has length after the call (presumably the encoder leaves unconverted
# input there - TODO confirm against the encoding classes).
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }
    return $octets;
}

# $string = decode($name, $octets [, $check])
#
# Looks up the encoding called $name and asks it to decode $octets.
# Croaks with "Unknown encoding" if the name cannot be resolved.
# With a true $check, the caller's second argument is overwritten with
# whatever the decoder left in the local copy of $octets, so it must be a
# modifiable scalar in that case.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check)
    {
        # Write the decoder's view of the input back to the caller.
        $_[1] = $octets;
    }
    return $string;
}

# [$length =] from_to($string, $from, $to [, $check])
#
# Converts the caller's $string in place: it is decoded with the encoding
# named $from and the result is encoded with the encoding named $to.
# Croaks with "Unknown encoding" if either name cannot be resolved.
# With a true $check, returns undef - leaving the caller's string
# untouched - when either step leaves data behind in its input.
# Otherwise stores the result in the caller's variable and returns its
# length, or undef if the result is undefined.
sub from_to
{
    my ($data, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $chars = $source->decode($data, $check);
    if ($check && length($data))
    {
        return undef;
    }

    my $octets = $target->encode($chars, $check);
    if ($check && length($chars))
    {
        return undef;
    }

    # In-place conversion: overwrite the caller's first argument.
    $_[0] = $octets;
    return defined($_[0]) ? length($octets) : undef;
}

# $octets = encode_utf8($string)
#
# Returns a copy of $string converted by utf8::encode, i.e. with its
# characters replaced by the octets of their UTF-8 encoding.  The caller's
# variable is not modified.
sub encode_utf8
{
    my $octets = $_[0];        # work on a copy, utf8::encode is in-place
    utf8::encode($octets);
    return $octets;
}

# $string = decode_utf8($octets)
#
# Returns a copy of $octets converted by utf8::decode, i.e. with UTF-8
# octet sequences replaced by the characters they encode.  Returns undef
# when utf8::decode reports failure.  The caller's variable is not
# modified.
sub decode_utf8
{
    my $chars = $_[0];         # work on a copy, utf8::decode is in-place
    if (utf8::decode($chars))
    {
        return $chars;
    }
    return undef;
}

require Encode::Encoding;
require Encode::XS;
require Encode::Internal;
require Encode::Unicode;
require Encode::utf8;
require Encode::10646_1;
require Encode::ucs2_le;

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;


=head2 Table of Contents

Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:

  Name			        Description
  --------------------------------------------------------
  Encode::Alias         Alias definitions to encodings
  Encode::Encoding      Encode Implementation Base Class
  Encode::Supported     List of Supported Encodings
  Encode::CN            Simplified Chinese Encodings
  Encode::JP            Japanese Encodings
  Encode::KR            Korean Encodings
  Encode::TW            Traditional Chinese Encodings
  --------------------------------------------------------

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of
B<characters>.

The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
values of the characters (as returned by C<ord(ch)>) is the "Unicode
codepoint" for the character (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).

Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.

When Perl is processing "binary data" the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
byte has 256 possible values it easily fits in Perl's much larger
"logical character".

=head2 TERMINOLOGY

=over 4

=item *

I<character>: a character in the range 0..(2**32-1) (or more).
(What Perl's strings are made of.)

=item *

I<byte>: a character in the range 0..255
(A special case of a Perl character.)

=item *

I<octet>: 8 bits of data, with ordinal values 0..255
(Term for bytes passed to or from a non-Perl context, e.g. disk file.)

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head1 PERL ENCODING API

=over 4

=item $bytes  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  ENCODING can be either a canonical name or
alias.  For encoding names and aliases, see L</"Defining Aliases">.
For CHECK see L</"Handling Malformed Data">.

For example to convert (internally UTF-8 encoded) Unicode string to
iso-8859-1 (also known as Latin1), 

  $octets = encode("iso-8859-1", $unicode);

=item $string = decode(ENCODING, $bytes[, CHECK])

Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

  $utf8 = decode("iso-8859-1", $latin1);

=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])

Convert B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.
For encoding names and aliases, see L</"Defining Aliases">. 
For CHECK see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

from_to() returns the length of the converted string on success, undef
otherwise.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of specific module.

  @with_jp = Encode->encodings("Encode/JP.pm");

Note in this case you have to say C<"Encode/JP.pm"> instead of
C<"Encode::JP">.

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.


=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(newName => ENCODING);

After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an I<encoding
object>.

See L<Encode::Alias> for details.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open, its layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)")
    open(my $g, ">:encoding(iso-8859-2)")
    open(my $h, ">:encoding(latin9)")       # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Handling Malformed Data

If CHECK is not set, C<undef> is returned.  If the data is supposed to
be UTF-8, an optional lexical warning (category utf8) is given.  If
CHECK is true but not a code reference, dies.

It would be desirable to have a way to indicate that the transform should
use the encoding's "replacement character" - no such mechanism is defined yet.

It is also planned to allow I<CHECK> to be a code reference.

This is not yet implemented as there are design issues with what its
arguments should be and how it returns its results.

=over 4

=item Scheme 1

Passed remaining fragment of string being processed.
Modifies it in place to remove bytes/characters it can understand
and returns a string used to represent them.
e.g.

 sub fixup {
   my $ch = substr($_[0],0,1,'');
   return sprintf('\x{%02X}',ord($ch));
 }

This scheme is close to how underlying C code for Encode works, but gives
the fixup routine very little context.

=item Scheme 2

Passed original string, and an index into it of the problem area, and
output string so far.  Appends what it will to output string and
returns new index into original string.  For example:

 sub fixup {
   # my ($s,$i,$d) = @_;
   my $ch = substr($_[0],$_[1],1);
   $_[2] .= sprintf('\x{%02X}',ord($ch));
   return $_[1]+1;
 }

This scheme gives maximal control to the fixup routine but is more
complicated to code, and may need internals of Encode to be tweaked to
keep original string intact.

=item Other Schemes

Hybrids of above.

Multiple return values rather than in-place modifications.

Index into the string could be C<pos($str)> allowing C<s/\G...//>.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item $bytes = encode_utf8($string);

The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item $string = decode_utf8($bytes [, CHECK]);

The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.

=back

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, 'canonicalName' [, alias...]);

I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<PerlIO>, 
L<encoding>,
L<perlebcdic>, 
L<perlfunc/open>, 
L<perlunicode>, 
L<utf8>, 
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
3ef515df	3	our $VERSION = do { my @r = (q$Revision: 1.11 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c	4	our $DEBUG = 0;
2c674647	5
	6	require DynaLoader;
	7	require Exporter;
	8
51ef4e11	9	our @ISA = qw(Exporter DynaLoader);
2c674647	10
4411f3b6	11	# Public, encouraged API is exported by default
51ef4e11	12	our @EXPORT = qw (
4411f3b6	13	encode
	14	decode
	15	encode_utf8
	16	decode_utf8
	17	find_encoding
51ef4e11	18	encodings
4411f3b6	19	);
4411f3b6	20
51ef4e11	21	our @EXPORT_OK =
2c674647	22	qw(
51ef4e11	23	define_encoding
2c674647	24	from_to
2c674647	25	is_utf8
4411f3b6	26	is_8bit
4411f3b6	27	is_16bit
a12c0f56	28	utf8_upgrade
a12c0f56	29	utf8_downgrade
4411f3b6	30	_utf8_on
4411f3b6	31	_utf8_off
2c674647	32	);
	33
	34	bootstrap Encode ();
	35
4411f3b6	36	# Documentation moved after __END__ for speed - NI-S
2c674647	37
bf230f3d	38	use Carp;
bf230f3d	39
a63c962f	40	our $ON_EBCDIC = (ord("A") == 193);
5d030b67	41	use Encode::Alias;
5d030b67	42
5129552c	43	# Make a %Encoding package variable to allow a certain amount of cheating
5129552c	44	our %Encoding;
5345d506	45
5129552c	46	our %ExtModule =
2b217bf7	47	(
5129552c	48	viscii => 'Encode/Byte.pm',
	49	'koi8-r' => 'Encode/Byte.pm',
	50	cp1047 => 'Encode/EBCDIC.pm',
	51	cp37 => 'Encode/EBCDIC.pm',
	52	'posix-bc' => 'Encode/EBCDIC.pm',
	53	symbol => 'Encode/Symbol.pm',
	54	dingbats => 'Encode/Symbol.pm',
2b217bf7	55	);
d1ed7747	56
5129552c	57	for my $k (2..11,13..16){
	58	$ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
	59	}
	60
	61	for my $k (1250..1258){
	62	$ExtModule{"cp$k"} = 'Encode/Byte.pm';
	63	}
	64
a63c962f	65	unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
	66	%ExtModule =(
	67	%ExtModule,
	68	'euc-cn' => 'Encode/CN.pm',
	69	gb2312 => 'Encode/CN.pm',
	70	gb12345 => 'Encode/CN.pm',
	71	gbk => 'Encode/CN.pm',
	72	cp936 => 'Encode/CN.pm',
	73	'iso-ir-165' => 'Encode/CN.pm',
	74	'euc-jp' => 'Encode/JP.pm',
	75	'iso-2022-jp' => 'Encode/JP.pm',
	76	'iso-2022-jp-1' => 'Encode/JP.pm',
	77	'7bit-jis' => 'Encode/JP.pm',
	78	shiftjis => 'Encode/JP.pm',
	79	macjapan => 'Encode/JP.pm',
	80	cp932 => 'Encode/JP.pm',
	81	'euc-kr' => 'Encode/KR.pm',
	82	ksc5601 => 'Encode/KR.pm',
	83	cp949 => 'Encode/KR.pm',
	84	big5 => 'Encode/TW.pm',
	85	'big5-hkscs' => 'Encode/TW.pm',
	86	cp950 => 'Encode/TW.pm',
	87	gb18030 => 'Encode/HanExtra.pm',
	88	big5plus => 'Encode/HanExtra.pm',
	89	'euc-tw' => 'Encode/HanExtra.pm',
	90	);
	91	}
	92
3ef515df	93	for my $k (qw{ CentralEurRoman Croatian Cyrillic Greek
	94	Iceland Roman Rumanian Sami
	95	Thai Turkish Ukrainian
	96	})
5129552c	97	{
	98	$ExtModule{"mac$k"} = 'Encode/Byte.pm';
	99	}
	100
656753f8	101	sub encodings
656753f8	102	{
5129552c	103	my $class = shift;
071db25d	104	my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
5129552c	105	for my $m (@modules)
	106	{
	107	$DEBUG and warn "about to require $m;";
	108	eval { require $m; };
	109	}
	110	return
	111	map({$_->[0]}
	112	sort({$a->[1] cmp $b->[1]}
	113	map({[$_, lc $_]}
	114	grep({ $_ ne 'Internal' } keys %Encoding))));
51ef4e11	115	}
51ef4e11	116
51ef4e11	117	sub define_encoding
51ef4e11	118	{
18586f54	119	my $obj = shift;
18586f54	120	my $name = shift;
5129552c	121	$Encoding{$name} = $obj;
18586f54	122	my $lc = lc($name);
	123	define_alias($lc => $obj) unless $lc eq $name;
	124	while (@_)
	125	{
	126	my $alias = shift;
	127	define_alias($alias,$obj);
	128	}
	129	return $obj;
656753f8	130	}
656753f8	131
656753f8	132	sub getEncoding
656753f8	133	{
dd9703c9	134	my ($class,$name,$skip_external) = @_;
18586f54	135	my $enc;
	136	if (ref($name) && $name->can('new_sequence'))
	137	{
	138	return $name;
	139	}
	140	my $lc = lc $name;
5129552c	141	if (exists $Encoding{$name})
18586f54	142	{
5129552c	143	return $Encoding{$name};
18586f54	144	}
5129552c	145	if (exists $Encoding{$lc})
18586f54	146	{
5129552c	147	return $Encoding{$lc};
18586f54	148	}
c50d192e	149
5129552c	150	my $oc = $class->find_alias($name);
c50d192e	151	return $oc if defined $oc;
c50d192e	152
5129552c	153	$oc = $class->find_alias($lc) if $lc ne $name;
c50d192e	154	return $oc if defined $oc;
c50d192e	155
5129552c	156	if (!$skip_external and exists $ExtModule{$lc})
d1ed7747	157	{
5129552c	158	eval{ require $ExtModule{$lc}; };
5129552c	159	return $Encoding{$name} if exists $Encoding{$name};
d1ed7747	160	}
18586f54	161
18586f54	162	return;
656753f8	163	}
656753f8	164
4411f3b6	165	sub find_encoding
4411f3b6	166	{
dd9703c9	167	my ($name,$skip_external) = @_;
dd9703c9	168	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	169	}
	170
	171	sub encode
	172	{
18586f54	173	my ($name,$string,$check) = @_;
	174	my $enc = find_encoding($name);
	175	croak("Unknown encoding '$name'") unless defined $enc;
	176	my $octets = $enc->encode($string,$check);
	177	return undef if ($check && length($string));
	178	return $octets;
4411f3b6	179	}
	180
	181	sub decode
	182	{
18586f54	183	my ($name,$octets,$check) = @_;
	184	my $enc = find_encoding($name);
	185	croak("Unknown encoding '$name'") unless defined $enc;
	186	my $string = $enc->decode($octets,$check);
	187	$_[1] = $octets if $check;
	188	return $string;
4411f3b6	189	}
	190
	191	sub from_to
	192	{
18586f54	193	my ($string,$from,$to,$check) = @_;
	194	my $f = find_encoding($from);
	195	croak("Unknown encoding '$from'") unless defined $f;
	196	my $t = find_encoding($to);
	197	croak("Unknown encoding '$to'") unless defined $t;
	198	my $uni = $f->decode($string,$check);
	199	return undef if ($check && length($string));
	200	$string = $t->encode($uni,$check);
	201	return undef if ($check && length($uni));
3ef515df	202	return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6	203	}
	204
	205	sub encode_utf8
	206	{
18586f54	207	my ($str) = @_;
	208	utf8::encode($str);
	209	return $str;
4411f3b6	210	}
	211
	212	sub decode_utf8
	213	{
18586f54	214	my ($str) = @_;
	215	return undef unless utf8::decode($str);
	216	return $str;
5ad8ef52	217	}
5ad8ef52	218
18586f54	219	require Encode::Encoding;
	220	require Encode::XS;
	221	require Encode::Internal;
	222	require Encode::Unicode;
	223	require Encode::utf8;
64ffdd5e	224	require Encode::10646_1;
18586f54	225	require Encode::ucs2_le;
4411f3b6	226
656753f8	227	1;
656753f8	228
2a936312	229	__END__
2a936312	230
4411f3b6	231	=head1 NAME
	232
	233	Encode - character encodings
	234
	235	=head1 SYNOPSIS
	236
	237	use Encode;
	238
67d7b5ef	239
	240	=head2 Table of Contents
	241
	242	Encode consists of a collection of modules which details are too big
	243	to fit in one document. This POD itself explains the top-level APIs
	244	and general topics at a glance. For other topics and more details,
	245	see the PODs below;
	246
	247	Name Description
	248	--------------------------------------------------------
	249	Encode::Alias Alias defintions to encodings
	250	Encode::Encoding Encode Implementation Base Class
	251	Encode::Supported List of Supported Encodings
	252	Encode::CN Simplified Chinese Encodings
	253	Encode::JP Japanese Encodings
	254	Encode::KR Korean Encodings
	255	Encode::TW Traditional Chinese Encodings
	256	--------------------------------------------------------
	257
4411f3b6	258	=head1 DESCRIPTION
4411f3b6	259
47bfe92f	260	The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef	261	and the rest of the system. Perl strings are sequences of
	262	B<characters>.
	263
	264	The repertoire of characters that Perl can represent is at least that
	265	defined by the Unicode Consortium. On most platforms the ordinal
	266	values of the characters (as returned by C<ord(ch)>) is the "Unicode
	267	codepoint" for the character (the exceptions are those platforms where
	268	the legacy encoding is some variant of EBCDIC rather than a super-set
	269	of ASCII - see L<perlebcdic>).
	270
	271	Traditionally computer data has been moved around in 8-bit chunks
	272	often called "bytes". These chunks are also known as "octets" in
	273	networking standards. Perl is widely used to manipulate data of many
	274	types - not only strings of characters representing human or computer
	275	languages but also "binary" data being the machines representation of
	276	numbers, pixels in an image - or just about anything.
	277
	278	When Perl is processing "binary data" the programmer wants Perl to
	279	process "sequences of bytes". This is not a problem for Perl - as a
	280	byte has 256 possible values it easily fits in Perl's much larger
	281	"logical character".
	282
	283	=head2 TERMINOLOGY
4411f3b6	284
67d7b5ef	285	=over 4
21938dfa	286
67d7b5ef	287	=item *
	288
	289	I<character>: a character in the range 0..(2**32-1) (or more).
	290	(What Perl's strings are made of.)
	291
	292	=item *
	293
	294	I<byte>: a character in the range 0..255
	295	(A special case of a Perl character.)
	296
	297	=item *
	298
	299	I<octet>: 8 bits of data, with ordinal values 0..255
	300	(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
	301
	302	=back
4411f3b6	303
67d7b5ef	304	The marker [INTERNAL] marks Internal Implementation Details, in
	305	general meant only for those who think they know what they are doing,
	306	and such details may change in future releases.
	307
	308	=head1 PERL ENCODING API
4411f3b6	309
	310	=over 4
	311
a63c962f	312	=item $bytes = encode(ENCODING, $string[, CHECK])
4411f3b6	313
47bfe92f	314	Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef	315	a sequence of octets. ENCODING can be either a canonical name or
	316	alias. For encoding names and aliases, see L</"Defining Aliases">.
	317	For CHECK see L</"Handling Malformed Data">.
4411f3b6	318
67d7b5ef	319	For example to convert (internally UTF-8 encoded) Unicode string to
67d7b5ef	320	iso-8859-1 (also known as Latin1),
681a7c68	321
67d7b5ef	322	$octets = encode("iso-8859-1", $unicode);
681a7c68	323
a63c962f	324	=item $string = decode(ENCODING, $bytes[, CHECK])
4411f3b6	325
47bfe92f	326	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef	327	internal form and returns the resulting string. as in encode(),
	328	ENCODING can be either a canonical name or alias. For encoding names
	329	and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f	330	L</"Handling Malformed Data">.
47bfe92f	331
1b2c56c8	332	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	333
67d7b5ef	334	$utf8 = decode("iso-8859-1", $latin1);
681a7c68	335
3ef515df	336	=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
47bfe92f	337
2b106fbe	338	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	339	in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef	340	encode() or through PerlIO: See L</"Encoding and IO">.
	341	For encoding names and aliases, see L</"Defining Aliases">.
	342	For CHECK see L</"Handling Malformed Data">.
2b106fbe	343
1b2c56c8	344	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	345
	346	from_to($data, "iso-8859-1", "utf-8");
	347
	348	and to convert it back:
	349
	350	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	351
ab97ca19	352	Note that because the conversion happens in place, the data to be
	353	converted cannot be a string constant, it must be a scalar variable.
	354
3ef515df	355	from_to() return the length of the converted string on success, undef
	356	otherwise.
	357
4411f3b6	358	=back
4411f3b6	359
51ef4e11	360	=head2 Listing available encodings
51ef4e11	361
5129552c	362	use Encode;
	363	@list = Encode->encodings();
	364
	365	Returns a list of the canonical names of the available encodings that
	366	are loaded. To get a list of all available encodings including the
	367	ones that are not loaded yet, say
	368
	369	@all_encodings = Encode->encodings(":all");
	370
	371	Or you can give the name of specific module.
	372
	373	@with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11	374
a63c962f	375	Note in this case you have to say C<"Encode/JP.pm"> instead of
a63c962f	376	C<"Encode::JP">.
5d030b67	377
a63c962f	378	To find which encodings are supported by this package in details,
5d030b67	379	see L<Encode::Supported>.
51ef4e11	380
67d7b5ef	381
51ef4e11	382	=head2 Defining Aliases
51ef4e11	383
67d7b5ef	384	To add new alias to a given encoding, Use;
67d7b5ef	385
5129552c	386	use Encode;
5129552c	387	use Encode::Alias;
a63c962f	388	define_alias(newName => ENCODING);
51ef4e11	389
3ef515df	390	After that, newName can be used as an alias for ENCODING.
	391	ENCODING may be either the name of an encoding or an I<encoding
	392	object>
51ef4e11	393
5d030b67	394	See L<Encode::Alias> on details.
51ef4e11	395
4411f3b6	396	=head1 Encoding and IO
	397
	398	It is very common to want to do encoding transformations when
	399	reading or writing files, network connections, pipes etc.
47bfe92f	400	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	401	C<Encode> provides a "layer" (See L<perliol>) which can transform
	402	data as it is read or written.
	403
8e86646e	404	Here is how the blind poet would modernise the encoding:
8e86646e	405
42234700	406	use Encode;
8e86646e	407	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	408	open(my $utf8,'>:utf8','iliad.utf8');
	409	my @epic = <$iliad>;
	410	print $utf8 @epic;
	411	close($utf8);
	412	close($illiad);
4411f3b6	413
	414	In addition the new IO system can also be configured to read/write
	415	UTF-8 encoded characters (as noted above this is efficient):
	416
e9692b5b	417	open(my $fh,'>:utf8','anything');
e9692b5b	418	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	419
	420	Either of the above forms of "layer" specifications can be made the default
	421	for a lexical scope with the C<use open ...> pragma. See L<open>.
	422
	423	Once a handle is open is layers can be altered using C<binmode>.
	424
47bfe92f	425	Without any such configuration, or if Perl itself is built using
4411f3b6	426	system's own IO, then write operations assume that file handle accepts
	427	only I<bytes> and will C<die> if a character larger than 255 is
	428	written to the handle. When reading, each octet from the handle
	429	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	430	behaviour as bytes-only languages (including Perl before v5.6) would
	431	have, and is sufficient to handle native 8-bit encodings
	432	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	433	other encodings and binary data.
	434
	435	In other cases it is the programs responsibility to transform
	436	characters into bytes using the API above before doing writes, and to
	437	transform the bytes read from a handle into characters before doing
	438	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	439
47bfe92f	440	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	441	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	442	(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
47bfe92f	443
e9692b5b	444	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	445	open(G, ">:utf8", "data.utf") or die $!;
	446	while (<F>) { print G }
	447
	448	# Could also do "print G <F>" but that would pull
	449	# the whole file into memory just to write it out again.
	450
	451	More examples:
47bfe92f	452
e9692b5b	453	open(my $f, "<:encoding(cp1252)")
	454	open(my $g, ">:encoding(iso-8859-2)")
	455	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	456
47bfe92f	457	See L<PerlIO> for more information.
4411f3b6	458
1768d7eb	459	See also L<encoding> for how to change the default encoding of the
d521382b	460	data in your script.
1768d7eb	461
67d7b5ef	462	=head1 Handling Malformed Data
	463
	464	If CHECK is not set, C<undef> is returned. If the data is supposed to
	465	be UTF-8, an optional lexical warning (category utf8) is given. If
	466	CHECK is true but not a code reference, dies.
	467
	468	It would be desirable to have a way to indicate that transform should use
	469	the encoding's "replacement character" - no such mechanism is defined yet.
	470
	471	It is also planned to allow I<CHECK> to be a code reference.
	472
	473	This is not yet implemented as there are design issues with what its
	474	arguments should be and how it returns its results.
	475
	476	=over 4
	477
	478	=item Scheme 1
	479
	480	Passed remaining fragment of string being processed.
	481	Modifies it in place to remove bytes/characters it can understand
	482	and returns a string used to represent them.
	483	e.g.
	484
	485	sub fixup {
	486	my $ch = substr($_[0],0,1,'');
	487	return sprintf("\x{%02X}",ord($ch));
	488	}
	489
	490	This scheme is close to how underlying C code for Encode works, but gives
	491	the fixup routine very little context.
	492
	493	=item Scheme 2
	494
	495	Passed original string, and an index into it of the problem area, and
	496	output string so far. Appends what it will to output string and
	497	returns new index into original string. For example:
	498
	499	sub fixup {
	500	# my ($s,$i,$d) = @_;
	501	my $ch = substr($_[0],$_[1],1);
	502	$_[2] .= sprintf("\x{%02X}",ord($ch));
	503	return $_[1]+1;
	504	}
	505
	506	This scheme gives maximal control to the fixup routine but is more
	507	complicated to code, and may need internals of Encode to be tweaked to
	508	keep original string intact.
	509
	510	=item Other Schemes
	511
	512	Hybrids of above.
	513
	514	Multiple return values rather than in-place modifications.
	515
	516	Index into the string could be C<pos($str)> allowing C<s/\G...//>.
	517
	518	=back
	519
	520	=head2 UTF-8 / utf8
	521
	522	The Unicode consortium defines the UTF-8 standard as a way of encoding
	523	the entire Unicode repertoire as sequences of octets. This encoding is
	524	expected to become very widespread. Perl can use this form internally
	525	to represent strings, so conversions to and from this form are
526	particularly efficient (as octets in memory do not have to change,
527	just the meta-data that tells Perl how to treat them).
528
529	=over 4
530
531	=item $bytes = encode_utf8($string);
532
533	The characters that comprise $string are encoded in Perl's superset of UTF-8
534	and the resulting octets returned as a sequence of bytes. All possible
535	characters have a UTF-8 representation so this function cannot fail.
536
537	=item $string = decode_utf8($bytes [, CHECK]);
538
539	The sequence of octets represented by $bytes is decoded from UTF-8
540	into a sequence of logical characters. Not all sequences of octets
541	form valid UTF-8 encodings, so it is possible for this call to fail.
542	For CHECK see L</"Handling Malformed Data">.
543
544	=back
545
546	=head1 Defining Encodings
547
548	To define a new encoding, use:
549
550	use Encode qw(define_encoding);
551	define_encoding($object, 'canonicalName' [, alias...]);
552
553	I<canonicalName> will be associated with I<$object>. The object
554	should provide the interface described in L<Encode::Encoding>.
555	If more than two arguments are provided then additional
556	arguments are taken as aliases for I<$object> as for C<define_alias>.
557
4411f3b6	558	=head1 Messing with Perl's Internals
4411f3b6	559
47bfe92f	560	The following API uses parts of Perl's internals in the current
47bfe92f	561	implementation. As such they are efficient, but may change.
4411f3b6	562
	563	=over 4
	564
a63c962f	565	=item is_utf8(STRING [, CHECK])
4411f3b6	566
4411f3b6	567	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	568	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	569	UTF-8. Returns true if successful, false otherwise.
4411f3b6	570
a63c962f	571	=item _utf8_on(STRING)
4411f3b6	572
	573	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	574	B<not> checked for being well-formed UTF-8. Do not use unless you
	575	B<know> that the STRING is well-formed UTF-8. Returns the previous
	576	state of the UTF-8 flag (so please don't treat the return value as
	577	indicating success or failure), or C<undef> if STRING is not a string.
	578
a63c962f	579	=item _utf8_off(STRING)
4411f3b6	580
	581	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	582	Returns the previous state of the UTF-8 flag (so please don't treat the
	583	return value as indicating success or failure), or C<undef> if STRING is
	584	not a string.
	585
	586	=back
	587
	588	=head1 SEE ALSO
	589
5d030b67	590	L<Encode::Encoding>,
	591	L<Encode::Supported>,
	592	L<PerlIO>,
	593	L<encoding>,
	594	L<perlebcdic>,
	595	L<perlfunc/open>,
	596	L<perlunicode>,
	597	L<utf8>,
	598	the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6	599
4411f3b6	600	=cut