[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;
use strict;
# The version is derived from the RCS "Revision" keyword: every run of digits
# is extracted, the first is printed as-is and each following one is
# zero-padded to two digits (so "Revision: 0.96" yields "0.96").
our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# When true, encodings() warns before each module file it tries to require.
our $DEBUG = 0;

require DynaLoader;
require Exporter;

# Inherit from Exporter and DynaLoader; the bootstrap call further down is a
# method call on this package.
our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw (
  encode
  decode
  encode_utf8
  decode_utf8
  find_encoding
  encodings
);

# Exported on request only.  Several of these names (is_utf8, is_8bit,
# is_16bit, utf8_upgrade, utf8_downgrade, _utf8_on, _utf8_off) have no Perl
# definition in this file -- presumably they come from the compiled part of
# the module loaded by bootstrap below; TODO confirm.
our @EXPORT_OK =
    qw(
       define_encoding
       from_to
       is_utf8
       is_8bit
       is_16bit
       utf8_upgrade
       utf8_downgrade
       _utf8_on
       _utf8_off
      );

# Load the compiled (XS) half of the module.
bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
# Keyed by encoding name; define_encoding() stores the encoding object here
# and getEncoding()/encodings() read it.
our %Encoding;

# Maps a lower-case encoding name to the module file that defines it.
# getEncoding() requires the file on demand when such a name cannot be
# resolved otherwise, and encodings(":all") requires every file listed here.
# Values are file paths (as passed to require), not package names.
our %ExtModule =
    (
     viscii             => 'Encode/Byte.pm',
     'koi8-r'           => 'Encode/Byte.pm',
     cp1047             => 'Encode/EBCDIC.pm',
     cp37               => 'Encode/EBCDIC.pm',
     'posix-bc'         => 'Encode/EBCDIC.pm',
     symbol             => 'Encode/Symbol.pm',
     dingbats           => 'Encode/Symbol.pm',
     'euc-cn'           => 'Encode/CN.pm',
     gb2312		=> 'Encode/CN.pm',
     gb12345		=> 'Encode/CN.pm',
     gbk		=> 'Encode/CN.pm',
     cp936		=> 'Encode/CN.pm',
     'iso-ir-165'	=> 'Encode/CN.pm',
     'euc-jp'	        => 'Encode/JP.pm',
     'iso-2022-jp'	=> 'Encode/JP.pm',
     '7bit-jis'         => 'Encode/JP.pm',
     shiftjis	        => 'Encode/JP.pm',
     macjapan	        => 'Encode/JP.pm',
     cp932		=> 'Encode/JP.pm',
     'euc-kr'       	=> 'Encode/KR.pm',
     ksc5601		=> 'Encode/KR.pm',
     cp949		=> 'Encode/KR.pm',
     big5		=> 'Encode/TW.pm',
     'big5-hkscs'	=> 'Encode/TW.pm',
     cp950		=> 'Encode/TW.pm',
     gb18030		=> 'Encode/HanExtra.pm',
     big5plus     	=> 'Encode/HanExtra.pm',
     'euc-tw'   	=> 'Encode/HanExtra.pm',
    );

# iso-8859-2 .. iso-8859-16 live in Encode/Byte.pm.  12 is skipped, and
# iso-8859-1 is not listed here (presumably handled elsewhere -- confirm).
for my $k (2..11,13..16){
    $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
}

# cp1250 .. cp1258 also live in Encode/Byte.pm.
for my $k (1250..1258){
    $ExtModule{"cp$k"} = 'Encode/Byte.pm';
}

# The "mac<name>" single-byte encodings; each key is "mac" plus the word
# from the list below (e.g. "macgreek").
for my $k (qw(centeuro croatian cyrillic dingbats greek
	      iceland roman rumanian sami 
	      thai turkish  ukraine))
{
    $ExtModule{"mac$k"} = 'Encode/Byte.pm';
}


# Return the names of the currently registered encodings: every key of
# %Encoding except 'Internal', sorted case-insensitively.
#
# Intended to be called as a class method (Encode->encodings(...)); the
# first argument is always consumed as the invocant.  The remaining
# arguments select module files to load before the list is built:
#   (none)          load nothing, list what is registered already
#   ":all"          require every module file named in %ExtModule
#   "Encode/X.pm"   require exactly the given file(s), in the path form
#                   accepted by require
# Loading is best effort: a file that cannot be required is silently
# skipped and simply contributes no names.
sub encodings
{
    my $class = shift;

    # A call without arguments must not compare undef against ":all".
    my $want_all  = (@_ && defined $_[0] && $_[0] eq ":all");
    my @requested = $want_all ? values %ExtModule : @_;

    # Many encoding names map to the same file; try each file only once.
    my %seen;
    for my $m (grep { defined $_ && !$seen{$_}++ } @requested)
    {
        $DEBUG and warn "about to require $m;";
        eval { require $m; };
    }

    # Sort on the lower-cased name but return the names as registered.
    return
        map({$_->[0]}
            sort({$a->[1] cmp $b->[1]}
                 map({[$_, lc $_]}
                     grep({ $_ ne 'Internal' }  keys %Encoding))));
}

# Register $obj as the encoding called $name and return $obj.
#
# The object is stored in %Encoding under $name exactly as given.  If the
# lower-cased name differs from $name it is additionally registered as an
# alias, and so is every further argument.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ($lower ne $name)
    {
        define_alias($lower => $obj);
    }

    for my $alias (@aliases)
    {
        define_alias($alias, $obj);
    }

    return $obj;
}

# Resolve an encoding name to its encoding object.
#
# Arguments (called as a class method):
#   $name           an encoding name, or an object that can('new_sequence'),
#                   which is returned unchanged
#   $skip_external  when true, never load a module listed in %ExtModule
#
# Lookup order: %Encoding under the exact name, %Encoding under the
# lower-cased name, the class's find_alias() for the exact and then the
# lower-cased name, and finally an on-demand require of the file that
# %ExtModule lists for the lower-cased name.
#
# Returns the encoding object, or nothing (empty list / undef) when the name
# is undefined or cannot be resolved.  A failing require is ignored.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    # Nothing sensible can be looked up for an undefined name.
    return unless defined $name;

    if (ref($name) && $name->can('new_sequence'))
    {
        return $name;
    }

    my $lc = lc $name;
    return $Encoding{$name} if exists $Encoding{$name};
    return $Encoding{$lc}   if exists $Encoding{$lc};

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    if (!$skip_external and exists $ExtModule{$lc})
    {
        eval{ require $ExtModule{$lc}; };
        return $Encoding{$name} if exists $Encoding{$name};
        # The module was selected by the lower-cased name, so look under
        # that key too; otherwise a mixed-case name fails on the first call
        # and only succeeds on the next one.
        return $Encoding{$lc} if exists $Encoding{$lc};
    }

    return;
}

# Function-style front end to getEncoding(): resolves $name through this
# package and hands back whatever getEncoding() returns.  A true
# $skip_external is passed straight through.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}

# encode(ENCODING, $string [, CHECK]) - hand $string to the encode method of
# the named encoding and return its result.
#
# Croaks when ENCODING cannot be resolved.  When CHECK is true and $string
# is still non-empty after the call, undef is returned instead (presumably
# the encoding object removes what it consumed from $string -- confirm).
sub encode
{
    my ($name, $string, $check) = @_;

    my $codec = find_encoding($name);
    defined $codec or croak("Unknown encoding '$name'");

    my $octets = $codec->encode($string, $check);
    return ($check && length($string)) ? undef : $octets;
}

# decode(ENCODING, $octets [, CHECK]) - hand $octets to the decode method of
# the named encoding and return the resulting string.
#
# Croaks when ENCODING cannot be resolved.  When CHECK is true, the caller's
# second argument is overwritten with the local copy of $octets as the
# encoding object left it (presumably the undecoded remainder -- confirm).
sub decode
{
    my ($name, $octets, $check) = @_;

    my $codec = find_encoding($name);
    defined $codec or croak("Unknown encoding '$name'");

    my $string = $codec->decode($octets, $check);
    if ($check)
    {
        $_[1] = $octets;
    }
    return $string;
}

# from_to($string, FROM, TO [, CHECK]) - convert the caller's $string in
# place by decoding it with FROM and encoding the result with TO.
#
# Croaks when either encoding name cannot be resolved (FROM is looked up and
# checked first).  On success the caller's first argument is overwritten and
# the length of the new value is returned.  When CHECK is true and either
# step leaves its input non-empty, undef is returned and the caller's
# variable is left untouched.
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $src = find_encoding($from);
    defined $src or croak("Unknown encoding '$from'");
    my $dst = find_encoding($to);
    defined $dst or croak("Unknown encoding '$to'");

    my $uni = $src->decode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }

    $string = $dst->encode($uni, $check);
    if ($check && length($uni))
    {
        return undef;
    }

    $_[0] = $string;
    return length($_[0]);
}

# Return a copy of the argument run through utf8::encode.  The caller's
# variable is not modified.
sub encode_utf8
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}

# Return a copy of the argument run through utf8::decode, or undef when
# utf8::decode reports failure.  Only the first argument is looked at, and
# the caller's variable is not modified.
sub decode_utf8
{
    my $chars = $_[0];
    if (utf8::decode($chars))
    {
        return $chars;
    }
    return undef;
}

# Unconditionally load the encoding classes listed below.  These are
# run-time requires, so they execute only after every sub above has been
# compiled; presumably each module registers itself via define_encoding()
# -- TODO confirm against the modules themselves.
require Encode::Encoding;
require Encode::XS;
require Encode::Internal;
require Encode::Unicode;
require Encode::utf8;
require Encode::iso10646_1;
require Encode::ucs2_le;

# A module file must end by returning a true value.
1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of B<characters>.

To find more about character encodings, please consult
L<Encode::Details>.  This document focuses on programming references.

=head1 PERL ENCODING API

=head2 Generic Encoding Interface

=over 4

=item *

        $bytes  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  For CHECK see L</"Handling Malformed Data">.

For example to convert (internally UTF-8 encoded) Unicode data
to octets:

	$octets = encode("utf8", $unicode);

=item *

        $string = decode(ENCODING, $bytes[, CHECK])

Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	$utf8 = decode("latin1", $latin1);

=item *

	from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])

Convert B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.  For CHECK
see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

=back

=head2 Handling Malformed Data

If CHECK is not set, C<undef> is returned.  If the data is supposed to
be UTF-8, an optional lexical warning (category utf8) is given.  If
CHECK is true but not a code reference, dies.

It would be desirable to have a way to indicate that the transform should
use the encoding's "replacement character" - no such mechanism is defined yet.

It is also planned to allow I<CHECK> to be a code reference.

This is not yet implemented as there are design issues with what its
arguments should be and how it returns its results.

=over 4

=item Scheme 1

Passed remaining fragment of string being processed.
Modifies it in place to remove bytes/characters it can understand
and returns a string used to represent them.
e.g.

 sub fixup {
   my $ch = substr($_[0],0,1,'');
   return sprintf("\\x{%02X}",ord($ch));
 }

This scheme is close to how underlying C code for Encode works, but gives
the fixup routine very little context.

=item Scheme 2

Passed original string, and an index into it of the problem area, and
output string so far.  Appends what it will to output string and
returns new index into original string.  For example:

 sub fixup {
   # my ($s,$i,$d) = @_;
   my $ch = substr($_[0],$_[1],1);
   $_[2] .= sprintf("\\x{%02X}",ord($ch));
   return $_[1]+1;
 }

This scheme gives maximal control to the fixup routine but is more
complicated to code, and may need internals of Encode to be tweaked to
keep original string intact.

=item Other Schemes

Hybrids of above.

Multiple return values rather than in-place modifications.

Index into the string could be pos($str) allowing s/\G...//.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item *

        $bytes = encode_utf8($string);

The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item *

        $string = decode_utf8($bytes [,CHECK]);

The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.  Note that the current
implementation ignores CHECK: it simply returns C<undef> whenever
C<utf8::decode> reports failure.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of a specific module.

  @with_jp = Encode->encodings("Encode/JP.pm");

Note that in this case you have to say "Encode/JP.pm" instead of Encode::JP.

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.

=head2 Defining Aliases

  use Encode;
  use Encode::Alias;
  define_alias( newName => ENCODING);

Allows newName to be used as an alias for ENCODING. ENCODING may be
either the name of an encoding or an encoding object (as above).

See L<Encode::Alias> for details.

=head1 Defining Encodings

    use Encode qw(define_encoding);
    define_encoding( $object, 'canonicalName' [,alias...]);

Causes I<canonicalName> to be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open, its layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)",     $file) or die $!;
    open(my $g, ">:encoding(iso-8859-2)", $file) or die $!;
    open(my $h, ">:encoding(latin9)",     $file) or die $!;  # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item * is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item *

        _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item *

        _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<Encode::Details>, 
L<Encode::Encoding>,
L<Encode::Supported>,
L<PerlIO>, 
L<encoding>,
L<perlebcdic>, 
L<perlfunc/open>, 
L<perlunicode>, 
L<utf8>, 
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
5129552c	3	our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c	4	our $DEBUG = 0;
2c674647	5
	6	require DynaLoader;
	7	require Exporter;
	8
51ef4e11	9	our @ISA = qw(Exporter DynaLoader);
2c674647	10
4411f3b6	11	# Public, encouraged API is exported by default
51ef4e11	12	our @EXPORT = qw (
4411f3b6	13	encode
	14	decode
	15	encode_utf8
	16	decode_utf8
	17	find_encoding
51ef4e11	18	encodings
4411f3b6	19	);
4411f3b6	20
51ef4e11	21	our @EXPORT_OK =
2c674647	22	qw(
51ef4e11	23	define_encoding
2c674647	24	from_to
2c674647	25	is_utf8
4411f3b6	26	is_8bit
4411f3b6	27	is_16bit
a12c0f56	28	utf8_upgrade
a12c0f56	29	utf8_downgrade
4411f3b6	30	_utf8_on
4411f3b6	31	_utf8_off
2c674647	32	);
	33
	34	bootstrap Encode ();
	35
4411f3b6	36	# Documentation moved after __END__ for speed - NI-S
2c674647	37
bf230f3d	38	use Carp;
bf230f3d	39
5d030b67	40	use Encode::Alias;
5d030b67	41
5129552c	42	# Make a %Encoding package variable to allow a certain amount of cheating
5129552c	43	our %Encoding;
5345d506	44
5129552c	45	our %ExtModule =
2b217bf7	46	(
5129552c	47	viscii => 'Encode/Byte.pm',
	48	'koi8-r' => 'Encode/Byte.pm',
	49	cp1047 => 'Encode/EBCDIC.pm',
	50	cp37 => 'Encode/EBCDIC.pm',
	51	'posix-bc' => 'Encode/EBCDIC.pm',
	52	symbol => 'Encode/Symbol.pm',
	53	dingbats => 'Encode/Symbol.pm',
	54	'euc-cn' => 'Encode/CN.pm',
	55	gb2312 => 'Encode/CN.pm',
	56	gb12345 => 'Encode/CN.pm',
	57	gbk => 'Encode/CN.pm',
	58	cp936 => 'Encode/CN.pm',
	59	'iso-ir-165' => 'Encode/CN.pm',
	60	'euc-jp' => 'Encode/JP.pm',
	61	'iso-2022-jp' => 'Encode/JP.pm',
	62	'7bit-jis' => 'Encode/JP.pm',
	63	shiftjis => 'Encode/JP.pm',
	64	macjapan => 'Encode/JP.pm',
	65	cp932 => 'Encode/JP.pm',
	66	'euc-kr' => 'Encode/KR.pm',
	67	ksc5601 => 'Encode/KR.pm',
	68	cp949 => 'Encode/KR.pm',
	69	big5 => 'Encode/TW.pm',
	70	'big5-hkscs' => 'Encode/TW.pm',
	71	cp950 => 'Encode/TW.pm',
	72	gb18030 => 'Encode/HanExtra.pm',
	73	big5plus => 'Encode/HanExtra.pm',
	74	'euc-tw' => 'Encode/HanExtra.pm',
2b217bf7	75	);
d1ed7747	76
5129552c	77	for my $k (2..11,13..16){
	78	$ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
	79	}
	80
	81	for my $k (1250..1258){
	82	$ExtModule{"cp$k"} = 'Encode/Byte.pm';
	83	}
	84
	85	for my $k (qw(centeuro croatian cyrillic dingbats greek
	86	iceland roman rumanian sami
	87	thai turkish ukraine))
	88	{
	89	$ExtModule{"mac$k"} = 'Encode/Byte.pm';
	90	}
	91
	92
656753f8	93	sub encodings
656753f8	94	{
5129552c	95	my $class = shift;
	96	my @modules = ($_[0] eq ":all") ? values %ExtModule : @_;
	97	for my $m (@modules)
	98	{
	99	$DEBUG and warn "about to require $m;";
	100	eval { require $m; };
	101	}
	102	return
	103	map({$_->[0]}
	104	sort({$a->[1] cmp $b->[1]}
	105	map({[$_, lc $_]}
	106	grep({ $_ ne 'Internal' } keys %Encoding))));
51ef4e11	107	}
51ef4e11	108
51ef4e11	109	sub define_encoding
51ef4e11	110	{
18586f54	111	my $obj = shift;
18586f54	112	my $name = shift;
5129552c	113	$Encoding{$name} = $obj;
18586f54	114	my $lc = lc($name);
	115	define_alias($lc => $obj) unless $lc eq $name;
	116	while (@_)
	117	{
	118	my $alias = shift;
	119	define_alias($alias,$obj);
	120	}
	121	return $obj;
656753f8	122	}
656753f8	123
656753f8	124	sub getEncoding
656753f8	125	{
dd9703c9	126	my ($class,$name,$skip_external) = @_;
18586f54	127	my $enc;
	128	if (ref($name) && $name->can('new_sequence'))
	129	{
	130	return $name;
	131	}
	132	my $lc = lc $name;
5129552c	133	if (exists $Encoding{$name})
18586f54	134	{
5129552c	135	return $Encoding{$name};
18586f54	136	}
5129552c	137	if (exists $Encoding{$lc})
18586f54	138	{
5129552c	139	return $Encoding{$lc};
18586f54	140	}
c50d192e	141
5129552c	142	my $oc = $class->find_alias($name);
c50d192e	143	return $oc if defined $oc;
c50d192e	144
5129552c	145	$oc = $class->find_alias($lc) if $lc ne $name;
c50d192e	146	return $oc if defined $oc;
c50d192e	147
5129552c	148	if (!$skip_external and exists $ExtModule{$lc})
d1ed7747	149	{
5129552c	150	eval{ require $ExtModule{$lc}; };
5129552c	151	return $Encoding{$name} if exists $Encoding{$name};
d1ed7747	152	}
18586f54	153
18586f54	154	return;
656753f8	155	}
656753f8	156
4411f3b6	157	sub find_encoding
4411f3b6	158	{
dd9703c9	159	my ($name,$skip_external) = @_;
dd9703c9	160	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	161	}
	162
	163	sub encode
	164	{
18586f54	165	my ($name,$string,$check) = @_;
	166	my $enc = find_encoding($name);
	167	croak("Unknown encoding '$name'") unless defined $enc;
	168	my $octets = $enc->encode($string,$check);
	169	return undef if ($check && length($string));
	170	return $octets;
4411f3b6	171	}
	172
	173	sub decode
	174	{
18586f54	175	my ($name,$octets,$check) = @_;
	176	my $enc = find_encoding($name);
	177	croak("Unknown encoding '$name'") unless defined $enc;
	178	my $string = $enc->decode($octets,$check);
	179	$_[1] = $octets if $check;
	180	return $string;
4411f3b6	181	}
	182
	183	sub from_to
	184	{
18586f54	185	my ($string,$from,$to,$check) = @_;
	186	my $f = find_encoding($from);
	187	croak("Unknown encoding '$from'") unless defined $f;
	188	my $t = find_encoding($to);
	189	croak("Unknown encoding '$to'") unless defined $t;
	190	my $uni = $f->decode($string,$check);
	191	return undef if ($check && length($string));
	192	$string = $t->encode($uni,$check);
	193	return undef if ($check && length($uni));
	194	return length($_[0] = $string);
4411f3b6	195	}
	196
	197	sub encode_utf8
	198	{
18586f54	199	my ($str) = @_;
	200	utf8::encode($str);
	201	return $str;
4411f3b6	202	}
	203
	204	sub decode_utf8
	205	{
18586f54	206	my ($str) = @_;
	207	return undef unless utf8::decode($str);
	208	return $str;
5ad8ef52	209	}
5ad8ef52	210
18586f54	211	require Encode::Encoding;
	212	require Encode::XS;
	213	require Encode::Internal;
	214	require Encode::Unicode;
	215	require Encode::utf8;
	216	require Encode::iso10646_1;
	217	require Encode::ucs2_le;
4411f3b6	218
656753f8	219	1;
656753f8	220
2a936312	221	__END__
2a936312	222
4411f3b6	223	=head1 NAME
	224
	225	Encode - character encodings
	226
	227	=head1 SYNOPSIS
	228
	229	use Encode;
	230
	231	=head1 DESCRIPTION
	232
47bfe92f	233	The C<Encode> module provides the interfaces between Perl's strings
47bfe92f	234	and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6	235
1b2c56c8	236	To find more about character encodings, please consult
5d030b67	237	L<Encode::Details> . This document focuses on programming references.
21938dfa	238
4411f3b6	239	=head1 PERL ENCODING API
	240
	241	=head2 Generic Encoding Interface
	242
	243	=over 4
	244
	245	=item *
	246
	247	$bytes = encode(ENCODING, $string[, CHECK])
	248
47bfe92f	249	Encodes string from Perl's internal form into I<ENCODING> and returns
47bfe92f	250	a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6	251
681a7c68	252	For example to convert (internally UTF-8 encoded) Unicode data
	253	to octets:
	254
	255	$octets = encode("utf8", $unicode);
	256
4411f3b6	257	=item *
	258
	259	$string = decode(ENCODING, $bytes[, CHECK])
	260
47bfe92f	261	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
	262	internal form and returns the resulting string. For CHECK see
	263	L</"Handling Malformed Data">.
	264
1b2c56c8	265	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	266
	267	$utf8 = decode("latin1", $latin1);
	268
47bfe92f	269	=item *
	270
	271	from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
	272
2b106fbe	273	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	274	in $string originally get to be in FROM_ENCODING? Either using
e9692b5b	275	encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe	276	see L</"Handling Malformed Data">.
2b106fbe	277
1b2c56c8	278	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	279
	280	from_to($data, "iso-8859-1", "utf-8");
	281
	282	and to convert it back:
	283
	284	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	285
ab97ca19	286	Note that because the conversion happens in place, the data to be
	287	converted cannot be a string constant, it must be a scalar variable.
	288
4411f3b6	289	=back
	290
	291	=head2 Handling Malformed Data
	292
	293	If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f	294	be UTF-8, an optional lexical warning (category utf8) is given. If
47bfe92f	295	CHECK is true but not a code reference, dies.
4411f3b6	296
47bfe92f	297	It would desirable to have a way to indicate that transform should use
47bfe92f	298	the encodings "replacement character" - no such mechanism is defined yet.
4411f3b6	299
	300	It is also planned to allow I<CHECK> to be a code reference.
	301
47bfe92f	302	This is not yet implemented as there are design issues with what its
47bfe92f	303	arguments should be and how it returns its results.
4411f3b6	304
	305	=over 4
	306
	307	=item Scheme 1
	308
	309	Passed remaining fragment of string being processed.
	310	Modifies it in place to remove bytes/characters it can understand
	311	and returns a string used to represent them.
	312	e.g.
	313
	314	sub fixup {
	315	my $ch = substr($_[0],0,1,'');
	316	return sprintf("\x{%02X}",ord($ch);
	317	}
	318
	319	This scheme is close to how underlying C code for Encode works, but gives
	320	the fixup routine very little context.
	321
	322	=item Scheme 2
	323
47bfe92f	324	Passed original string, and an index into it of the problem area, and
	325	output string so far. Appends what it will to output string and
	326	returns new index into original string. For example:
4411f3b6	327
	328	sub fixup {
	329	# my ($s,$i,$d) = @_;
	330	my $ch = substr($_[0],$_[1],1);
	331	$_[2] .= sprintf("\x{%02X}",ord($ch);
	332	return $_[1]+1;
	333	}
	334
47bfe92f	335	This scheme gives maximal control to the fixup routine but is more
	336	complicated to code, and may need internals of Encode to be tweaked to
	337	keep original string intact.
4411f3b6	338
	339	=item Other Schemes
	340
	341	Hybrids of above.
	342
	343	Multiple return values rather than in-place modifications.
	344
	345	Index into the string could be pos($str) allowing s/\G...//.
	346
	347	=back
	348
	349	=head2 UTF-8 / utf8
	350
	351	The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f	352	the entire Unicode repertiore as sequences of octets. This encoding is
	353	expected to become very widespread. Perl can use this form internaly
	354	to represent strings, so conversions to and from this form are
	355	particularly efficient (as octets in memory do not have to change,
	356	just the meta-data that tells Perl how to treat them).
4411f3b6	357
	358	=over 4
	359
	360	=item *
	361
	362	$bytes = encode_utf8($string);
	363
47bfe92f	364	The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6	365	and the resulting octets returned as a sequence of bytes. All possible
	366	characters have a UTF-8 representation so this function cannot fail.
	367
	368	=item *
	369
	370	$string = decode_utf8($bytes [,CHECK]);
	371
47bfe92f	372	The sequence of octets represented by $bytes is decoded from UTF-8
	373	into a sequence of logical characters. Not all sequences of octets
	374	form valid UTF-8 encodings, so it is possible for this call to fail.
	375	For CHECK see L</"Handling Malformed Data">.
4411f3b6	376
	377	=back
	378
51ef4e11	379	=head2 Listing available encodings
51ef4e11	380
5129552c	381	use Encode;
	382	@list = Encode->encodings();
	383
	384	Returns a list of the canonical names of the available encodings that
	385	are loaded. To get a list of all available encodings including the
	386	ones that are not loaded yet, say
	387
	388	@all_encodings = Encode->encodings(":all");
	389
	390	Or you can give the name of specific module.
	391
	392	@with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11	393
5129552c	394	Note in this case you have to say "Encode/JP.pm instead of Encode::JP.
5d030b67	395
	396	To find which encodings are suppoted by this package in details,
	397	see L<Encode::Supported>.
51ef4e11	398
	399	=head2 Defining Aliases
	400
5129552c	401	use Encode;
5129552c	402	use Encode::Alias;
51ef4e11	403	define_alias( newName => ENCODING);
51ef4e11	404
47bfe92f	405	Allows newName to be used as am alias for ENCODING. ENCODING may be
47bfe92f	406	either the name of an encoding or and encoding object (as above).
51ef4e11	407
5d030b67	408	See L<Encode::Alias> on details.
51ef4e11	409
1b2c56c8	410	=head1 Defining Encodings
51ef4e11	411
e9692b5b	412	use Encode qw(define_alias);
e9692b5b	413	define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11	414
47bfe92f	415	Causes I<canonicalName> to be associated with I<$object>. The object
1b2c56c8	416	should provide the interface described in L<Encode::Encoding>
47bfe92f	417	below. If more than two arguments are provided then additional
47bfe92f	418	arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11	419
4411f3b6	420	=head1 Encoding and IO
	421
	422	It is very common to want to do encoding transformations when
	423	reading or writing files, network connections, pipes etc.
47bfe92f	424	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	425	C<Encode> provides a "layer" (See L<perliol>) which can transform
	426	data as it is read or written.
	427
8e86646e	428	Here is how the blind poet would modernise the encoding:
8e86646e	429
42234700	430	use Encode;
8e86646e	431	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	432	open(my $utf8,'>:utf8','iliad.utf8');
	433	my @epic = <$iliad>;
	434	print $utf8 @epic;
	435	close($utf8);
	436	close($illiad);
4411f3b6	437
	438	In addition the new IO system can also be configured to read/write
	439	UTF-8 encoded characters (as noted above this is efficient):
	440
e9692b5b	441	open(my $fh,'>:utf8','anything');
e9692b5b	442	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	443
	444	Either of the above forms of "layer" specifications can be made the default
	445	for a lexical scope with the C<use open ...> pragma. See L<open>.
	446
	447	Once a handle is open is layers can be altered using C<binmode>.
	448
47bfe92f	449	Without any such configuration, or if Perl itself is built using
4411f3b6	450	system's own IO, then write operations assume that file handle accepts
	451	only I<bytes> and will C<die> if a character larger than 255 is
	452	written to the handle. When reading, each octet from the handle
	453	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	454	behaviour as bytes-only languages (including Perl before v5.6) would
	455	have, and is sufficient to handle native 8-bit encodings
	456	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	457	other encodings and binary data.
	458
	459	In other cases it is the programs responsibility to transform
	460	characters into bytes using the API above before doing writes, and to
	461	transform the bytes read from a handle into characters before doing
	462	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	463
47bfe92f	464	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	465	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	466	(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
47bfe92f	467
e9692b5b	468	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	469	open(G, ">:utf8", "data.utf") or die $!;
	470	while (<F>) { print G }
	471
	472	# Could also do "print G <F>" but that would pull
	473	# the whole file into memory just to write it out again.
	474
	475	More examples:
47bfe92f	476
e9692b5b	477	open(my $f, "<:encoding(cp1252)")
	478	open(my $g, ">:encoding(iso-8859-2)")
	479	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	480
47bfe92f	481	See L<PerlIO> for more information.
4411f3b6	482
1768d7eb	483	See also L<encoding> for how to change the default encoding of the
d521382b	484	data in your script.
1768d7eb	485
4411f3b6	486	=head1 Messing with Perl's Internals
4411f3b6	487
47bfe92f	488	The following API uses parts of Perl's internals in the current
47bfe92f	489	implementation. As such they are efficient, but may change.
4411f3b6	490
	491	=over 4
	492
4411f3b6	493	=item * is_utf8(STRING [, CHECK])
	494
	495	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	496	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	497	UTF-8. Returns true if successful, false otherwise.
4411f3b6	498
4411f3b6	499	=item *
	500
	501	_utf8_on(STRING)
	502
	503	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	504	B<not> checked for being well-formed UTF-8. Do not use unless you
	505	B<know> that the STRING is well-formed UTF-8. Returns the previous
	506	state of the UTF-8 flag (so please don't test the return value as
	507	I<not> success or failure), or C<undef> if STRING is not a string.
	508
	509	=item *
	510
	511	_utf8_off(STRING)
	512
	513	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	514	Returns the previous state of the UTF-8 flag (so please don't test the
	515	return value as I<not> success or failure), or C<undef> if STRING is
	516	not a string.
	517
	518	=back
	519
	520	=head1 SEE ALSO
	521
5d030b67	522	L<Encode::Details>,
	523	L<Encode::Encoding>,
	524	L<Encode::Supported>,
	525	L<PerlIO>,
	526	L<encoding>,
	527	L<perlebcdic>,
	528	L<perlfunc/open>,
	529	L<perlunicode>,
	530	L<utf8>,
	531	the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6	532
4411f3b6	533	=cut