[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;
use strict;
our $VERSION = do { my @r = (q$Revision: 0.95 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw (
  encode
  decode
  encode_utf8
  decode_utf8
  find_encoding
  encodings
);

our @EXPORT_OK =
    qw(
       define_encoding
       define_alias
       from_to
       is_utf8
       is_8bit
       is_16bit
       utf8_upgrade
       utf8_downgrade
       _utf8_on
       _utf8_off
      );

bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

use Encode::Alias;

# Make a %encoding package variable to allow a certain amount of cheating
our %encoding;

# Encodings that are not loaded up front.  Keys are lower-case encoding
# names; each value is the module file that getEncoding() will
# C<require> the first time that name is requested (unless the caller
# asked to skip external tables).
our %external_tables =
    (
	'euc-cn'	=> 'Encode/CN.pm',
	gb2312		=> 'Encode/CN.pm',
	gb12345		=> 'Encode/CN.pm',
	gbk		=> 'Encode/CN.pm',
	cp936		=> 'Encode/CN.pm',
	'iso-ir-165'	=> 'Encode/CN.pm',
	'euc-jp'	=> 'Encode/JP.pm',
	'iso-2022-jp'	=> 'Encode/JP.pm',
	'7bit-jis'	=> 'Encode/JP.pm',
	shiftjis	=> 'Encode/JP.pm',
	macjapan	=> 'Encode/JP.pm',
	cp932		=> 'Encode/JP.pm',
	'euc-kr'	=> 'Encode/KR.pm',
	ksc5601		=> 'Encode/KR.pm',
	cp949		=> 'Encode/KR.pm',
	big5		=> 'Encode/TW.pm',
	'big5-hkscs'	=> 'Encode/TW.pm',
	cp950		=> 'Encode/TW.pm',
	gb18030		=> 'Encode/HanExtra.pm',
	big5plus	=> 'Encode/HanExtra.pm',
	'euc-tw'	=> 'Encode/HanExtra.pm',
    );

# Return the names currently registered in %encoding, ordered by their
# lower-cased spelling.  The 'Internal' entry is never listed.  Any
# arguments (e.g. a class name when called as a method) are ignored.
sub encodings
{
    # Drop the entry callers are not meant to see.
    my @visible = grep { $_ ne 'Internal' } keys %encoding;

    # Pair every name with its lower-cased form so lc() runs once per
    # name rather than once per comparison.
    my @tagged  = map { [$_, lc $_] } @visible;
    my @ordered = sort { $a->[1] cmp $b->[1] } @tagged;

    return map { $_->[0] } @ordered;
}

# Register an encoding object.
#   $obj     - the encoding object to register
#   $name    - canonical name; becomes the key in %encoding
#   @aliases - any further arguments are passed to define_alias() as
#              additional names for $obj
# A lower-cased spelling of $name is aliased automatically when it
# differs from $name.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $encoding{$name} = $obj;

    my $folded = lc($name);
    if ($folded ne $name)
    {
	define_alias($folded => $obj);
    }

    foreach my $alias (@aliases)
    {
	define_alias($alias, $obj);
    }

    return $obj;
}

# Resolve NAME to an encoding object.
#   $class         - invocant; used to call findAlias() and to re-enter
#                    this method after an external table is loaded
#   $name          - encoding name, or an object that can('new_sequence'),
#                    which is returned unchanged
#   $skip_external - when true, never load a module from %external_tables
# Lookup order: %encoding by exact name, %encoding by lower-cased name,
# alias by exact name, alias by lower-cased name, and finally an
# on-demand load from %external_tables followed by a fresh lookup.
# Returns the encoding object, or nothing (empty list / undef) when the
# name cannot be resolved.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;
    if (ref($name) && $name->can('new_sequence'))
    {
	return $name;
    }
    my $lc = lc $name;
    if (exists $encoding{$name})
    {
	return $encoding{$name};
    }
    if (exists $encoding{$lc})
    {
	return $encoding{$lc};
    }

    my $oc = $class->findAlias($name);
    return $oc if defined $oc;

    $oc = $class->findAlias($lc) if $lc ne $name;
    return $oc if defined $oc;

    if (!$skip_external and exists $external_tables{$lc})
    {
	require $external_tables{$lc};
	# The module was selected by the lower-cased name, so what it
	# registered need not match the caller's spelling.  Repeat the
	# whole lookup (exact, lower-case, aliases) with external loading
	# disabled, which also guarantees the recursion stops here.
	return $class->getEncoding($name,1);
    }

    return;
}

# Functional front end to getEncoding(): looks NAME up using this
# package as the class.  A true second argument is forwarded as the
# "skip external tables" flag.  Returns whatever getEncoding() returns.
sub find_encoding
{
    my $name          = $_[0];
    my $skip_external = $_[1];
    return __PACKAGE__->getEncoding($name, $skip_external);
}

# encode(ENCODING, STRING [, CHECK])
# Hands a copy of STRING to the encode() method of the encoding object
# found for ENCODING and returns the result.  Croaks when ENCODING is
# unknown.  With a true CHECK, undef is returned if the local copy of
# STRING is non-empty after the call (presumably the encoder consumes
# what it could convert -- that depends on the encoding object).
sub encode
{
    my ($name, $string, $check) = @_;

    my $encoder = find_encoding($name);
    defined $encoder or croak("Unknown encoding '$name'");

    my $bytes = $encoder->encode($string, $check);
    if ($check && length($string))
    {
	return undef;
    }
    return $bytes;
}

# decode(ENCODING, OCTETS [, CHECK])
# Hands a copy of OCTETS to the decode() method of the encoding object
# found for ENCODING and returns the result.  Croaks when ENCODING is
# unknown.  With a true CHECK the caller's OCTETS argument is
# overwritten with the local copy as it stands after the call
# (presumably whatever the decoder left unconsumed -- that depends on
# the encoding object).
sub decode
{
    my ($name, $octets, $check) = @_;

    my $decoder = find_encoding($name);
    defined $decoder or croak("Unknown encoding '$name'");

    my $chars = $decoder->decode($octets, $check);
    if ($check)
    {
	# $_[1] aliases the caller's variable.
	$_[1] = $octets;
    }
    return $chars;
}

# from_to(STRING, FROM_ENCODING, TO_ENCODING [, CHECK])
# Converts STRING in place: decodes it with FROM_ENCODING, encodes the
# result with TO_ENCODING and stores that back into the caller's
# variable.  Croaks when either encoding is unknown.  With a true
# CHECK, returns undef (leaving the caller's variable untouched) if
# either step leaves its input non-empty.  On success returns the
# length of the converted data.
sub from_to
{
    my ($data, $from, $to, $check) = @_;

    # Resolve both encodings before touching the data.
    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $chars = $source->decode($data, $check);
    if ($check && length($data))
    {
	return undef;
    }

    $data = $target->encode($chars, $check);
    if ($check && length($chars))
    {
	return undef;
    }

    # $_[0] aliases the caller's variable: this is the in-place update.
    return length($_[0] = $data);
}

# encode_utf8(STRING)
# Returns the UTF-8 octets for STRING.  The work is done on a private
# copy, so the caller's variable is not modified.
sub encode_utf8
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}

# decode_utf8(OCTETS)
# Decodes a private copy of OCTETS with utf8::decode() and returns the
# resulting string, or undef when utf8::decode() reports failure.  The
# caller's variable is not modified.
sub decode_utf8
{
    my $chars = $_[0];
    if (utf8::decode($chars))
    {
	return $chars;
    }
    return undef;
}

require Encode::Encoding;
require Encode::XS;
require Encode::Internal;
require Encode::Unicode;
require Encode::utf8;
require Encode::iso10646_1;
require Encode::ucs2_le;

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of B<characters>.

To find more about character encodings, please consult
L<Encode::Details>.  This document focuses on programming references.

=head1 PERL ENCODING API

=head2 Generic Encoding Interface

=over 4

=item *

        $bytes  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  For CHECK see L</"Handling Malformed Data">.

For example to convert (internally UTF-8 encoded) Unicode data
to octets:

	$octets = encode("utf8", $unicode);

=item *

        $string = decode(ENCODING, $bytes[, CHECK])

Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	$utf8 = decode("latin1", $latin1);

=item *

	from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])

Convert B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.  For CHECK
see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

=back

=head2 Handling Malformed Data

If CHECK is not set, C<undef> is returned.  If the data is supposed to
be UTF-8, an optional lexical warning (category utf8) is given.  If
CHECK is true but not a code reference, dies.

It would be desirable to have a way to indicate that the transform should use
the encoding's "replacement character" - no such mechanism is defined yet.

It is also planned to allow I<CHECK> to be a code reference.

This is not yet implemented as there are design issues with what its
arguments should be and how it returns its results.

=over 4

=item Scheme 1

Passed remaining fragment of string being processed.
Modifies it in place to remove bytes/characters it can understand
and returns a string used to represent them.
e.g.

 sub fixup {
   my $ch = substr($_[0],0,1,'');
   return sprintf("\x{%02X}",ord($ch));
 }

This scheme is close to how underlying C code for Encode works, but gives
the fixup routine very little context.

=item Scheme 2

Passed original string, and an index into it of the problem area, and
output string so far.  Appends what it will to output string and
returns new index into original string.  For example:

 sub fixup {
   # my ($s,$i,$d) = @_;
   my $ch = substr($_[0],$_[1],1);
   $_[2] .= sprintf("\x{%02X}",ord($ch));
   return $_[1]+1;
 }

This scheme gives maximal control to the fixup routine but is more
complicated to code, and may need internals of Encode to be tweaked to
keep original string intact.

=item Other Schemes

Hybrids of above.

Multiple return values rather than in-place modifications.

Index into the string could be pos($str) allowing s/\G...//.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item *

        $bytes = encode_utf8($string);

The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item *

        $string = decode_utf8($bytes [,CHECK]);

The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.

=back

=head2 Listing available encodings

  use Encode qw(encodings);
  @list = encodings();

Returns a list of the canonical names of the available encodings. 

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.

=head2 Defining Aliases

  use Encode qw(define_alias);
  define_alias( newName => ENCODING);

Allows newName to be used as an alias for ENCODING. ENCODING may be
either the name of an encoding or an encoding object (as above).

See L<Encode::Alias> for details.

=head1 Defining Encodings

    use Encode qw(define_encoding);
    define_encoding( $object, 'canonicalName' [,alias...]);

Causes I<canonicalName> to be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>
below.  If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open its layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)")
    open(my $g, ">:encoding(iso-8859-2)")
    open(my $h, ">:encoding(latin9)")       # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item * is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item *

        _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item *

        _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<Encode::Details>, 
L<Encode::Encoding>,
L<Encode::Supported>,
L<PerlIO>, 
L<encoding>,
L<perlebcdic>, 
L<perlfunc/open>, 
L<perlunicode>, 
L<utf8>, 
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
5d030b67	3	our $VERSION = do { my @r = (q$Revision: 0.95 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
2c674647	4
	5	require DynaLoader;
	6	require Exporter;
	7
51ef4e11	8	our @ISA = qw(Exporter DynaLoader);
2c674647	9
4411f3b6	10	# Public, encouraged API is exported by default
51ef4e11	11	our @EXPORT = qw (
4411f3b6	12	encode
	13	decode
	14	encode_utf8
	15	decode_utf8
	16	find_encoding
51ef4e11	17	encodings
4411f3b6	18	);
4411f3b6	19
51ef4e11	20	our @EXPORT_OK =
2c674647	21	qw(
51ef4e11	22	define_encoding
51ef4e11	23	define_alias
2c674647	24	from_to
2c674647	25	is_utf8
4411f3b6	26	is_8bit
4411f3b6	27	is_16bit
a12c0f56	28	utf8_upgrade
a12c0f56	29	utf8_downgrade
4411f3b6	30	_utf8_on
4411f3b6	31	_utf8_off
2c674647	32	);
	33
	34	bootstrap Encode ();
	35
4411f3b6	36	# Documentation moved after __END__ for speed - NI-S
2c674647	37
bf230f3d	38	use Carp;
bf230f3d	39
5d030b67	40	use Encode::Alias;
5d030b67	41
51ef4e11	42	# Make a %encoding package variable to allow a certain amount of cheating
51ef4e11	43	our %encoding;
5345d506	44
70122e76	45	our %external_tables =
2b217bf7	46	(
	47	'euc-cn' => 'Encode/CN.pm',
	48	gb2312 => 'Encode/CN.pm',
	49	gb12345 => 'Encode/CN.pm',
	50	gbk => 'Encode/CN.pm',
	51	cp936 => 'Encode/CN.pm',
	52	'iso-ir-165' => 'Encode/CN.pm',
	53	'euc-jp' => 'Encode/JP.pm',
ee981de6	54	'iso-2022-jp' => 'Encode/JP.pm',
ee981de6	55	'7bit-jis' => 'Encode/JP.pm',
2b217bf7	56	shiftjis => 'Encode/JP.pm',
	57	macjapan => 'Encode/JP.pm',
	58	cp932 => 'Encode/JP.pm',
	59	'euc-kr' => 'Encode/KR.pm',
	60	ksc5601 => 'Encode/KR.pm',
	61	cp949 => 'Encode/KR.pm',
	62	big5 => 'Encode/TW.pm',
	63	'big5-hkscs' => 'Encode/TW.pm',
	64	cp950 => 'Encode/TW.pm',
	65	gb18030 => 'Encode/HanExtra.pm',
	66	big5plus => 'Encode/HanExtra.pm',
	67	'euc-tw' => 'Encode/HanExtra.pm',
	68	);
d1ed7747	69
656753f8	70	sub encodings
	71	{
	72	my ($class) = @_;
40a073c6	73	return
	74	map { $_->[0] }
	75	sort { $a->[1] cmp $b->[1] }
	76	map { [$_, lc $_] }
	77	grep { $_ ne 'Internal' }
	78	keys %encoding;
51ef4e11	79	}
51ef4e11	80
51ef4e11	81	sub define_encoding
51ef4e11	82	{
18586f54	83	my $obj = shift;
	84	my $name = shift;
	85	$encoding{$name} = $obj;
	86	my $lc = lc($name);
	87	define_alias($lc => $obj) unless $lc eq $name;
	88	while (@_)
	89	{
	90	my $alias = shift;
	91	define_alias($alias,$obj);
	92	}
	93	return $obj;
656753f8	94	}
656753f8	95
656753f8	96	sub getEncoding
656753f8	97	{
dd9703c9	98	my ($class,$name,$skip_external) = @_;
18586f54	99	my $enc;
	100	if (ref($name) && $name->can('new_sequence'))
	101	{
	102	return $name;
	103	}
	104	my $lc = lc $name;
	105	if (exists $encoding{$name})
	106	{
	107	return $encoding{$name};
	108	}
	109	if (exists $encoding{$lc})
	110	{
	111	return $encoding{$lc};
	112	}
c50d192e	113
	114	my $oc = $class->findAlias($name);
	115	return $oc if defined $oc;
	116
	117	$oc = $class->findAlias($lc) if $lc ne $name;
	118	return $oc if defined $oc;
	119
dd9703c9	120	if (!$skip_external and exists $external_tables{$lc})
d1ed7747	121	{
	122	require $external_tables{$lc};
	123	return $encoding{$name} if exists $encoding{$name};
	124	}
18586f54	125
18586f54	126	return;
656753f8	127	}
656753f8	128
4411f3b6	129	sub find_encoding
4411f3b6	130	{
dd9703c9	131	my ($name,$skip_external) = @_;
dd9703c9	132	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	133	}
	134
	135	sub encode
	136	{
18586f54	137	my ($name,$string,$check) = @_;
	138	my $enc = find_encoding($name);
	139	croak("Unknown encoding '$name'") unless defined $enc;
	140	my $octets = $enc->encode($string,$check);
	141	return undef if ($check && length($string));
	142	return $octets;
4411f3b6	143	}
	144
	145	sub decode
	146	{
18586f54	147	my ($name,$octets,$check) = @_;
	148	my $enc = find_encoding($name);
	149	croak("Unknown encoding '$name'") unless defined $enc;
	150	my $string = $enc->decode($octets,$check);
	151	$_[1] = $octets if $check;
	152	return $string;
4411f3b6	153	}
	154
	155	sub from_to
	156	{
18586f54	157	my ($string,$from,$to,$check) = @_;
	158	my $f = find_encoding($from);
	159	croak("Unknown encoding '$from'") unless defined $f;
	160	my $t = find_encoding($to);
	161	croak("Unknown encoding '$to'") unless defined $t;
	162	my $uni = $f->decode($string,$check);
	163	return undef if ($check && length($string));
	164	$string = $t->encode($uni,$check);
	165	return undef if ($check && length($uni));
	166	return length($_[0] = $string);
4411f3b6	167	}
	168
	169	sub encode_utf8
	170	{
18586f54	171	my ($str) = @_;
	172	utf8::encode($str);
	173	return $str;
4411f3b6	174	}
	175
	176	sub decode_utf8
	177	{
18586f54	178	my ($str) = @_;
	179	return undef unless utf8::decode($str);
	180	return $str;
5ad8ef52	181	}
5ad8ef52	182
18586f54	183	require Encode::Encoding;
	184	require Encode::XS;
	185	require Encode::Internal;
	186	require Encode::Unicode;
	187	require Encode::utf8;
	188	require Encode::iso10646_1;
	189	require Encode::ucs2_le;
4411f3b6	190
656753f8	191	1;
656753f8	192
2a936312	193	__END__
2a936312	194
4411f3b6	195	=head1 NAME
	196
	197	Encode - character encodings
	198
	199	=head1 SYNOPSIS
	200
	201	use Encode;
	202
	203	=head1 DESCRIPTION
	204
47bfe92f	205	The C<Encode> module provides the interfaces between Perl's strings
47bfe92f	206	and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6	207
1b2c56c8	208	To find more about character encodings, please consult
5d030b67	209	L<Encode::Details> . This document focuses on programming references.
21938dfa	210
4411f3b6	211	=head1 PERL ENCODING API
	212
	213	=head2 Generic Encoding Interface
	214
	215	=over 4
	216
	217	=item *
	218
	219	$bytes = encode(ENCODING, $string[, CHECK])
	220
47bfe92f	221	Encodes string from Perl's internal form into I<ENCODING> and returns
47bfe92f	222	a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6	223
681a7c68	224	For example to convert (internally UTF-8 encoded) Unicode data
	225	to octets:
	226
	227	$octets = encode("utf8", $unicode);
	228
4411f3b6	229	=item *
	230
	231	$string = decode(ENCODING, $bytes[, CHECK])
	232
47bfe92f	233	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
	234	internal form and returns the resulting string. For CHECK see
	235	L</"Handling Malformed Data">.
	236
1b2c56c8	237	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	238
	239	$utf8 = decode("latin1", $latin1);
	240
47bfe92f	241	=item *
	242
	243	from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
	244
2b106fbe	245	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	246	in $string originally get to be in FROM_ENCODING? Either using
e9692b5b	247	encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe	248	see L</"Handling Malformed Data">.
2b106fbe	249
1b2c56c8	250	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	251
	252	from_to($data, "iso-8859-1", "utf-8");
	253
	254	and to convert it back:
	255
	256	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	257
ab97ca19	258	Note that because the conversion happens in place, the data to be
	259	converted cannot be a string constant, it must be a scalar variable.
	260
4411f3b6	261	=back
	262
	263	=head2 Handling Malformed Data
	264
	265	If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f	266	be UTF-8, an optional lexical warning (category utf8) is given. If
47bfe92f	267	CHECK is true but not a code reference, dies.
4411f3b6	268
47bfe92f	269	It would desirable to have a way to indicate that transform should use
47bfe92f	270	the encodings "replacement character" - no such mechanism is defined yet.
4411f3b6	271
	272	It is also planned to allow I<CHECK> to be a code reference.
	273
47bfe92f	274	This is not yet implemented as there are design issues with what its
47bfe92f	275	arguments should be and how it returns its results.
4411f3b6	276
	277	=over 4
	278
	279	=item Scheme 1
	280
	281	Passed remaining fragment of string being processed.
	282	Modifies it in place to remove bytes/characters it can understand
	283	and returns a string used to represent them.
	284	e.g.
	285
	286	sub fixup {
	287	my $ch = substr($_[0],0,1,'');
	288	return sprintf("\x{%02X}",ord($ch);
	289	}
	290
	291	This scheme is close to how underlying C code for Encode works, but gives
	292	the fixup routine very little context.
	293
	294	=item Scheme 2
	295
47bfe92f	296	Passed original string, and an index into it of the problem area, and
	297	output string so far. Appends what it will to output string and
	298	returns new index into original string. For example:
4411f3b6	299
	300	sub fixup {
	301	# my ($s,$i,$d) = @_;
	302	my $ch = substr($_[0],$_[1],1);
	303	$_[2] .= sprintf("\x{%02X}",ord($ch);
	304	return $_[1]+1;
	305	}
	306
47bfe92f	307	This scheme gives maximal control to the fixup routine but is more
	308	complicated to code, and may need internals of Encode to be tweaked to
	309	keep original string intact.
4411f3b6	310
	311	=item Other Schemes
	312
	313	Hybrids of above.
	314
	315	Multiple return values rather than in-place modifications.
	316
	317	Index into the string could be pos($str) allowing s/\G...//.
	318
	319	=back
	320
	321	=head2 UTF-8 / utf8
	322
	323	The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f	324	the entire Unicode repertiore as sequences of octets. This encoding is
	325	expected to become very widespread. Perl can use this form internaly
	326	to represent strings, so conversions to and from this form are
	327	particularly efficient (as octets in memory do not have to change,
	328	just the meta-data that tells Perl how to treat them).
4411f3b6	329
	330	=over 4
	331
	332	=item *
	333
	334	$bytes = encode_utf8($string);
	335
47bfe92f	336	The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6	337	and the resulting octets returned as a sequence of bytes. All possible
	338	characters have a UTF-8 representation so this function cannot fail.
	339
	340	=item *
	341
	342	$string = decode_utf8($bytes [,CHECK]);
	343
47bfe92f	344	The sequence of octets represented by $bytes is decoded from UTF-8
	345	into a sequence of logical characters. Not all sequences of octets
	346	form valid UTF-8 encodings, so it is possible for this call to fail.
	347	For CHECK see L</"Handling Malformed Data">.
4411f3b6	348
	349	=back
	350
51ef4e11	351	=head2 Listing available encodings
	352
	353	use Encode qw(encodings);
	354	@list = encodings();
	355
5d030b67	356	Returns a list of the canonical names of the available encodings.
	357
	358	To find which encodings are suppoted by this package in details,
	359	see L<Encode::Supported>.
51ef4e11	360
	361	=head2 Defining Aliases
	362
	363	use Encode qw(define_alias);
	364	define_alias( newName => ENCODING);
	365
47bfe92f	366	Allows newName to be used as am alias for ENCODING. ENCODING may be
47bfe92f	367	either the name of an encoding or and encoding object (as above).
51ef4e11	368
5d030b67	369	See L<Encode::Alias> on details.
51ef4e11	370
1b2c56c8	371	=head1 Defining Encodings
51ef4e11	372
e9692b5b	373	use Encode qw(define_alias);
e9692b5b	374	define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11	375
47bfe92f	376	Causes I<canonicalName> to be associated with I<$object>. The object
1b2c56c8	377	should provide the interface described in L<Encode::Encoding>
47bfe92f	378	below. If more than two arguments are provided then additional
47bfe92f	379	arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11	380
4411f3b6	381	=head1 Encoding and IO
	382
	383	It is very common to want to do encoding transformations when
	384	reading or writing files, network connections, pipes etc.
47bfe92f	385	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	386	C<Encode> provides a "layer" (See L<perliol>) which can transform
	387	data as it is read or written.
	388
8e86646e	389	Here is how the blind poet would modernise the encoding:
8e86646e	390
42234700	391	use Encode;
8e86646e	392	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	393	open(my $utf8,'>:utf8','iliad.utf8');
	394	my @epic = <$iliad>;
	395	print $utf8 @epic;
	396	close($utf8);
	397	close($illiad);
4411f3b6	398
	399	In addition the new IO system can also be configured to read/write
	400	UTF-8 encoded characters (as noted above this is efficient):
	401
e9692b5b	402	open(my $fh,'>:utf8','anything');
e9692b5b	403	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	404
	405	Either of the above forms of "layer" specifications can be made the default
	406	for a lexical scope with the C<use open ...> pragma. See L<open>.
	407
	408	Once a handle is open is layers can be altered using C<binmode>.
	409
47bfe92f	410	Without any such configuration, or if Perl itself is built using
4411f3b6	411	system's own IO, then write operations assume that file handle accepts
	412	only I<bytes> and will C<die> if a character larger than 255 is
	413	written to the handle. When reading, each octet from the handle
	414	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	415	behaviour as bytes-only languages (including Perl before v5.6) would
	416	have, and is sufficient to handle native 8-bit encodings
	417	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	418	other encodings and binary data.
	419
	420	In other cases it is the programs responsibility to transform
	421	characters into bytes using the API above before doing writes, and to
	422	transform the bytes read from a handle into characters before doing
	423	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	424
47bfe92f	425	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	426	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	427	(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
47bfe92f	428
e9692b5b	429	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	430	open(G, ">:utf8", "data.utf") or die $!;
	431	while (<F>) { print G }
	432
	433	# Could also do "print G <F>" but that would pull
	434	# the whole file into memory just to write it out again.
	435
	436	More examples:
47bfe92f	437
e9692b5b	438	open(my $f, "<:encoding(cp1252)")
	439	open(my $g, ">:encoding(iso-8859-2)")
	440	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	441
47bfe92f	442	See L<PerlIO> for more information.
4411f3b6	443
1768d7eb	444	See also L<encoding> for how to change the default encoding of the
d521382b	445	data in your script.
1768d7eb	446
4411f3b6	447	=head1 Messing with Perl's Internals
4411f3b6	448
47bfe92f	449	The following API uses parts of Perl's internals in the current
47bfe92f	450	implementation. As such they are efficient, but may change.
4411f3b6	451
	452	=over 4
	453
4411f3b6	454	=item * is_utf8(STRING [, CHECK])
	455
	456	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	457	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	458	UTF-8. Returns true if successful, false otherwise.
4411f3b6	459
4411f3b6	460	=item *
	461
	462	_utf8_on(STRING)
	463
	464	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	465	B<not> checked for being well-formed UTF-8. Do not use unless you
	466	B<know> that the STRING is well-formed UTF-8. Returns the previous
	467	state of the UTF-8 flag (so please don't test the return value as
	468	I<not> success or failure), or C<undef> if STRING is not a string.
	469
	470	=item *
	471
	472	_utf8_off(STRING)
	473
	474	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	475	Returns the previous state of the UTF-8 flag (so please don't test the
	476	return value as I<not> success or failure), or C<undef> if STRING is
	477	not a string.
	478
	479	=back
	480
	481	=head1 SEE ALSO
	482
5d030b67	483	L<Encode::Details>,
	484	L<Encode::Encoding>,
	485	L<Encode::Supported>,
	486	L<PerlIO>,
	487	L<encoding>,
	488	L<perlebcdic>,
	489	L<perlfunc/open>,
	490	L<perlunicode>,
	491	L<utf8>,
	492	the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6	493
4411f3b6	494	=cut