[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;
use strict;
# The version is derived from the revision-control "Revision" keyword text:
# every run of digits is extracted, the first is printed as-is followed by
# a dot, and each remaining one is printed as two zero-padded digits.
our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# When true, encodings() emits a warning before each module it requires.
our $DEBUG = 0;

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw (
  encode
  decode
  encode_utf8
  decode_utf8
  find_encoding
  encodings
);

# Exported on request only.  Apart from define_encoding and from_to, none
# of these names has a Perl definition in this file; presumably they are
# supplied by the compiled code loaded by bootstrap below - TODO confirm.
our @EXPORT_OK =
    qw(
       define_encoding
       from_to
       is_utf8
       is_8bit
       is_16bit
       utf8_upgrade
       utf8_downgrade
       _utf8_on
       _utf8_off
      );

# Load the compiled part of this module (bootstrap is inherited from
# DynaLoader via @ISA).
bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# True when "A" has ordinal 193, which is how this file detects an EBCDIC
# platform (see the CJK autoload section that tests it).
our $ON_EBCDIC = (ord("A") == 193);
use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
# %Encoding maps a registered encoding name to its encoding object; it is
# filled in by define_encoding().
our %Encoding;
# %ExtModule maps an encoding name to the module file that getEncoding()
# requires on demand when that name is not yet registered.
our %ExtModule;

# Code page numbers; each "cp<N>" name defaults to Encode/Byte.pm unless a
# later group below overrides it.
my @codepages = qw(
    37   424  437  500  737  775  850  852  855
    856  857  860  861  862  863  864  865  866
    869  874  875  932  936  949  950  1006 1026
    1047 1250 1251 1252 1253 1254 1255 1256 1257
    1258
);

# Suffixes of the "mac<Name>" encodings that default to Encode/Byte.pm.
my @macintosh = qw(
    CentralEurRoman Croatian Cyrillic  Greek
    Iceland         Roman    Rumanian  Sami
    Thai            Turkish  Ukrainian
);

# Autoload table as (module file => [encoding names]) pairs.  The pairs are
# applied in order, so a name listed again further down replaces the module
# chosen for it earlier (e.g. cp500 ends up in Encode/EBCDIC.pm).
my @autoload_groups = (
    'Encode/Byte.pm' => [
        (map { 'iso-8859-' . $_ } 2 .. 11, 13 .. 16),
        (map { 'cp' . $_ } @codepages),
        (map { 'mac' . $_ } @macintosh),
        qw(koi8-r viscii),
    ],
    'Encode/EBCDIC.pm' => [qw(posix-bc cp037 cp1026 cp1047 cp500 cp875)],
    'Encode/Symbol.pm' => [qw(dingbats macDingbats macSymbol symbol)],
);

# CJK added to autoload unless EBCDIC env
if (!$ON_EBCDIC)
{
    push @autoload_groups,
        'Encode/CN.pm' => [qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165)],
        'Encode/JP.pm' => [qw(euc-jp iso-2022-jp iso-2022-jp-1 7bit-jis
                              shiftjis macJapanese cp932)],
        'Encode/KR.pm' => [qw(euc-kr ksc5601 macKorean cp949)],
        'Encode/TW.pm' => [qw(big5 big5-hkscs cp950)],
        'Encode/HanExtra.pm' => [qw(gb18030 big5plus euc-tw)];
}

while (my ($module, $names) = splice(@autoload_groups, 0, 2))
{
    foreach my $encoding_name (@$names)
    {
        $ExtModule{$encoding_name} = $module;
    }
}


# Class method: Encode->encodings([":all" | MODULE_FILE, ...])
#
# Requires the module files given as arguments (or, when the first argument
# is ":all", every module file listed in %ExtModule), ignoring any load
# failure, and then returns the names currently registered in %Encoding,
# minus 'Internal', sorted case-insensitively.  In scalar context the
# result is the number of names.
sub encodings
{
    my ($class, @wanted) = @_;

    my @files = @wanted;
    @files = values %ExtModule if @wanted && $wanted[0] eq ":all";

    foreach my $file (@files)
    {
        warn "about to require $file;" if $DEBUG;
        # Best effort: a module that cannot be loaded is simply skipped.
        eval { require $file; };
    }

    my @names  = grep { $_ ne 'Internal' } keys %Encoding;
    my @sorted = sort { lc($a) cmp lc($b) } @names;
    return @sorted;
}

# define_encoding($obj, $name [, @aliases])
#
# Registers $obj in %Encoding under $name.  If $name is not already all
# lowercase, its lowercase form is defined as an alias for $obj, and every
# further argument is defined as an alias for $obj as well.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $folded = lc($name);
    if ($folded ne $name)
    {
        define_alias($folded => $obj);
    }

    foreach my $alias (@aliases)
    {
        define_alias($alias, $obj);
    }

    return $obj;
}

# Class method: $class->getEncoding($name [, $skip_external])
#
# Resolves $name to an encoding object and returns it, or returns nothing
# (undef in scalar context, the empty list in list context) when the name
# cannot be resolved.  The lookup order is:
#
#   1. $name itself, if it is a reference to something that can
#      new_sequence() - it is returned unchanged;
#   2. %Encoding, under the name as given and then under its lowercase form;
#   3. $class->find_alias(), for the name as given and then its lowercase
#      form;
#   4. unless $skip_external is true, the module file listed for the name
#      in %ExtModule is required and %Encoding is consulted again.
#
# Step 4 probes %ExtModule with the name as given before its lowercase
# form, because that table contains mixed-case keys (e.g. 'macJapanese')
# which a lowercase-only probe can never match.  After loading, the result
# is looked up under both spellings, so that a request such as "EUC-JP"
# finds the 'euc-jp' entry the freshly loaded module registered.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    if (ref($name) && $name->can('new_sequence'))
    {
        return $name;
    }

    my $lc = lc $name;

    for my $known ($name, $lc)
    {
        return $Encoding{$known} if exists $Encoding{$known};
    }

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
        my ($key) = grep { exists $ExtModule{$_} } ($name, $lc);
        if (defined $key)
        {
            # Loading is best effort: a listed module may not be installed,
            # in which case the lookups below simply find nothing.
            eval { require $ExtModule{$key}; };
            for my $loaded ($name, $lc)
            {
                return $Encoding{$loaded} if exists $Encoding{$loaded};
            }
        }
    }

    return;
}

# find_encoding($name [, $skip_external])
#
# Plain-function front end to the getEncoding() class method of this
# package; both arguments are passed through and its result is returned
# in the caller's context.
sub find_encoding
{
    my $name          = $_[0];
    my $skip_external = $_[1];
    return __PACKAGE__->getEncoding($name, $skip_external);
}

# encode($name, $string [, $check])
#
# Looks up the encoding called $name (croaking if it is unknown) and asks
# it to encode a copy of $string.  Returns the resulting octets, or undef
# when $check is true and the copy still has non-zero length after the
# encoder has run.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }
    return $octets;
}

# decode($name, $octets [, $check])
#
# Looks up the encoding called $name (croaking if it is unknown), asks it
# to decode a copy of $octets and returns the resulting string.  When
# $check is true, the copy - as the decoder left it - is assigned back to
# the caller's second argument.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check)
    {
        # $_[1] aliases the caller's variable, so this updates it in place.
        $_[1] = $octets;
    }
    return $string;
}

# from_to($string, $from, $to [, $check])
#
# Converts the caller's first argument in place: it is decoded with the
# encoding called $from and the result is encoded with the encoding called
# $to.  Croaks if either encoding is unknown.  Returns undef, leaving the
# caller's variable untouched, when $check is true and either step leaves
# its input with non-zero length.  Otherwise the converted data is stored
# in the caller's variable and its length is returned (undef if the
# converted data is undefined).
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $chars = $source->decode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }

    $string = $target->encode($chars, $check);
    if ($check && length($chars))
    {
        return undef;
    }

    # $_[0] aliases the caller's variable, so this is the in-place update.
    $_[0] = $string;
    return defined($_[0]) ? length($string) : undef;
}

# encode_utf8($str)
#
# Returns a copy of $str converted by utf8::encode(); the caller's
# variable is not modified.
sub encode_utf8
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}

# decode_utf8($str)
#
# Runs utf8::decode() on a copy of $str.  Returns the converted copy when
# utf8::decode() reports success and undef otherwise; the caller's
# variable is not modified.
sub decode_utf8
{
    my $chars = $_[0];
    if (utf8::decode($chars))
    {
        return $chars;
    }
    return undef;
}

require Encode::Encoding;
require Encode::XS;
require Encode::Internal;
require Encode::Unicode;
require Encode::utf8;
require Encode::10646_1;
require Encode::ucs2_le;

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;


=head2 Table of Contents

Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:

  Name			        Description
  --------------------------------------------------------
  Encode::Alias         Alias definitions to encodings
  Encode::Encoding      Encode Implementation Base Class
  Encode::Supported     List of Supported Encodings
  Encode::CN            Simplified Chinese Encodings
  Encode::JP            Japanese Encodings
  Encode::KR            Korean Encodings
  Encode::TW            Traditional Chinese Encodings
  --------------------------------------------------------

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of
B<characters>.

The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
value of a character (as returned by C<ord(ch)>) is the "Unicode
codepoint" for the character (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).

Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.

When Perl is processing "binary data" the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
byte has 256 possible values it easily fits in Perl's much larger
"logical character".

=head2 TERMINOLOGY

=over 4

=item *

I<character>: a character in the range 0..(2**32-1) (or more).
(What Perl's strings are made of.)

=item *

I<byte>: a character in the range 0..255
(A special case of a Perl character.)

=item *

I<octet>: 8 bits of data, with ordinal values 0..255
(Term for bytes passed to or from a non-Perl context, e.g. disk file.)

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head1 PERL ENCODING API

=over 4

=item $bytes  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  ENCODING can be either a canonical name or
alias.  For encoding names and aliases, see L</"Defining Aliases">.
For CHECK see L</"Handling Malformed Data">.

For example, to convert an (internally UTF-8 encoded) Unicode string to
iso-8859-1 (also known as Latin1):

  $octets = encode("iso-8859-1", $unicode);

=item $string = decode(ENCODING, $bytes[, CHECK])

Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

  $utf8 = decode("iso-8859-1", $latin1);

=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])

Convert B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.
For encoding names and aliases, see L</"Defining Aliases">. 
For CHECK see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

from_to() returns the length of the converted string on success, undef
otherwise.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of a specific module:

  @with_jp = Encode->encodings("Encode/JP.pm");

Note in this case you have to say C<"Encode/JP.pm"> instead of
C<"Encode::JP">.

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.


=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(newName => ENCODING);

After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.

See L<Encode::Alias> for details.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open, its layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)")
    open(my $g, ">:encoding(iso-8859-2)")
    open(my $h, ">:encoding(latin9)")       # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Handling Malformed Data

If CHECK is not set, C<undef> is returned.  If the data is supposed to
be UTF-8, an optional lexical warning (category utf8) is given.  If
CHECK is true but not a code reference, dies.

It would be desirable to have a way to indicate that transform should use
the encoding's "replacement character" - no such mechanism is defined yet.

It is also planned to allow I<CHECK> to be a code reference.

This is not yet implemented as there are design issues with what its
arguments should be and how it returns its results.

=over 4

=item Scheme 1

Passed remaining fragment of string being processed.
Modifies it in place to remove bytes/characters it can understand
and returns a string used to represent them.
e.g.

 sub fixup {
   my $ch = substr($_[0],0,1,'');
   return sprintf("\x{%02X}",ord($ch));
 }

This scheme is close to how underlying C code for Encode works, but gives
the fixup routine very little context.

=item Scheme 2

Passed original string, and an index into it of the problem area, and
output string so far.  Appends what it will to output string and
returns new index into original string.  For example:

 sub fixup {
   # my ($s,$i,$d) = @_;
   my $ch = substr($_[0],$_[1],1);
   $_[2] .= sprintf("\x{%02X}",ord($ch));
   return $_[1]+1;
 }

This scheme gives maximal control to the fixup routine but is more
complicated to code, and may need internals of Encode to be tweaked to
keep original string intact.

=item Other Schemes

Hybrids of above.

Multiple return values rather than in-place modifications.

Index into the string could be C<pos($str)> allowing C<s/\G...//>.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item $bytes = encode_utf8($string);

The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item $string = decode_utf8($bytes [, CHECK]);

The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.

=back

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, 'canonicalName' [, alias...]);

I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please do I<not> treat the return value as
indicating success or failure), or C<undef> if STRING is not a string.

=item _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please do I<not> treat
the return value as indicating success or failure), or C<undef> if
STRING is not a string.

=back

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<PerlIO>, 
L<encoding>,
L<perlebcdic>, 
L<perlfunc/open>, 
L<perlunicode>, 
L<utf8>, 
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
a999c27c	3	our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c	4	our $DEBUG = 0;
2c674647	5
	6	require DynaLoader;
	7	require Exporter;
	8
51ef4e11	9	our @ISA = qw(Exporter DynaLoader);
2c674647	10
4411f3b6	11	# Public, encouraged API is exported by default
51ef4e11	12	our @EXPORT = qw (
4411f3b6	13	encode
	14	decode
	15	encode_utf8
	16	decode_utf8
	17	find_encoding
51ef4e11	18	encodings
4411f3b6	19	);
4411f3b6	20
51ef4e11	21	our @EXPORT_OK =
2c674647	22	qw(
51ef4e11	23	define_encoding
2c674647	24	from_to
2c674647	25	is_utf8
4411f3b6	26	is_8bit
4411f3b6	27	is_16bit
a12c0f56	28	utf8_upgrade
a12c0f56	29	utf8_downgrade
4411f3b6	30	_utf8_on
4411f3b6	31	_utf8_off
2c674647	32	);
	33
	34	bootstrap Encode ();
	35
4411f3b6	36	# Documentation moved after __END__ for speed - NI-S
2c674647	37
bf230f3d	38	use Carp;
bf230f3d	39
a63c962f	40	our $ON_EBCDIC = (ord("A") == 193);
5d030b67	41	use Encode::Alias;
5d030b67	42
5129552c	43	# Make a %Encoding package variable to allow a certain amount of cheating
5129552c	44	our %Encoding;
a999c27c	45	our %ExtModule;
	46
	47	my @codepages = qw(
	48	37 424 437 500 737 775 850 852 855
	49	856 857 860 861 862 863 864 865 866
	50	869 874 875 932 936 949 950 1006 1026
	51	1047 1250 1251 1252 1253 1254 1255 1256 1257
	52	1258
	53	);
	54
	55	my @macintosh = qw(
	56	CentralEurRoman Croatian Cyrillic Greek
	57	Iceland Roman Rumanian Sami
	58	Thai Turkish Ukrainian
	59	);
d1ed7747	60
5129552c	61	for my $k (2..11,13..16){
	62	$ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
	63	}
	64
a999c27c	65	for my $k (@codepages){
5129552c	66	$ExtModule{"cp$k"} = 'Encode/Byte.pm';
	67	}
	68
a999c27c	69	for my $k (@macintosh)
	70	{
	71	$ExtModule{"mac$k"} = 'Encode/Byte.pm';
	72	}
	73
	74	%ExtModule =
	75	(%ExtModule,
	76	'koi8-r' => 'Encode/Byte.pm',
	77	'posix-bc' => 'Encode/EBCDIC.pm',
	78	cp037 => 'Encode/EBCDIC.pm',
	79	cp1026 => 'Encode/EBCDIC.pm',
	80	cp1047 => 'Encode/EBCDIC.pm',
	81	cp500 => 'Encode/EBCDIC.pm',
	82	cp875 => 'Encode/EBCDIC.pm',
	83	dingbats => 'Encode/Symbol.pm',
	84	macDingbats => 'Encode/Symbol.pm',
	85	macSymbol => 'Encode/Symbol.pm',
	86	symbol => 'Encode/Symbol.pm',
	87	viscii => 'Encode/Byte.pm',
	88	);
	89
a63c962f	90	unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
a999c27c	91	%ExtModule =(%ExtModule,
a63c962f	92	'euc-cn' => 'Encode/CN.pm',
	93	gb2312 => 'Encode/CN.pm',
	94	gb12345 => 'Encode/CN.pm',
	95	gbk => 'Encode/CN.pm',
	96	cp936 => 'Encode/CN.pm',
	97	'iso-ir-165' => 'Encode/CN.pm',
	98	'euc-jp' => 'Encode/JP.pm',
	99	'iso-2022-jp' => 'Encode/JP.pm',
	100	'iso-2022-jp-1' => 'Encode/JP.pm',
	101	'7bit-jis' => 'Encode/JP.pm',
	102	shiftjis => 'Encode/JP.pm',
a999c27c	103	macJapanese => 'Encode/JP.pm',
a63c962f	104	cp932 => 'Encode/JP.pm',
	105	'euc-kr' => 'Encode/KR.pm',
	106	ksc5601 => 'Encode/KR.pm',
a999c27c	107	macKorean => 'Encode/KR.pm',
a63c962f	108	cp949 => 'Encode/KR.pm',
	109	big5 => 'Encode/TW.pm',
	110	'big5-hkscs' => 'Encode/TW.pm',
	111	cp950 => 'Encode/TW.pm',
	112	gb18030 => 'Encode/HanExtra.pm',
	113	big5plus => 'Encode/HanExtra.pm',
	114	'euc-tw' => 'Encode/HanExtra.pm',
	115	);
	116	}
	117
a999c27c	118
a999c27c	119
5129552c	120
656753f8	121	sub encodings
656753f8	122	{
5129552c	123	my $class = shift;
071db25d	124	my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
5129552c	125	for my $m (@modules)
	126	{
	127	$DEBUG and warn "about to require $m;";
	128	eval { require $m; };
	129	}
	130	return
	131	map({$_->[0]}
	132	sort({$a->[1] cmp $b->[1]}
	133	map({[$_, lc $_]}
	134	grep({ $_ ne 'Internal' } keys %Encoding))));
51ef4e11	135	}
51ef4e11	136
51ef4e11	137	sub define_encoding
51ef4e11	138	{
18586f54	139	my $obj = shift;
18586f54	140	my $name = shift;
5129552c	141	$Encoding{$name} = $obj;
18586f54	142	my $lc = lc($name);
	143	define_alias($lc => $obj) unless $lc eq $name;
	144	while (@_)
	145	{
	146	my $alias = shift;
	147	define_alias($alias,$obj);
	148	}
	149	return $obj;
656753f8	150	}
656753f8	151
656753f8	152	sub getEncoding
656753f8	153	{
dd9703c9	154	my ($class,$name,$skip_external) = @_;
18586f54	155	my $enc;
	156	if (ref($name) && $name->can('new_sequence'))
	157	{
	158	return $name;
	159	}
	160	my $lc = lc $name;
5129552c	161	if (exists $Encoding{$name})
18586f54	162	{
5129552c	163	return $Encoding{$name};
18586f54	164	}
5129552c	165	if (exists $Encoding{$lc})
18586f54	166	{
5129552c	167	return $Encoding{$lc};
18586f54	168	}
c50d192e	169
5129552c	170	my $oc = $class->find_alias($name);
c50d192e	171	return $oc if defined $oc;
c50d192e	172
5129552c	173	$oc = $class->find_alias($lc) if $lc ne $name;
c50d192e	174	return $oc if defined $oc;
c50d192e	175
5129552c	176	if (!$skip_external and exists $ExtModule{$lc})
d1ed7747	177	{
5129552c	178	eval{ require $ExtModule{$lc}; };
5129552c	179	return $Encoding{$name} if exists $Encoding{$name};
d1ed7747	180	}
18586f54	181
18586f54	182	return;
656753f8	183	}
656753f8	184
4411f3b6	185	sub find_encoding
4411f3b6	186	{
dd9703c9	187	my ($name,$skip_external) = @_;
dd9703c9	188	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	189	}
	190
	191	sub encode
	192	{
18586f54	193	my ($name,$string,$check) = @_;
	194	my $enc = find_encoding($name);
	195	croak("Unknown encoding '$name'") unless defined $enc;
	196	my $octets = $enc->encode($string,$check);
	197	return undef if ($check && length($string));
	198	return $octets;
4411f3b6	199	}
	200
	201	sub decode
	202	{
18586f54	203	my ($name,$octets,$check) = @_;
	204	my $enc = find_encoding($name);
	205	croak("Unknown encoding '$name'") unless defined $enc;
	206	my $string = $enc->decode($octets,$check);
	207	$_[1] = $octets if $check;
	208	return $string;
4411f3b6	209	}
	210
	211	sub from_to
	212	{
18586f54	213	my ($string,$from,$to,$check) = @_;
	214	my $f = find_encoding($from);
	215	croak("Unknown encoding '$from'") unless defined $f;
	216	my $t = find_encoding($to);
	217	croak("Unknown encoding '$to'") unless defined $t;
	218	my $uni = $f->decode($string,$check);
	219	return undef if ($check && length($string));
a999c27c	220	$string = $t->encode($uni,$check);
18586f54	221	return undef if ($check && length($uni));
3ef515df	222	return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6	223	}
	224
	225	sub encode_utf8
	226	{
18586f54	227	my ($str) = @_;
	228	utf8::encode($str);
	229	return $str;
4411f3b6	230	}
	231
	232	sub decode_utf8
	233	{
18586f54	234	my ($str) = @_;
	235	return undef unless utf8::decode($str);
	236	return $str;
5ad8ef52	237	}
5ad8ef52	238
18586f54	239	require Encode::Encoding;
	240	require Encode::XS;
	241	require Encode::Internal;
	242	require Encode::Unicode;
	243	require Encode::utf8;
64ffdd5e	244	require Encode::10646_1;
18586f54	245	require Encode::ucs2_le;
4411f3b6	246
656753f8	247	1;
656753f8	248
2a936312	249	__END__
2a936312	250
4411f3b6	251	=head1 NAME
	252
	253	Encode - character encodings
	254
	255	=head1 SYNOPSIS
	256
	257	use Encode;
	258
67d7b5ef	259
	260	=head2 Table of Contents
	261
	262	Encode consists of a collection of modules which details are too big
	263	to fit in one document. This POD itself explains the top-level APIs
	264	and general topics at a glance. For other topics and more details,
	265	see the PODs below;
	266
	267	Name Description
	268	--------------------------------------------------------
	269	Encode::Alias Alias defintions to encodings
	270	Encode::Encoding Encode Implementation Base Class
	271	Encode::Supported List of Supported Encodings
	272	Encode::CN Simplified Chinese Encodings
	273	Encode::JP Japanese Encodings
	274	Encode::KR Korean Encodings
	275	Encode::TW Traditional Chinese Encodings
	276	--------------------------------------------------------
	277
4411f3b6	278	=head1 DESCRIPTION
4411f3b6	279
47bfe92f	280	The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef	281	and the rest of the system. Perl strings are sequences of
	282	B<characters>.
	283
	284	The repertoire of characters that Perl can represent is at least that
	285	defined by the Unicode Consortium. On most platforms the ordinal
	286	values of the characters (as returned by C<ord(ch)>) is the "Unicode
	287	codepoint" for the character (the exceptions are those platforms where
	288	the legacy encoding is some variant of EBCDIC rather than a super-set
	289	of ASCII - see L<perlebcdic>).
	290
	291	Traditionally computer data has been moved around in 8-bit chunks
	292	often called "bytes". These chunks are also known as "octets" in
	293	networking standards. Perl is widely used to manipulate data of many
	294	types - not only strings of characters representing human or computer
	295	languages but also "binary" data being the machines representation of
	296	numbers, pixels in an image - or just about anything.
	297
	298	When Perl is processing "binary data" the programmer wants Perl to
	299	process "sequences of bytes". This is not a problem for Perl - as a
	300	byte has 256 possible values it easily fits in Perl's much larger
	301	"logical character".
	302
	303	=head2 TERMINOLOGY
4411f3b6	304
67d7b5ef	305	=over 4
21938dfa	306
67d7b5ef	307	=item *
	308
	309	I<character>: a character in the range 0..(2**32-1) (or more).
	310	(What Perl's strings are made of.)
	311
	312	=item *
	313
	314	I<byte>: a character in the range 0..255
	315	(A special case of a Perl character.)
	316
	317	=item *
	318
	319	I<octet>: 8 bits of data, with ordinal values 0..255
	320	(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
	321
	322	=back
4411f3b6	323
67d7b5ef	324	The marker [INTERNAL] marks Internal Implementation Details, in
	325	general meant only for those who think they know what they are doing,
	326	and such details may change in future releases.
	327
	328	=head1 PERL ENCODING API
4411f3b6	329
	330	=over 4
	331
a63c962f	332	=item $bytes = encode(ENCODING, $string[, CHECK])
4411f3b6	333
47bfe92f	334	Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef	335	a sequence of octets. ENCODING can be either a canonical name or
	336	alias. For encoding names and aliases, see L</"Defining Aliases">.
	337	For CHECK see L</"Handling Malformed Data">.
4411f3b6	338
67d7b5ef	339	For example to convert (internally UTF-8 encoded) Unicode string to
67d7b5ef	340	iso-8859-1 (also known as Latin1),
681a7c68	341
67d7b5ef	342	$octets = encode("iso-8859-1", $unicode);
681a7c68	343
a63c962f	344	=item $string = decode(ENCODING, $bytes[, CHECK])
4411f3b6	345
47bfe92f	346	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef	347	internal form and returns the resulting string. as in encode(),
	348	ENCODING can be either a canonical name or alias. For encoding names
	349	and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f	350	L</"Handling Malformed Data">.
47bfe92f	351
1b2c56c8	352	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	353
67d7b5ef	354	$utf8 = decode("iso-8859-1", $latin1);
681a7c68	355
3ef515df	356	=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
47bfe92f	357
2b106fbe	358	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	359	in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef	360	encode() or through PerlIO: See L</"Encoding and IO">.
	361	For encoding names and aliases, see L</"Defining Aliases">.
	362	For CHECK see L</"Handling Malformed Data">.
2b106fbe	363
1b2c56c8	364	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	365
	366	from_to($data, "iso-8859-1", "utf-8");
	367
	368	and to convert it back:
	369
	370	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	371
ab97ca19	372	Note that because the conversion happens in place, the data to be
	373	converted cannot be a string constant, it must be a scalar variable.
	374
3ef515df	375	from_to() return the length of the converted string on success, undef
	376	otherwise.
	377
4411f3b6	378	=back
4411f3b6	379
51ef4e11	380	=head2 Listing available encodings
51ef4e11	381
5129552c	382	use Encode;
	383	@list = Encode->encodings();
	384
	385	Returns a list of the canonical names of the available encodings that
	386	are loaded. To get a list of all available encodings including the
	387	ones that are not loaded yet, say
	388
	389	@all_encodings = Encode->encodings(":all");
	390
	391	Or you can give the name of specific module.
	392
	393	@with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11	394
a63c962f	395	Note in this case you have to say C<"Encode/JP.pm"> instead of
a63c962f	396	C<"Encode::JP">.
5d030b67	397
a63c962f	398	To find which encodings are supported by this package in details,
5d030b67	399	see L<Encode::Supported>.
51ef4e11	400
67d7b5ef	401
51ef4e11	402	=head2 Defining Aliases
51ef4e11	403
67d7b5ef	404	To add new alias to a given encoding, Use;
67d7b5ef	405
5129552c	406	use Encode;
5129552c	407	use Encode::Alias;
a63c962f	408	define_alias(newName => ENCODING);
51ef4e11	409
3ef515df	410	After that, newName can be used as an alias for ENCODING.
	411	ENCODING may be either the name of an encoding or an I<encoding
	412	object>
51ef4e11	413
5d030b67	414	See L<Encode::Alias> on details.
51ef4e11	415
4411f3b6	416	=head1 Encoding and IO
	417
	418	It is very common to want to do encoding transformations when
	419	reading or writing files, network connections, pipes etc.
47bfe92f	420	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	421	C<Encode> provides a "layer" (See L<perliol>) which can transform
	422	data as it is read or written.
	423
8e86646e	424	Here is how the blind poet would modernise the encoding:
8e86646e	425
42234700	426	use Encode;
8e86646e	427	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	428	open(my $utf8,'>:utf8','iliad.utf8');
	429	my @epic = <$iliad>;
	430	print $utf8 @epic;
	431	close($utf8);
	432	close($illiad);
4411f3b6	433
	434	In addition the new IO system can also be configured to read/write
	435	UTF-8 encoded characters (as noted above this is efficient):
	436
e9692b5b	437	open(my $fh,'>:utf8','anything');
e9692b5b	438	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	439
	440	Either of the above forms of "layer" specifications can be made the default
	441	for a lexical scope with the C<use open ...> pragma. See L<open>.
	442
	443	Once a handle is open is layers can be altered using C<binmode>.
	444
47bfe92f	445	Without any such configuration, or if Perl itself is built using
4411f3b6	446	the system's own IO, then write operations assume that the file handle accepts
	447	only I<bytes> and will C<die> if a character larger than 255 is
	448	written to the handle. When reading, each octet from the handle
	449	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	450	behaviour as bytes-only languages (including Perl before v5.6) would
	451	have, and is sufficient to handle native 8-bit encodings
	452	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	453	other encodings and binary data.
	454
	455	In other cases it is the program's responsibility to transform
	456	characters into bytes using the API above before doing writes, and to
	457	transform the bytes read from a handle into characters before doing
	458	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	459
47bfe92f	460	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	461	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	462	(Latin 1) and UTF-8 (or UTF-EBCDIC on EBCDIC machines):
47bfe92f	463
e9692b5b	464	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	465	open(G, ">:utf8", "data.utf") or die $!;
	466	while (<F>) { print G }
	467
	468	# Could also do "print G <F>" but that would pull
	469	# the whole file into memory just to write it out again.
	470
	471	More examples:
47bfe92f	472
e9692b5b	473	open(my $f, "<:encoding(cp1252)")
	474	open(my $g, ">:encoding(iso-8859-2)")
	475	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	476
47bfe92f	477	See L<PerlIO> for more information.
4411f3b6	478
1768d7eb	479	See also L<encoding> for how to change the default encoding of the
d521382b	480	data in your script.
1768d7eb	481
67d7b5ef	482	=head1 Handling Malformed Data
	483
	484	If CHECK is not set, C<undef> is returned. If the data is supposed to
	485	be UTF-8, an optional lexical warning (category utf8) is given. If
	486	CHECK is true but not a code reference, dies.
	487
	488	It would be desirable to have a way to indicate that transform should use
	489	the encoding's "replacement character" - no such mechanism is defined yet.
	490
	491	It is also planned to allow I<CHECK> to be a code reference.
	492
	493	This is not yet implemented as there are design issues with what its
	494	arguments should be and how it returns its results.
	495
	496	=over 4
	497
	498	=item Scheme 1
	499
	500	Passed remaining fragment of string being processed.
	501	Modifies it in place to remove bytes/characters it can understand
	502	and returns a string used to represent them.
	503	e.g.
	504
	505	sub fixup {
	506	my $ch = substr($_[0],0,1,'');
	507	return sprintf("\x{%02X}",ord($ch));
	508	}
	509
	510	This scheme is close to how underlying C code for Encode works, but gives
	511	the fixup routine very little context.
	512
	513	=item Scheme 2
	514
	515	Passed original string, and an index into it of the problem area, and
	516	output string so far. Appends what it will to output string and
	517	returns new index into original string. For example:
	518
	519	sub fixup {
	520	# my ($s,$i,$d) = @_;
	521	my $ch = substr($_[0],$_[1],1);
	522	$_[2] .= sprintf("\x{%02X}",ord($ch));
	523	return $_[1]+1;
	524	}
	525
	526	This scheme gives maximal control to the fixup routine but is more
	527	complicated to code, and may need internals of Encode to be tweaked to
	528	keep original string intact.
	529
	530	=item Other Schemes
	531
	532	Hybrids of above.
	533
	534	Multiple return values rather than in-place modifications.
	535
	536	Index into the string could be C<pos($str)> allowing C<s/\G...//>.
	537
	538	=back
	539
	540	=head2 UTF-8 / utf8
	541
	542	The Unicode consortium defines the UTF-8 standard as a way of encoding
	543	the entire Unicode repertoire as sequences of octets. This encoding is
	544	expected to become very widespread. Perl can use this form internally
	545	to represent strings, so conversions to and from this form are
546	particularly efficient (as octets in memory do not have to change,
547	just the meta-data that tells Perl how to treat them).
548
549	=over 4
550
551	=item $bytes = encode_utf8($string);
552
553	The characters that comprise $string are encoded in Perl's superset of UTF-8
554	and the resulting octets returned as a sequence of bytes. All possible
555	characters have a UTF-8 representation so this function cannot fail.
556
557	=item $string = decode_utf8($bytes [, CHECK]);
558
559	The sequence of octets represented by $bytes is decoded from UTF-8
560	into a sequence of logical characters. Not all sequences of octets
561	form valid UTF-8 encodings, so it is possible for this call to fail.
562	For CHECK see L</"Handling Malformed Data">.
563
564	=back
565
566	=head1 Defining Encodings
567
568	To define a new encoding, use:
569
570	use Encode qw(define_encoding);
571	define_encoding($object, 'canonicalName' [, alias...]);
572
573	I<canonicalName> will be associated with I<$object>. The object
574	should provide the interface described in L<Encode::Encoding>.
575	If more than two arguments are provided then additional
576	arguments are taken as aliases for I<$object> as for C<define_alias>.
577
4411f3b6	578	=head1 Messing with Perl's Internals
4411f3b6	579
47bfe92f	580	The following API uses parts of Perl's internals in the current
47bfe92f	581	implementation. As such they are efficient, but may change.
4411f3b6	582
	583	=over 4
	584
a63c962f	585	=item is_utf8(STRING [, CHECK])
4411f3b6	586
4411f3b6	587	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	588	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	589	UTF-8. Returns true if successful, false otherwise.
4411f3b6	590
a63c962f	591	=item _utf8_on(STRING)
4411f3b6	592
	593	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	594	B<not> checked for being well-formed UTF-8. Do not use unless you
	595	B<know> that the STRING is well-formed UTF-8. Returns the previous
	596	state of the UTF-8 flag (so please don't test the return value as
	597	I<not> success or failure), or C<undef> if STRING is not a string.
	598
a63c962f	599	=item _utf8_off(STRING)
4411f3b6	600
	601	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	602	Returns the previous state of the UTF-8 flag (so please don't test the
	603	return value as I<not> success or failure), or C<undef> if STRING is
	604	not a string.
	605
	606	=back
	607
	608	=head1 SEE ALSO
	609
5d030b67	610	L<Encode::Encoding>,
	611	L<Encode::Supported>,
	612	L<PerlIO>,
	613	L<encoding>,
	614	L<perlebcdic>,
	615	L<perlfunc/open>,
	616	L<perlunicode>,
	617	L<utf8>,
	618	the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6	619
4411f3b6	620	=cut