[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;
use strict;
our $VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
# (these are the names a plain "use Encode;" imports through Exporter).
our @EXPORT = qw (
  encode
  decode
  encode_utf8
  decode_utf8
  find_encoding
  encodings
);

# Names that are only imported when asked for explicitly,
# e.g. "use Encode qw(from_to define_alias);".
# NOTE(review): is_utf8, is_8bit, is_16bit, utf8_upgrade, utf8_downgrade,
# _utf8_on and _utf8_off have no Perl definition in this file; presumably
# they come from the XS code loaded by bootstrap - confirm.
our @EXPORT_OK =
    qw(
       define_encoding
       define_alias
       from_to
       is_utf8
       is_8bit
       is_16bit
       utf8_upgrade
       utf8_downgrade
       _utf8_on
       _utf8_off
      );

bootstrap Encode ();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# Make a %encoding package variable to allow a certain amount of cheating
# Registry of known encodings: name => encoding object.  Filled in by
# define_encoding() and consulted first by getEncoding().
our %encoding;
# Flat list of (alias, target) pairs in definition order; define_alias()
# appends to it and findAlias() walks it two elements at a time.
my @alias;  # ordered matching list
# Successful findAlias() results: looked-up name => encoding object.
my %alias;  # cached known aliases

# Translation from "latinN" numbering to ISO-8859 part numbers, used by the
# latin-N alias rule: the index is N (see the ruler below), the value is
# the part number that gets appended to "iso-8859-".
                     # 0  1  2  3  4  5   6   7   8   9  10
our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );

# Translation from "winXXX" style names to code page numbers, used by the
# winlatin alias rule.  Keys are looked up with the first letter upper-cased.
our %winlatin2cp   = (
		      'Latin1'     => 1252,
		      'Latin2'     => 1250,
		      'Cyrillic'   => 1251,
		      'Greek'      => 1253,
		      'Turkish'    => 1254,
		      'Hebrew'     => 1255,
		      'Arabic'     => 1256,
		      'Baltic'     => 1257,
		      'Vietnamese' => 1258,
		     );

# Encodings that live in separate modules: lower-cased encoding name =>
# file handed to require() by getEncoding() when the name cannot be
# resolved from %encoding or the alias list.
our %external_tables =
    (
	'euc-cn'	=> 'Encode/CN.pm',
	gb2312		=> 'Encode/CN.pm',
	gb12345		=> 'Encode/CN.pm',
	gbk		=> 'Encode/CN.pm',
	cp936		=> 'Encode/CN.pm',
	'iso-ir-165'	=> 'Encode/CN.pm',
	'euc-jp'	=> 'Encode/JP.pm',
	'iso-2022-jp'	=> 'Encode/JP.pm',
	'7bit-jis'	=> 'Encode/JP.pm',
	shiftjis	=> 'Encode/JP.pm',
	macjapan	=> 'Encode/JP.pm',
	cp932		=> 'Encode/JP.pm',
	'euc-kr'	=> 'Encode/KR.pm',
	ksc5601		=> 'Encode/KR.pm',
	cp949		=> 'Encode/KR.pm',
	big5		=> 'Encode/TW.pm',
	'big5-hkscs'	=> 'Encode/TW.pm',
	cp950		=> 'Encode/TW.pm',
	gb18030		=> 'Encode/HanExtra.pm',
	big5plus	=> 'Encode/HanExtra.pm',
	'euc-tw'	=> 'Encode/HanExtra.pm',
    );

# Return the names of all registered encodings, sorted case-insensitively.
# The 'Internal' entry is never reported.  Any arguments are ignored.
sub encodings
{
    # Pair every reportable name with its lower-cased sort key.
    my @keyed;
    for my $name (keys %encoding)
    {
        next if $name eq 'Internal';
        push @keyed, [ $name, lc $name ];
    }

    # Order by the folded key, then strip the key off again.
    return map { $_->[0] } sort { $a->[1] cmp $b->[1] } @keyed;
}

# Resolve a name through the alias rules.
#   $class - invocant (not used in the body)
#   name   - the name to look up; it is placed in a localized $_ so that
#            CODE aliases can inspect it
# Returns the encoding object the name resolves to, or undef when no rule
# produces one.  Successful resolutions are cached in %alias.
sub findAlias
{
    my $class = shift;
    local $_ = shift;
    # print "# findAlias $_\n";
    unless (exists $alias{$_})
    {
	# @alias is a flat list of (alias, target) pairs, tried in order.
	for (my $i=0; $i < @alias; $i += 2)
	{
	    my $alias = $alias[$i];
	    my $val   = $alias[$i+1];
	    my $new;
	    if (ref($alias) eq 'Regexp' && $_ =~ $alias)
	    {
		# The target is Perl source text; it is evaluated here,
		# while the captures of the match above are still set.
		$new = eval $val;
	    }
	    elsif (ref($alias) eq 'CODE')
	    {
		# The sub gets the target as its argument and sees the
		# looked-up name in $_.
		$new = &{$alias}($val)
		}
	    elsif (lc($_) eq lc($alias))
	    {
		# Plain string alias, compared case-insensitively.
		$new = $val;
	    }
	    if (defined($new))
	    {
		next if $new eq $_; # avoid (direct) recursion on bugs
		# A reference is taken to be the encoding itself; a name is
		# looked up again, which may go through further aliases.
		my $enc = (ref($new)) ? $new : find_encoding($new);
		if ($enc)
		{
		    # Only hits are cached; a miss moves on to the next rule.
		    $alias{$_} = $enc;
		    last;
		}
	    }
	}
    }
    return $alias{$_};
}

# Register one or more aliases.  Arguments are taken as a flat list of
# (alias, target) pairs and appended, in order, to the alias list that
# findAlias() consults.  A trailing unpaired alias gets an undef target.
sub define_alias
{
    while (@_)
    {
        my $alias = shift;
        my $name  = shift;
        push @alias, $alias, $name;
    }
}

# Built-in alias rules.  They are tried in the order they are defined.
# For a qr// alias the target is a string of Perl source that findAlias()
# evaluates after a successful match, which is why each target carries its
# own inner pair of double quotes and may refer to $1, $2, ...

# Allow variants of iso-8859-1 etc.
define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );

# At least HP-UX has these.
define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );

# More HP stuff.
define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );

# The Official name of ASCII.
define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );

# This is a font issue, not an encoding issue.
# (The currency symbol of the Latin 1 upper half
#  has been redefined as the euro symbol.)
# The "@euro" suffix is simply dropped.
define_alias( qr/^(.+)\@euro$/i => '"$1"' );

# Allow latin-1 style names as well
# (the number is translated through @latin2iso_num).
# NOTE(review): a number past the end of @latin2iso_num interpolates an
# undefined element, giving the name "iso-8859-".
define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );

# Allow winlatin1 style names as well
# (the captured word, first letter upper-cased, is the %winlatin2cp key).
# NOTE(review): "baltic" appears twice in the alternation; the second
# occurrence is redundant.
define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );

# Common names for non-latin preferred MIME names
# (plain string aliases: compared case-insensitively, target used as is).
define_alias( 'ascii'    => 'US-ascii',
              'cyrillic' => 'iso-8859-5',
              'arabic'   => 'iso-8859-6',
              'greek'    => 'iso-8859-7',
              'hebrew'   => 'iso-8859-8',
              'thai'     => 'iso-8859-11',
              'tis620'   => 'iso-8859-11',
	    );

# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
# And Microsoft has their own naming (again, surprisingly).
define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');

# Sometimes seen with a leading zero.
define_alias( qr/^cp037$/i => '"cp37"');

# Accept the "Romanian" spelling for the encoding named macRumanian.
define_alias( qr/^macRomanian$/i => '"macRumanian"');

# Standardize on the dashed versions.
# The target of a qr// alias is evaluated as Perl source by findAlias(), so
# it must be a quoted string inside the single quotes.  A bare 'utf-8' is
# parsed as the expression "utf - 8", which fails to compile under
# "use strict"; the eval then yields undef and the alias never resolves.
define_alias( qr/^utf8$/i  => '"utf-8"' );
define_alias( qr/^koi8r$/i => '"koi8-r"' );
define_alias( qr/^koi8u$/i => '"koi8-u"' );

# Seen in some Linuxes.
define_alias( qr/^ujis$/i => '"euc-jp"' );

# CP936 doesn't have vendor-addon for GBK, so they're identical.
define_alias( qr/^gbk$/i => '"cp936"');

# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
# TODO: Armenian encoding ARMSCII-8
# TODO: Hebrew encoding ISO-8859-8-1
# TODO: Thai encoding TCVN
# TODO: Korean encoding Johab
# TODO: Vietnamese encodings VPS
# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
#       Kannada Khmer Korean Laotian Malayalam Mongolian
#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese

# Map white space and _ to '-'
# Each match rewrites a single run of separators.  The rewritten name is
# then looked up again through find_encoding(), so a name with several
# separator runs is normalized one run at a time.  This rule is defined
# after all the specific rules above.
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );

# Register an encoding object.
#   $obj   - the encoding object
#   $name  - name to file it under in %encoding
#   @extra - any further arguments are registered as aliases for $obj
# When $name is not already all lower case, its lower-cased form is
# registered as an alias too.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @extra) = @_;

    $encoding{$name} = $obj;

    my $folded = lc($name);
    if ($folded ne $name)
    {
        define_alias($folded => $obj);
    }

    foreach my $alias (@extra)
    {
        define_alias($alias, $obj);
    }

    return $obj;
}

# Look up an encoding.
#   $class         - invocant; findAlias() is called on it
#   $name          - encoding name, or an object that can('new_sequence'),
#                    which is returned unchanged
#   $skip_external - when true, never load a module from %external_tables
# Lookup order: exact name in %encoding, lower-cased name in %encoding,
# aliases for the name, aliases for the lower-cased name, and finally the
# external module registered for the lower-cased name.
# Returns the encoding object, or nothing (empty list / undef) if unknown.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    if (ref($name) && $name->can('new_sequence'))
    {
        return $name;
    }

    my $lc = lc $name;
    return $encoding{$name} if exists $encoding{$name};
    return $encoding{$lc}   if exists $encoding{$lc};

    my $oc = $class->findAlias($name);
    return $oc if defined $oc;

    $oc = $class->findAlias($lc) if $lc ne $name;
    return $oc if defined $oc;

    if (!$skip_external and exists $external_tables{$lc})
    {
        require $external_tables{$lc};

        # The module was selected by the lower-cased name, so the encoding
        # it registered has to be looked for under that spelling as well as
        # under the spelling the caller used.
        for my $key ($name, $lc)
        {
            return $encoding{$key} if exists $encoding{$key};
        }
    }

    return;
}

# Function-style front end to getEncoding(): takes the encoding name and an
# optional "skip external modules" flag, and returns whatever getEncoding()
# returns for them.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}

# Encode $string into the encoding called $name.
# Croaks when the encoding is unknown.  With a true $check, undef is
# returned if the working copy of the string is non-empty after the
# encoding object's encode() call; otherwise the octets are returned.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    unless (defined $enc)
    {
        croak("Unknown encoding '$name'");
    }

    my $octets   = $enc->encode($string, $check);
    my $leftover = $check && length($string);
    return $leftover ? undef : $octets;
}

# Decode $octets from the encoding called $name and return the string.
# Croaks when the encoding is unknown.  With a true $check, the caller's
# second argument is overwritten with the working copy of the octets as it
# is after the encoding object's decode() call.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc
        or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check)
    {
        # @_ aliases the caller's variables, so this writes through.
        $_[1] = $octets;
    }
    return $string;
}

# Convert the caller's first argument in place from encoding $from to
# encoding $to.  Croaks when either encoding is unknown.  With a true
# $check, returns undef (leaving the caller's data alone) if either step
# leaves its input copy non-empty.  Otherwise stores the result in the
# caller's variable and returns its length.
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source
        or croak("Unknown encoding '$from'");

    my $target = find_encoding($to);
    defined $target
        or croak("Unknown encoding '$to'");

    # Step 1: octets in $from -> characters.
    my $chars = $source->decode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }

    # Step 2: characters -> octets in $to.
    $string = $target->encode($chars, $check);
    if ($check && length($chars))
    {
        return undef;
    }

    # @_ aliases the caller's variable, so this is the in-place update.
    $_[0] = $string;
    return length($_[0]);
}

# Return a copy of the argument after utf8::encode() has been applied to
# it.  The caller's own variable is not modified.
sub encode_utf8
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}

# Return a copy of the argument after utf8::decode() has been applied to
# it, or undef when utf8::decode() reports failure.  The caller's own
# variable is not modified.
sub decode_utf8
{
    my $chars = shift;
    if (utf8::decode($chars))
    {
        return $chars;
    }
    return undef;
}

require Encode::Encoding;
require Encode::XS;
require Encode::Internal;
require Encode::Unicode;
require Encode::utf8;
require Encode::iso10646_1;
require Encode::ucs2_le;

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of B<characters>.

To find more about character encodings, please consult
L<Encode::Description>.  This document focuses on programming references.

=head1 PERL ENCODING API

=head2 Generic Encoding Interface

=over 4

=item *

        $bytes  = encode(ENCODING, $string[, CHECK])

Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  For CHECK see L</"Handling Malformed Data">.

For example to convert (internally UTF-8 encoded) Unicode data
to octets:

	$octets = encode("utf8", $unicode);

=item *

        $string = decode(ENCODING, $bytes[, CHECK])

Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  For CHECK see
L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	$utf8 = decode("latin1", $latin1);

=item *

	from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])

Convert B<in-place> the data between two encodings.  How did the data
in $string originally get to be in FROM_ENCODING?  Either using
encode() or through PerlIO: See L</"Encoding and IO">.  For CHECK
see L</"Handling Malformed Data">.

For example to convert ISO-8859-1 data to UTF-8:

	from_to($data, "iso-8859-1", "utf-8");

and to convert it back:

	from_to($data, "utf-8", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant, it must be a scalar variable.

=back

=head2 Handling Malformed Data

If CHECK is not set, C<undef> is returned.  If the data is supposed to
be UTF-8, an optional lexical warning (category utf8) is given.  If
CHECK is true but not a code reference, dies.

It would be desirable to have a way to indicate that transform should use
the encoding's "replacement character" - no such mechanism is defined yet.

It is also planned to allow I<CHECK> to be a code reference.

This is not yet implemented as there are design issues with what its
arguments should be and how it returns its results.

=over 4

=item Scheme 1

Passed remaining fragment of string being processed.
Modifies it in place to remove bytes/characters it can understand
and returns a string used to represent them.
e.g.

 sub fixup {
   my $ch = substr($_[0],0,1,'');
   return sprintf("\x{%02X}",ord($ch));
 }

This scheme is close to how underlying C code for Encode works, but gives
the fixup routine very little context.

=item Scheme 2

Passed original string, and an index into it of the problem area, and
output string so far.  Appends what it will to output string and
returns new index into original string.  For example:

 sub fixup {
   # my ($s,$i,$d) = @_;
   my $ch = substr($_[0],$_[1],1);
   $_[2] .= sprintf("\x{%02X}",ord($ch));
   return $_[1]+1;
 }

This scheme gives maximal control to the fixup routine but is more
complicated to code, and may need internals of Encode to be tweaked to
keep original string intact.

=item Other Schemes

Hybrids of above.

Multiple return values rather than in-place modifications.

Index into the string could be pos($str) allowing s/\G...//.

=back

=head2 UTF-8 / utf8

The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets.  This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).

=over 4

=item *

        $bytes = encode_utf8($string);

The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.

=item *

        $string = decode_utf8($bytes [,CHECK]);

The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
For CHECK see L</"Handling Malformed Data">.

=back

=head2 Listing available encodings

  use Encode qw(encodings);
  @list = encodings();

Returns a list of the canonical names of the available encodings.

=head2 Defining Aliases

  use Encode qw(define_alias);
  define_alias( newName => ENCODING);

Allows newName to be used as an alias for ENCODING. ENCODING may be
either the name of an encoding or an encoding object (as above).

Currently I<newName> can be specified in the following ways:

=over 4

=item As a simple string.

=item As a qr// compiled regular expression, e.g.:

  define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );

In this case if I<ENCODING> is not a reference it is C<eval>-ed to
allow C<$1> etc. to be substituted.  The example is one way to alias
names as used in X11 fonts to the MIME names for the iso-8859-*
family.  Note the double quotes inside the single quotes: because the
string is C<eval>-ed, a regex alias will not work without them.

=item As a code reference, e.g.:

  define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');

In this case C<$_> will be set to the name that is being looked up and
I<ENCODING> is passed to the sub as its first argument.  The example
is another way to alias names as used in X11 fonts to the MIME
names for the iso-8859-* family.

=back

=head1 Defining Encodings

    use Encode qw(define_encoding);
    define_encoding( $object, 'canonicalName' [,alias...]);

Causes I<canonicalName> to be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>
below.  If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.

=head1 Encoding and IO

It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.

Here is how the blind poet would modernise the encoding:

    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);

In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):

    open(my $fh,'>:utf8','anything');
    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";

Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.

Once a handle is open, its layers can be altered using C<binmode>.

Without any such configuration, or if Perl itself is built using
system's own IO, then write operations assume that file handle accepts
only I<bytes> and will C<die> if a character larger than 255 is
written to the handle. When reading, each octet from the handle
becomes a byte-in-a-character. Note that this default is the same
behaviour as bytes-only languages (including Perl before v5.6) would
have, and is sufficient to handle native 8-bit encodings
e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
other encodings and binary data.

In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).

You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory.  For example to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):

    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
    open(G, ">:utf8",                 "data.utf") or die $!;
    while (<F>) { print G }

    # Could also do "print G <F>" but that would pull
    # the whole file into memory just to write it out again.

More examples:

    open(my $f, "<:encoding(cp1252)")
    open(my $g, ">:encoding(iso-8859-2)")
    open(my $h, ">:encoding(latin9)")       # iso-8859-15

See L<PerlIO> for more information.

See also L<encoding> for how to change the default encoding of the
data in your script.

=head1 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation.  As such they are efficient, but may change.

=over 4

=item * is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

=item *

        _utf8_on(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item *

        _utf8_off(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head1 SEE ALSO

L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
L<utf8>, L<Encode::Description>, L<Encode::Encoding>,
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>


=cut
Commit	Line	Data
2c674647	1	package Encode;
51ef4e11	2	use strict;
1b2c56c8	3	our $VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
2c674647	4
	5	require DynaLoader;
	6	require Exporter;
	7
51ef4e11	8	our @ISA = qw(Exporter DynaLoader);
2c674647	9
4411f3b6	10	# Public, encouraged API is exported by default
51ef4e11	11	our @EXPORT = qw (
4411f3b6	12	encode
	13	decode
	14	encode_utf8
	15	decode_utf8
	16	find_encoding
51ef4e11	17	encodings
4411f3b6	18	);
4411f3b6	19
51ef4e11	20	our @EXPORT_OK =
2c674647	21	qw(
51ef4e11	22	define_encoding
51ef4e11	23	define_alias
2c674647	24	from_to
2c674647	25	is_utf8
4411f3b6	26	is_8bit
4411f3b6	27	is_16bit
a12c0f56	28	utf8_upgrade
a12c0f56	29	utf8_downgrade
4411f3b6	30	_utf8_on
4411f3b6	31	_utf8_off
2c674647	32	);
	33
	34	bootstrap Encode ();
	35
4411f3b6	36	# Documentation moved after __END__ for speed - NI-S
2c674647	37
bf230f3d	38	use Carp;
bf230f3d	39
51ef4e11	40	# Make a %encoding package variable to allow a certain amount of cheating
	41	our %encoding;
	42	my @alias; # ordered matching list
	43	my %alias; # cached known aliases
f7ac3676	44
6d6a7c8d	45	# 0 1 2 3 4 5 6 7 8 9 10
	46	our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
	47
f7ac3676	48	our %winlatin2cp = (
	49	'Latin1' => 1252,
	50	'Latin2' => 1250,
	51	'Cyrillic' => 1251,
f7ac3676	52	'Greek' => 1253,
	53	'Turkish' => 1254,
	54	'Hebrew' => 1255,
	55	'Arabic' => 1256,
	56	'Baltic' => 1257,
	57	'Vietnamese' => 1258,
	58	);
5345d506	59
70122e76	60	our %external_tables =
2b217bf7	61	(
	62	'euc-cn' => 'Encode/CN.pm',
	63	gb2312 => 'Encode/CN.pm',
	64	gb12345 => 'Encode/CN.pm',
	65	gbk => 'Encode/CN.pm',
	66	cp936 => 'Encode/CN.pm',
	67	'iso-ir-165' => 'Encode/CN.pm',
	68	'euc-jp' => 'Encode/JP.pm',
ee981de6	69	'iso-2022-jp' => 'Encode/JP.pm',
ee981de6	70	'7bit-jis' => 'Encode/JP.pm',
2b217bf7	71	shiftjis => 'Encode/JP.pm',
	72	macjapan => 'Encode/JP.pm',
	73	cp932 => 'Encode/JP.pm',
	74	'euc-kr' => 'Encode/KR.pm',
	75	ksc5601 => 'Encode/KR.pm',
	76	cp949 => 'Encode/KR.pm',
	77	big5 => 'Encode/TW.pm',
	78	'big5-hkscs' => 'Encode/TW.pm',
	79	cp950 => 'Encode/TW.pm',
	80	gb18030 => 'Encode/HanExtra.pm',
	81	big5plus => 'Encode/HanExtra.pm',
	82	'euc-tw' => 'Encode/HanExtra.pm',
	83	);
d1ed7747	84
656753f8	85	sub encodings
	86	{
	87	my ($class) = @_;
40a073c6	88	return
	89	map { $_->[0] }
	90	sort { $a->[1] cmp $b->[1] }
	91	map { [$_, lc $_] }
	92	grep { $_ ne 'Internal' }
	93	keys %encoding;
51ef4e11	94	}
	95
	96	sub findAlias
	97	{
18586f54	98	my $class = shift;
	99	local $_ = shift;
	100	# print "# findAlias $_\n";
	101	unless (exists $alias{$_})
656753f8	102	{
18586f54	103	for (my $i=0; $i < @alias; $i += 2)
	104	{
	105	my $alias = $alias[$i];
	106	my $val = $alias[$i+1];
	107	my $new;
	108	if (ref($alias) eq 'Regexp' && $_ =~ $alias)
	109	{
	110	$new = eval $val;
	111	}
	112	elsif (ref($alias) eq 'CODE')
	113	{
	114	$new = &{$alias}($val)
	115	}
	116	elsif (lc($_) eq lc($alias))
	117	{
	118	$new = $val;
	119	}
	120	if (defined($new))
	121	{
	122	next if $new eq $_; # avoid (direct) recursion on bugs
	123	my $enc = (ref($new)) ? $new : find_encoding($new);
	124	if ($enc)
	125	{
	126	$alias{$_} = $enc;
	127	last;
	128	}
	129	}
	130	}
656753f8	131	}
18586f54	132	return $alias{$_};
5345d506	133	}
5345d506	134
51ef4e11	135	sub define_alias
5345d506	136	{
18586f54	137	while (@_)
	138	{
	139	my ($alias,$name) = splice(@_,0,2);
	140	push(@alias, $alias => $name);
	141	}
51ef4e11	142	}
51ef4e11	143
016cb72c	144	# Allow variants of iso-8859-1 etc.
d6089a2a	145	define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
016cb72c	146
7faf300d	147	# At least HP-UX has these.
	148	define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
	149
f7ac3676	150	# More HP stuff.
	151	define_alias( qr/^(?:hp-)?(arabic\|greek\|hebrew\|kana\|roman\|thai\|turkish)8$/i => '"${1}8"' );
	152
0b3236bb	153	# The Official name of ASCII.
8a361256	154	define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
8a361256	155
58d53262	156	# This is a font issue, not an encoding issue.
	157	# (The currency symbol of the Latin 1 upper half
	158	# has been redefined as the euro symbol.)
	159	define_alias( qr/^(.+)\@euro$/i => '"$1"' );
	160
016cb72c	161	# Allow latin-1 style names as well
7faf300d	162	define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
016cb72c	163
f7ac3676	164	# Allow winlatin1 style names as well
cf91068f	165	define_alias( qr/^win(latin[12]\|cyrillic\|baltic\|greek\|turkish\|hebrew\|arabic\|baltic\|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
f7ac3676	166
016cb72c	167	# Common names for non-latin prefered MIME names
	168	define_alias( 'ascii' => 'US-ascii',
	169	'cyrillic' => 'iso-8859-5',
	170	'arabic' => 'iso-8859-6',
	171	'greek' => 'iso-8859-7',
f7ac3676	172	'hebrew' => 'iso-8859-8',
	173	'thai' => 'iso-8859-11',
	174	'tis620' => 'iso-8859-11',
	175	);
016cb72c	176
7faf300d	177	# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
1853dd5f	178	# And Microsoft has their own naming (again, surprisingly).
	179	define_alias( qr/^(?:ibm\|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
	180
	181	# Sometimes seen with a leading zero.
	182	define_alias( qr/^cp037$/i => '"cp37"');
	183
	184	# Ououououou.
	185	define_alias( qr/^macRomanian$/i => '"macRumanian"');
7faf300d	186
58d53262	187	# Standardize on the dashed versions.
58d53262	188	define_alias( qr/^utf8$/i => 'utf-8' );
7faf300d	189	define_alias( qr/^koi8r$/i => 'koi8-r' );
f7ac3676	190	define_alias( qr/^koi8u$/i => 'koi8-u' );
f7ac3676	191
1853dd5f	192	# Seen in some Linuxes.
	193	define_alias( qr/^ujis$/i => 'euc-jp' );
	194
b2729934	195	# CP936 doesn't have vendor-addon for GBK, so they're identical.
	196	define_alias( qr/^gbk$/i => '"cp936"');
	197
f7ac3676	198	# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
	199	# TODO: HP-UX '15' encodings japanese15 korean15 roi15
	200	# TODO: Cyrillic encoding ISO-IR-111 (useful?)
f7ac3676	201	# TODO: Armenian encoding ARMSCII-8
	202	# TODO: Hebrew encoding ISO-8859-8-1
	203	# TODO: Thai encoding TCVN
	204	# TODO: Korean encoding Johab
56a543c5	205	# TODO: Vietnamese encodings VPS
f7ac3676	206	# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
	207	# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
	208	# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
	209	# Kannada Khmer Korean Laotian Malayalam Mongolian
	210	# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
18586f54	211
1853dd5f	212	# Map white space and _ to '-'
016cb72c	213	define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
016cb72c	214
51ef4e11	215	sub define_encoding
51ef4e11	216	{
18586f54	217	my $obj = shift;
	218	my $name = shift;
	219	$encoding{$name} = $obj;
	220	my $lc = lc($name);
	221	define_alias($lc => $obj) unless $lc eq $name;
	222	while (@_)
	223	{
	224	my $alias = shift;
	225	define_alias($alias,$obj);
	226	}
	227	return $obj;
656753f8	228	}
656753f8	229
656753f8	230	sub getEncoding
656753f8	231	{
dd9703c9	232	my ($class,$name,$skip_external) = @_;
18586f54	233	my $enc;
	234	if (ref($name) && $name->can('new_sequence'))
	235	{
	236	return $name;
	237	}
	238	my $lc = lc $name;
	239	if (exists $encoding{$name})
	240	{
	241	return $encoding{$name};
	242	}
	243	if (exists $encoding{$lc})
	244	{
	245	return $encoding{$lc};
	246	}
c50d192e	247
	248	my $oc = $class->findAlias($name);
	249	return $oc if defined $oc;
	250
	251	$oc = $class->findAlias($lc) if $lc ne $name;
	252	return $oc if defined $oc;
	253
dd9703c9	254	if (!$skip_external and exists $external_tables{$lc})
d1ed7747	255	{
	256	require $external_tables{$lc};
	257	return $encoding{$name} if exists $encoding{$name};
	258	}
18586f54	259
18586f54	260	return;
656753f8	261	}
656753f8	262
4411f3b6	263	sub find_encoding
4411f3b6	264	{
dd9703c9	265	my ($name,$skip_external) = @_;
dd9703c9	266	return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6	267	}
	268
	269	sub encode
	270	{
18586f54	271	my ($name,$string,$check) = @_;
	272	my $enc = find_encoding($name);
	273	croak("Unknown encoding '$name'") unless defined $enc;
	274	my $octets = $enc->encode($string,$check);
	275	return undef if ($check && length($string));
	276	return $octets;
4411f3b6	277	}
	278
	279	sub decode
	280	{
18586f54	281	my ($name,$octets,$check) = @_;
	282	my $enc = find_encoding($name);
	283	croak("Unknown encoding '$name'") unless defined $enc;
	284	my $string = $enc->decode($octets,$check);
	285	$_[1] = $octets if $check;
	286	return $string;
4411f3b6	287	}
	288
	289	sub from_to
	290	{
18586f54	291	my ($string,$from,$to,$check) = @_;
	292	my $f = find_encoding($from);
	293	croak("Unknown encoding '$from'") unless defined $f;
	294	my $t = find_encoding($to);
	295	croak("Unknown encoding '$to'") unless defined $t;
	296	my $uni = $f->decode($string,$check);
	297	return undef if ($check && length($string));
	298	$string = $t->encode($uni,$check);
	299	return undef if ($check && length($uni));
	300	return length($_[0] = $string);
4411f3b6	301	}
	302
	303	sub encode_utf8
	304	{
18586f54	305	my ($str) = @_;
	306	utf8::encode($str);
	307	return $str;
4411f3b6	308	}
	309
	310	sub decode_utf8
	311	{
18586f54	312	my ($str) = @_;
	313	return undef unless utf8::decode($str);
	314	return $str;
5ad8ef52	315	}
5ad8ef52	316
18586f54	317	require Encode::Encoding;
	318	require Encode::XS;
	319	require Encode::Internal;
	320	require Encode::Unicode;
	321	require Encode::utf8;
	322	require Encode::iso10646_1;
	323	require Encode::ucs2_le;
4411f3b6	324
656753f8	325	1;
656753f8	326
2a936312	327	__END__
2a936312	328
4411f3b6	329	=head1 NAME
	330
	331	Encode - character encodings
	332
	333	=head1 SYNOPSIS
	334
	335	use Encode;
	336
	337	=head1 DESCRIPTION
	338
47bfe92f	339	The C<Encode> module provides the interfaces between Perl's strings
47bfe92f	340	and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6	341
1b2c56c8	342	To find more about character encodings, please consult
1b2c56c8	343	L<Encode::Description> . This document focuses on programming references.
21938dfa	344
4411f3b6	345	=head1 PERL ENCODING API
	346
	347	=head2 Generic Encoding Interface
	348
	349	=over 4
	350
	351	=item *
	352
	353	$bytes = encode(ENCODING, $string[, CHECK])
	354
47bfe92f	355	Encodes string from Perl's internal form into I<ENCODING> and returns
47bfe92f	356	a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6	357
681a7c68	358	For example to convert (internally UTF-8 encoded) Unicode data
	359	to octets:
	360
	361	$octets = encode("utf8", $unicode);
	362
4411f3b6	363	=item *
	364
	365	$string = decode(ENCODING, $bytes[, CHECK])
	366
47bfe92f	367	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
	368	internal form and returns the resulting string. For CHECK see
	369	L</"Handling Malformed Data">.
	370
1b2c56c8	371	For example to convert ISO-8859-1 data to UTF-8:
681a7c68	372
	373	$utf8 = decode("latin1", $latin1);
	374
47bfe92f	375	=item *
	376
	377	from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
	378
2b106fbe	379	Convert B<in-place> the data between two encodings. How did the data
2b106fbe	380	in $string originally get to be in FROM_ENCODING? Either using
e9692b5b	381	encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe	382	see L</"Handling Malformed Data">.
2b106fbe	383
1b2c56c8	384	For example to convert ISO-8859-1 data to UTF-8:
2b106fbe	385
	386	from_to($data, "iso-8859-1", "utf-8");
	387
	388	and to convert it back:
	389
	390	from_to($data, "utf-8", "iso-8859-1");
4411f3b6	391
ab97ca19	392	Note that because the conversion happens in place, the data to be
	393	converted cannot be a string constant, it must be a scalar variable.
	394
4411f3b6	395	=back
	396
	397	=head2 Handling Malformed Data
	398
	399	If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f	400	be UTF-8, an optional lexical warning (category utf8) is given. If
47bfe92f	401	CHECK is true but not a code reference, dies.
4411f3b6	402
47bfe92f	403	It would desirable to have a way to indicate that transform should use
47bfe92f	404	the encodings "replacement character" - no such mechanism is defined yet.
4411f3b6	405
	406	It is also planned to allow I<CHECK> to be a code reference.
	407
47bfe92f	408	This is not yet implemented as there are design issues with what its
47bfe92f	409	arguments should be and how it returns its results.
4411f3b6	410
	411	=over 4
	412
	413	=item Scheme 1
	414
	415	Passed remaining fragment of string being processed.
	416	Modifies it in place to remove bytes/characters it can understand
	417	and returns a string used to represent them.
	418	e.g.
	419
	420	sub fixup {
	421	my $ch = substr($_[0],0,1,'');
	422	return sprintf("\x{%02X}",ord($ch);
	423	}
	424
	425	This scheme is close to how underlying C code for Encode works, but gives
	426	the fixup routine very little context.
	427
	428	=item Scheme 2
	429
47bfe92f	430	Passed original string, and an index into it of the problem area, and
	431	output string so far. Appends what it will to output string and
	432	returns new index into original string. For example:
4411f3b6	433
	434	sub fixup {
	435	# my ($s,$i,$d) = @_;
	436	my $ch = substr($_[0],$_[1],1);
	437	$_[2] .= sprintf("\x{%02X}",ord($ch);
	438	return $_[1]+1;
	439	}
	440
47bfe92f	441	This scheme gives maximal control to the fixup routine but is more
	442	complicated to code, and may need internals of Encode to be tweaked to
	443	keep original string intact.
4411f3b6	444
	445	=item Other Schemes
	446
	447	Hybrids of above.
	448
	449	Multiple return values rather than in-place modifications.
	450
	451	Index into the string could be pos($str) allowing s/\G...//.
	452
	453	=back
	454
	455	=head2 UTF-8 / utf8
	456
	457	The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f	458	the entire Unicode repertiore as sequences of octets. This encoding is
	459	expected to become very widespread. Perl can use this form internaly
	460	to represent strings, so conversions to and from this form are
	461	particularly efficient (as octets in memory do not have to change,
	462	just the meta-data that tells Perl how to treat them).
4411f3b6	463
	464	=over 4
	465
	466	=item *
	467
	468	$bytes = encode_utf8($string);
	469
47bfe92f	470	The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6	471	and the resulting octets returned as a sequence of bytes. All possible
	472	characters have a UTF-8 representation so this function cannot fail.
	473
	474	=item *
	475
	476	$string = decode_utf8($bytes [,CHECK]);
	477
47bfe92f	478	The sequence of octets represented by $bytes is decoded from UTF-8
	479	into a sequence of logical characters. Not all sequences of octets
	480	form valid UTF-8 encodings, so it is possible for this call to fail.
	481	For CHECK see L</"Handling Malformed Data">.
4411f3b6	482
	483	=back
	484
51ef4e11	485	=head2 Listing available encodings
	486
	487	use Encode qw(encodings);
	488	@list = encodings();
	489
	490	Returns a list of the canonical names of the available encodings.
	491
	492	=head2 Defining Aliases
	493
	494	use Encode qw(define_alias);
	495	define_alias( newName => ENCODING);
	496
47bfe92f	497	Allows newName to be used as an alias for ENCODING. ENCODING may be
47bfe92f	498	either the name of an encoding or an encoding object (as above).
51ef4e11	499
	500	Currently I<newName> can be specified in the following ways:
	501
	502	=over 4
	503
	504	=item As a simple string.
	505
	506	=item As a qr// compiled regular expression, e.g.:
	507
	508	define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
	509
47bfe92f	510	In this case if I<ENCODING> is not a reference it is C<eval>-ed to
	511	allow C<$1> etc. to be substituted. The example is one way to alias names as
	512	used in X11 fonts to the MIME names for the iso-8859-*
1b2c56c8	513	family. Note the double quote inside the single quote. If you are
1b2c56c8	514	using regex here, you have to do so or it won't work in this case.
51ef4e11	515
	516	=item As a code reference, e.g.:
	517
	518	define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
	519
	520	In this case C<$_> will be set to the name that is being looked up and
47bfe92f	521	I<ENCODING> is passed to the sub as its first argument. The example
	522	is another way to alias names as used in X11 fonts to the MIME
	523	names for the iso-8859-* family.
51ef4e11	524
	525	=back
	526
1b2c56c8	527	=head1 Defining Encodings
51ef4e11	528
e9692b5b	529	use Encode qw(define_encoding);
e9692b5b	530	define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11	531
47bfe92f	532	Causes I<canonicalName> to be associated with I<$object>. The object
1b2c56c8	533	should provide the interface described in L<Encode::Encoding>
47bfe92f	534	below. If more than two arguments are provided then additional
47bfe92f	535	arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11	536
4411f3b6	537	=head1 Encoding and IO
	538
	539	It is very common to want to do encoding transformations when
	540	reading or writing files, network connections, pipes etc.
47bfe92f	541	If Perl is configured to use the new 'perlio' IO system then
4411f3b6	542	C<Encode> provides a "layer" (See L<perliol>) which can transform
	543	data as it is read or written.
	544
8e86646e	545	Here is how the blind poet would modernise the encoding:
8e86646e	546
42234700	547	use Encode;
8e86646e	548	open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
	549	open(my $utf8,'>:utf8','iliad.utf8');
	550	my @epic = <$iliad>;
	551	print $utf8 @epic;
	552	close($utf8);
	553	close($iliad);
4411f3b6	554
	555	In addition the new IO system can also be configured to read/write
	556	UTF-8 encoded characters (as noted above this is efficient):
	557
e9692b5b	558	open(my $fh,'>:utf8','anything');
e9692b5b	559	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6	560
	561	Either of the above forms of "layer" specifications can be made the default
	562	for a lexical scope with the C<use open ...> pragma. See L<open>.
	563
	564	Once a handle is open, its layers can be altered using C<binmode>.
	565
47bfe92f	566	Without any such configuration, or if Perl itself is built using
4411f3b6	567	system's own IO, then write operations assume that file handle accepts
	568	only I<bytes> and will C<die> if a character larger than 255 is
	569	written to the handle. When reading, each octet from the handle
	570	becomes a byte-in-a-character. Note that this default is the same
47bfe92f	571	behaviour as bytes-only languages (including Perl before v5.6) would
	572	have, and is sufficient to handle native 8-bit encodings
	573	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	574	other encodings and binary data.
	575
	576	In other cases it is the program's responsibility to transform
	577	characters into bytes using the API above before doing writes, and to
	578	transform the bytes read from a handle into characters before doing
	579	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	580
47bfe92f	581	You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8	582	want to bring into memory. For example to convert between ISO-8859-1
47bfe92f	583	(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
47bfe92f	584
e9692b5b	585	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	586	open(G, ">:utf8", "data.utf") or die $!;
	587	while (<F>) { print G }
	588
	589	# Could also do "print G <F>" but that would pull
	590	# the whole file into memory just to write it out again.
	591
	592	More examples:
47bfe92f	593
e9692b5b	594	open(my $f, "<:encoding(cp1252)")
	595	open(my $g, ">:encoding(iso-8859-2)")
	596	open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f	597
47bfe92f	598	See L<PerlIO> for more information.
4411f3b6	599
1768d7eb	600	See also L<encoding> for how to change the default encoding of the
d521382b	601	data in your script.
1768d7eb	602
4411f3b6	603	=head1 Messing with Perl's Internals
4411f3b6	604
47bfe92f	605	The following API uses parts of Perl's internals in the current
47bfe92f	606	implementation. As such they are efficient, but may change.
4411f3b6	607
	608	=over 4
	609
4411f3b6	610	=item * is_utf8(STRING [, CHECK])
	611
	612	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f	613	If CHECK is true, also checks the data in STRING for being well-formed
47bfe92f	614	UTF-8. Returns true if successful, false otherwise.
4411f3b6	615
4411f3b6	616	=item *
	617
	618	_utf8_on(STRING)
	619
	620	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	621	B<not> checked for being well-formed UTF-8. Do not use unless you
	622	B<know> that the STRING is well-formed UTF-8. Returns the previous
	623	state of the UTF-8 flag (so please do I<not> treat the return value as
	624	indicating success or failure), or C<undef> if STRING is not a string.
	625
	626	=item *
	627
	628	_utf8_off(STRING)
	629
	630	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	631	Returns the previous state of the UTF-8 flag (so please do I<not> treat the
	632	return value as indicating success or failure), or C<undef> if STRING is
	633	not a string.
	634
	635	=back
	636
	637	=head1 SEE ALSO
	638
70122e76	639	L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
1b2c56c8	640	L<utf8>, L<Encode::Description>, L<Encode::Encoding>, the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
70122e76	641
4411f3b6	642
	643	=cut
	644