[p5sagit/p5-mst-13.2.git] / lib / encoding.pm

package encoding;

# Version of this pragma module.
our $VERSION = '1.00';

# import() calls find_encoding() to look up the requested encoding by
# name; Encode is the module this file loads to provide it.
use Encode;

# Compile-time platform guard: refuse to load where ord("A") is 193,
# i.e. where the native character set is EBCDIC rather than ASCII.
BEGIN {
    if (ord("A") == 193) {
        require Carp;
        # Parenthesised call: Carp is only loaded at run time by the
        # require above, so the paren-less list-operator form would parse
        # only if some other module had already defined Carp::croak.
        Carp::croak("encoding pragma does not support EBCDIC platforms");
    }
}

# import - called for "use encoding ..." to select the script's encoding.
#
# Arguments:
#   $class - the invoking package name (not used).
#   $name  - optional encoding name, looked up with find_encoding().
#
# When no name argument is passed, $ENV{PERL_ENCODING} is used instead;
# if the name is still undefined, "latin1" is the default.  Croaks with
# "Unknown encoding '...'" when find_encoding() returns an undefined
# value.  On success the encoding object is stored in ${^ENCODING}.
sub import {
    my ($class, $name) = @_;
    $name = $ENV{PERL_ENCODING} if @_ < 2;
    $name = "latin1" unless defined $name;
    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        # Parenthesised call: does not depend on Carp::croak having been
        # defined at compile time as a side effect of loading Encode.
        Carp::croak("Unknown encoding '$name'");
    }
    ${^ENCODING} = $enc;
}

=pod

=head1 NAME

encoding - pragma to control the conversion of legacy data into Unicode

=head1 SYNOPSIS

    use encoding "iso 8859-7";

    # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # but pack/unpack are not affected, in case you still
    # want to get back to your native encoding

    print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 DESCRIPTION

Normally when legacy 8-bit data is converted to Unicode the data is
expected to be Latin-1 (or EBCDIC on EBCDIC platforms).  With the
encoding pragma you can change this default.

The pragma applies per script, not per block: it is not lexically
scoped.  Only the last C<use encoding> matters, and it affects
B<the whole script>.

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

	\xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

	"\xDF" =~ /\x{3af}/

but this will not

	"\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
because of the C<\x{100}> next to it in the same string.  You should
not be mixing your legacy data and Unicode in the same string.

This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
they are combined with characters with code points 0x100 or larger,
in which case all characters need to become UTF-8 encoded), but if
the C<encoding> pragma is present, even the 0x80..0xFF range always
gets UTF-8 encoded.

If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If that is not set, "latin1" (ISO 8859-1) is assumed.  If
the encoding cannot be found, an C<Unknown encoding '...'> error is thrown.

=head1 KNOWN PROBLEMS

For native multibyte encodings (either fixed or variable length)
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

The encoding pragma is not supported on EBCDIC platforms.

=head1 SEE ALSO

L<perlunicode>, L<Encode>

=cut

1;
Commit	Line	Data
0a378802	1	package encoding;
0a378802	2
8de1277c	3	our $VERSION = '1.00';
8de1277c	4
0a378802	5	use Encode;
0a378802	6
0effba8c	7	BEGIN {
	8	if (ord("A") == 193) {
	9	require Carp;
	10	Carp::croak "encoding pragma does not support EBCDIC platforms";
	11	}
	12	}
	13
0a378802	14	sub import {
	15	my ($class, $name) = @_;
	16	$name = $ENV{PERL_ENCODING} if @_ < 2;
121910a4	17	$name = "latin1" unless defined $name;
0a378802	18	my $enc = find_encoding($name);
	19	unless (defined $enc) {
	20	require Carp;
	21	Carp::croak "Unknown encoding '$name'";
	22	}
	23	${^ENCODING} = $enc;
	24	}
	25
	26	=pod
	27
	28	=head1 NAME
	29
	30	encoding - pragma to control the conversion of legacy data into Unicode
	31
	32	=head1 SYNOPSIS
	33
	34	use encoding "iso 8859-7";
	35
121910a4	36	# The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
4bdee82d	37
0a378802	38	$a = "\xDF";
	39	$b = "\x{100}";
	40
4bdee82d	41	printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
4bdee82d	42
0a378802	43	$c = $a . $b;
	44
	45	# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
0a378802	46
121910a4	47	# chr() is affected, and ...
	48
	49	print "mega\n" if ord(chr(0xdf)) == 0x3af;
	50
	51	# ... ord() is affected by the encoding pragma ...
	52
	53	print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
	54
3de8ed06	55	# but pack/unpack are not affected, in case you still
121910a4	56	# want back to your native encoding
	57
	58	print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
	59
0a378802	60	=head1 DESCRIPTION
	61
	62	Normally when legacy 8-bit data is converted to Unicode the data is
	63	expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the
	64	encoding pragma you can change this default.
	65
	66	The pragma is a per script, not a per block lexical. Only the last
9f4817db	67	C<use encoding> matters, and it affects B<the whole script>.
0a378802	68
a72c7584	69	Notice that only literals (string or regular expression) having only
	70	legacy code points are affected: if you mix data like this
	71
	72	\xDF\x{100}
	73
	74	the data is assumed to be in (Latin 1 and) Unicode, not in your native
	75	encoding. In other words, this will match in "greek":
	76
	77	"\xDF" =~ /\x{3af}/
	78
	79	but this will not
	80
	81	"\xDF\x{100}" =~ /\x{3af}\x{100}/
	82
	83	since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
	84	because of the C<\x{100}> on the left. You should not be mixing your
	85	legacy data and Unicode in the same string.
	86
4ef28c72	87	This pragma also affects encoding of the 0x80..0xFF code point range:
	88	normally characters in that range are left as eight-bit bytes (unless
	89	they are combined with characters with code points 0x100 or larger,
	90	in which case all characters need to become UTF-8 encoded), but if
	91	the C<encoding> pragma is present, even the 0x80..0xFF range always
	92	gets UTF-8 encoded.
	93
4bdee82d	94	If no encoding is specified, the environment variable L<PERL_ENCODING>
3de8ed06	95	is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no
3de8ed06	96	encoding can be found, C<Unknown encoding '...'> error will be thrown.
4bdee82d	97
6ec9efec	98	=head1 KNOWN PROBLEMS
0a378802	99
a72c7584	100	For native multibyte encodings (either fixed or variable length)
	101	the current implementation of the regular expressions may introduce
	102	recoding errors for longer regular expression literals than 127 bytes.
d521382b	103
0effba8c	104	The encoding pragma is not supported on EBCDIC platforms.
0effba8c	105
0a378802	106	=head1 SEE ALSO
0a378802	107
121910a4	108	L<perlunicode>, L<Encode>
0a378802	109
	110	=cut
	111
	112	1;