[p5sagit/p5-mst-13.2.git] / lib / encoding.pm

package encoding;

our $VERSION = '1.00';

use Encode;

# import -- invoked by "use encoding ...": selects the script-wide encoding.
#
# Arguments:
#   $class - the invoking package name (unused).
#   $name  - optional encoding name.  When the caller passes no name at
#            all, $ENV{PERL_ENCODING} is used; if the name is still
#            undefined after that, "latin1" is used.
#
# Side effect: stores the object returned by find_encoding() (imported
# from Encode) in the global ${^ENCODING}.
#
# Dies (via Carp::croak) with "Unknown encoding '<name>'" when
# find_encoding() returns undef for the requested name.
sub import {
    my ($class, $name) = @_;

    # The environment is consulted only when no argument was given at all;
    # an explicit undef argument skips it and falls through to "latin1".
    $name = $ENV{PERL_ENCODING} if @_ < 2;
    $name = "latin1" unless defined $name;

    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        # Parenthesised call: Carp is only loaded here at run time, so the
        # parser must not be assumed to already know Carp::croak as a sub
        # when this line is compiled.
        Carp::croak("Unknown encoding '$name'");
    }

    ${^ENCODING} = $enc;
}

=pod

=head1 NAME

encoding - pragma to control the conversion of legacy data into Unicode

=head1 SYNOPSIS

    use encoding "iso 8859-7";

    # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # but pack/unpack are not affected, in case you still
    # want to go back to your native encoding

    print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 DESCRIPTION

Normally when legacy 8-bit data is converted to Unicode the data is
expected to be Latin-1 (or EBCDIC on EBCDIC platforms).  With the
encoding pragma you can change this default.

The pragma is per script, not per block: it is not lexically scoped.
Only the last C<use encoding> matters, and it affects B<the whole script>.

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

	\xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

	"\xDF" =~ /\x{3af}/

but this will not

	"\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
because of the C<\x{100}> on the left.  You should not be mixing your
legacy data and Unicode in the same string.

This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
they are combined with characters with code points 0x100 or larger,
in which case all characters need to become UTF-8 encoded), but if
the C<encoding> pragma is present, even the 0x80..0xFF range always
gets UTF-8 encoded.

If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If that fails, "latin1" (ISO 8859-1) is assumed.  If no
encoding can be found, an C<Unknown encoding '...'> error will be thrown.

=head1 KNOWN PROBLEMS

For native multibyte encodings (either fixed or variable length)
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

=head1 SEE ALSO

L<perlunicode>, L<Encode>

=cut

1;
Commit	Line	Data
0a378802	1	package encoding;
0a378802	2
8de1277c	3	our $VERSION = '1.00';
8de1277c	4
0a378802	5	use Encode;
	6
	7	sub import {
	8	my ($class, $name) = @_;
	9	$name = $ENV{PERL_ENCODING} if @_ < 2;
121910a4	10	$name = "latin1" unless defined $name;
0a378802	11	my $enc = find_encoding($name);
	12	unless (defined $enc) {
	13	require Carp;
	14	Carp::croak "Unknown encoding '$name'";
	15	}
	16	${^ENCODING} = $enc;
	17	}
	18
	19	=pod
	20
	21	=head1 NAME
	22
	23	encoding - pragma to control the conversion of legacy data into Unicode
	24
	25	=head1 SYNOPSIS
	26
	27	use encoding "iso 8859-7";
	28
121910a4	29	# The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
4bdee82d	30
0a378802	31	$a = "\xDF";
	32	$b = "\x{100}";
	33
4bdee82d	34	printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
4bdee82d	35
0a378802	36	$c = $a . $b;
	37
	38	# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
0a378802	39
121910a4	40	# chr() is affected, and ...
	41
	42	print "mega\n" if ord(chr(0xdf)) == 0x3af;
	43
	44	# ... ord() is affected by the encoding pragma ...
	45
	46	print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
	47
3de8ed06	48	# but pack/unpack are not affected, in case you still
121910a4	49	# want back to your native encoding
	50
	51	print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
	52
0a378802	53	=head1 DESCRIPTION
	54
	55	Normally when legacy 8-bit data is converted to Unicode the data is
	56	expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the
	57	encoding pragma you can change this default.
	58
	59	The pragma is a per script, not a per block lexical. Only the last
9f4817db	60	C<use encoding> matters, and it affects B<the whole script>.
0a378802	61
a72c7584	62	Notice that only literals (string or regular expression) having only
	63	legacy code points are affected: if you mix data like this
	64
	65	\xDF\x{100}
	66
	67	the data is assumed to be in (Latin 1 and) Unicode, not in your native
	68	encoding. In other words, this will match in "greek":
	69
	70	"\xDF" =~ /\x{3af}/
	71
	72	but this will not
	73
	74	"\xDF\x{100}" =~ /\x{3af}\x{100}/
	75
	76	since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
	77	because of the C<\x{100}> on the left. You should not be mixing your
	78	legacy data and Unicode in the same string.
	79
4ef28c72	80	This pragma also affects encoding of the 0x80..0xFF code point range:
	81	normally characters in that range are left as eight-bit bytes (unless
	82	they are combined with characters with code points 0x100 or larger,
	83	in which case all characters need to become UTF-8 encoded), but if
	84	the C<encoding> pragma is present, even the 0x80..0xFF range always
	85	gets UTF-8 encoded.
	86
4bdee82d	87	If no encoding is specified, the environment variable L<PERL_ENCODING>
3de8ed06	88	is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no
3de8ed06	89	encoding can be found, C<Unknown encoding '...'> error will be thrown.
4bdee82d	90
6ec9efec	91	=head1 KNOWN PROBLEMS
0a378802	92
a72c7584	93	For native multibyte encodings (either fixed or variable length)
	94	the current implementation of the regular expressions may introduce
	95	recoding errors for longer regular expression literals than 127 bytes.
d521382b	96
0a378802	97	=head1 SEE ALSO
0a378802	98
121910a4	99	L<perlunicode>, L<Encode>
0a378802	100
	101	=cut
	102
	103	1;