[p5sagit/p5-mst-13.2.git] / lib / encoding.pm

package encoding;

our $VERSION = '1.00';

use Encode;

# import - called by "use encoding ...;" to select the encoding object
# that is stored in ${^ENCODING}.
#
# Arguments:
#   $class - the invoking package name (not used).
#   $name  - optional encoding name.  When no name argument is passed,
#            $ENV{PERL_ENCODING} is used; if the name is still undefined,
#            "latin1" is used.
#
# Croaks with "Unknown encoding '<name>'" when find_encoding() returns
# undef for the chosen name.
sub import {
    my $class = $_[0];

    # Only consult the environment when the caller supplied no name at all.
    my $name = @_ >= 2 ? $_[1] : $ENV{PERL_ENCODING};
    defined $name or $name = "latin1";

    my $enc = find_encoding($name);
    if (!defined $enc) {
        # Carp is loaded lazily: it is needed on the failure path only.
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    ${^ENCODING} = $enc;
}

=pod

=head1 NAME

encoding - pragma to control the conversion of legacy data into Unicode

=head1 SYNOPSIS

    use encoding "iso 8859-7";

    # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # but pack/unpack are not affected, in case you still
    # want to get back to your native encoding

    print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 DESCRIPTION

Normally when legacy 8-bit data is converted to Unicode the data is
expected to be Latin-1 (or EBCDIC on EBCDIC platforms).  With the
encoding pragma you can change this default.

The pragma applies per script; it is not lexically scoped to a block.
Only the last C<use encoding> matters, and it affects B<the whole script>.

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

	\xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

	"\xDF" =~ /\x{3af}/

but this will not

	"\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
because of the C<\x{100}> in the same string.  You should not mix
legacy data and Unicode in the same string.

If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If that is not set, "latin1" (ISO 8859-1) is assumed.  If
the encoding cannot be found, an C<Unknown encoding '...'> error will be
thrown.

=head1 KNOWN PROBLEMS

For native multibyte encodings (either fixed or variable length)
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

=head1 SEE ALSO

L<perlunicode>, L<Encode>

=cut

1;
Commit	Line	Data
0a378802	1	package encoding;
0a378802	2
8de1277c	3	our $VERSION = '1.00';
8de1277c	4
0a378802	5	use Encode;
	6
	7	sub import {
	8	my ($class, $name) = @_;
	9	$name = $ENV{PERL_ENCODING} if @_ < 2;
121910a4	10	$name = "latin1" unless defined $name;
0a378802	11	my $enc = find_encoding($name);
	12	unless (defined $enc) {
	13	require Carp;
	14	Carp::croak "Unknown encoding '$name'";
	15	}
	16	${^ENCODING} = $enc;
	17	}
	18
	19	=pod
	20
	21	=head1 NAME
	22
	23	encoding - pragma to control the conversion of legacy data into Unicode
	24
	25	=head1 SYNOPSIS
	26
	27	use encoding "iso 8859-7";
	28
121910a4	29	# The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
4bdee82d	30
0a378802	31	$a = "\xDF";
	32	$b = "\x{100}";
	33
4bdee82d	34	printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
4bdee82d	35
0a378802	36	$c = $a . $b;
	37
	38	# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
0a378802	39
121910a4	40	# chr() is affected, and ...
	41
	42	print "mega\n" if ord(chr(0xdf)) == 0x3af;
	43
	44	# ... ord() is affected by the encoding pragma ...
	45
	46	print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
	47
3de8ed06	48	# but pack/unpack are not affected, in case you still
121910a4	49	# want back to your native encoding
	50
	51	print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
	52
0a378802	53	=head1 DESCRIPTION
	54
	55	Normally when legacy 8-bit data is converted to Unicode the data is
	56	expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the
	57	encoding pragma you can change this default.
	58
	59	The pragma is a per script, not a per block lexical. Only the last
9f4817db	60	C<use encoding> matters, and it affects B<the whole script>.
0a378802	61
a72c7584	62	Notice that only literals (string or regular expression) having only
	63	legacy code points are affected: if you mix data like this
	64
	65	\xDF\x{100}
	66
	67	the data is assumed to be in (Latin 1 and) Unicode, not in your native
	68	encoding. In other words, this will match in "greek":
	69
	70	"\xDF" =~ /\x{3af}/
	71
	72	but this will not
	73
	74	"\xDF\x{100}" =~ /\x{3af}\x{100}/
	75
	76	since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
	77	because of the C<\x{100}> on the left. You should not be mixing your
	78	legacy data and Unicode in the same string.
	79
4bdee82d	80	If no encoding is specified, the environment variable L<PERL_ENCODING>
3de8ed06	81	is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no
3de8ed06	82	encoding can be found, C<Unknown encoding '...'> error will be thrown.
4bdee82d	83
6ec9efec	84	=head1 KNOWN PROBLEMS
0a378802	85
a72c7584	86	For native multibyte encodings (either fixed or variable length)
	87	the current implementation of the regular expressions may introduce
	88	recoding errors for longer regular expression literals than 127 bytes.
d521382b	89
0a378802	90	=head1 SEE ALSO
0a378802	91
121910a4	92	L<perlunicode>, L<Encode>
0a378802	93
	94	=cut
	95
	96	1;