[p5sagit/p5-mst-13.2.git] / lib / encoding.pm

package encoding;

use Encode;

# import - called when this package is loaded with "use encoding ...";
# picks the encoding and installs it in ${^ENCODING}.
#
# Arguments:
#   $class - the invoking package name (not used).
#   $name  - optional encoding name.  When no name argument is passed at
#            all, $ENV{PERL_ENCODING} is used instead; if the name is
#            still undefined after that, "latin1" is used.
#
# Side effect: stores the object returned by find_encoding() in
# ${^ENCODING}.
# NOTE(review): newer perls are reported to have deprecated and then
# removed support for assigning to ${^ENCODING} -- confirm against perlvar
# for the perl versions this file must run on.
#
# Dies (via Carp::croak) with "Unknown encoding '<name>'" when
# find_encoding() returns undef for the chosen name.
sub import {
    my ($class, $name) = @_;

    # Consult the environment only when the caller gave no name argument;
    # an explicitly passed undef goes straight to the "latin1" default.
    $name = $ENV{PERL_ENCODING} if @_ < 2;
    $name = "latin1" unless defined $name;

    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        # Parenthesized on purpose: Carp is only loaded at run time here,
        # so the list-operator form (no parens) would compile only if some
        # other module had already loaded Carp during compilation.
        Carp::croak("Unknown encoding '$name'");
    }

    # Kept as the final statement so the sub's return value is unchanged.
    ${^ENCODING} = $enc;
}

=pod

=head1 NAME

encoding - pragma to control the conversion of legacy data into Unicode

=head1 SYNOPSIS

    use encoding "iso 8859-7";

    # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # but pack/unpack C are not, in case you still
    # want to get back to your native encoding

    print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 DESCRIPTION

Normally when legacy 8-bit data is converted to Unicode the data is
expected to be Latin-1 (or EBCDIC in EBCDIC platforms).  With the
encoding pragma you can change this default.

The pragma is per script, not lexically scoped per block.  Only the last
C<use encoding> matters, and it affects B<the whole script>.

If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If that is not set, "latin1" (ISO 8859-1) is assumed.
If the encoding cannot be found, an C<Unknown encoding '...'> error is thrown.

=head1 KNOWN PROBLEMS

The C<\x..> and C<\0...> in regular expressions are not affected by
this pragma.  They very probably should be.

The charnames pragma ("\N{LATIN SMALL LETTER SHARP S}") does not work
with this pragma.

=head1 SEE ALSO

L<perlunicode>, L<Encode>

=cut

1;
Commit	Line	Data
0a378802	1	package encoding;
	2
	3	use Encode;
	4
	5	sub import {
	6	my ($class, $name) = @_;
	7	$name = $ENV{PERL_ENCODING} if @_ < 2;
121910a4	8	$name = "latin1" unless defined $name;
0a378802	9	my $enc = find_encoding($name);
	10	unless (defined $enc) {
	11	require Carp;
	12	Carp::croak "Unknown encoding '$name'";
	13	}
	14	${^ENCODING} = $enc;
	15	}
	16
	17	=pod
	18
	19	=head1 NAME
	20
	21	encoding - pragma to control the conversion of legacy data into Unicode
	22
	23	=head1 SYNOPSIS
	24
	25	use encoding "iso 8859-7";
	26
121910a4	27	# The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
4bdee82d	28
0a378802	29	$a = "\xDF";
	30	$b = "\x{100}";
	31
4bdee82d	32	printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
4bdee82d	33
0a378802	34	$c = $a . $b;
	35
	36	# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
0a378802	37
121910a4	38	# chr() is affected, and ...
	39
	40	print "mega\n" if ord(chr(0xdf)) == 0x3af;
	41
	42	# ... ord() is affected by the encoding pragma ...
	43
	44	print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
	45
	46	# but pack/unpack C are not, in case you still
	47	# want back to your native encoding
	48
	49	print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
	50
0a378802	51	=head1 DESCRIPTION
	52
	53	Normally when legacy 8-bit data is converted to Unicode the data is
	54	expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the
	55	encoding pragma you can change this default.
	56
	57	The pragma is a per script, not a per block lexical. Only the last
9f4817db	58	C<use encoding> matters, and it affects B<the whole script>.
0a378802	59
4bdee82d	60	If no encoding is specified, the environment variable L<PERL_ENCODING>
121910a4	61	is consulted. If that fails, "latin1" (ISO 8859-1) is assumed.
121910a4	62	If no encoding can be found, C<Unknown encoding '...'> error will be thrown.
4bdee82d	63
6ec9efec	64	=head1 KNOWN PROBLEMS
0a378802	65
121910a4	66	The C<\x..> and C<\0...> in regular expressions are not affected by
6ec9efec	67	this pragma. They very probably should.
d521382b	68
6ec9efec	69	The charnames pragma ("\N{LATIN SMALL SHARP LETTER S}") does not work
6ec9efec	70	with this pragma.
d521382b	71
0a378802	72	=head1 SEE ALSO
0a378802	73
121910a4	74	L<perlunicode>, L<Encode>
0a378802	75
	76	=cut
	77
	78	1;