From: Nick Ing-Simmons Date: Wed, 31 Oct 2001 08:59:56 +0000 (+0000) Subject: Integrate mainline X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=25f59a5e41d8ad4cca035052507fca0ebff1d7e6;p=p5sagit%2Fp5-mst-13.2.git Integrate mainline p4raw-id: //depot/perlio@12788 --- 25f59a5e41d8ad4cca035052507fca0ebff1d7e6 diff --cc lib/encoding.pm index 0000000,e758687..7dacd7a mode 000000,100644..100644 --- a/lib/encoding.pm +++ b/lib/encoding.pm @@@ -1,0 -1,63 +1,54 @@@ + package encoding; + + use Encode; + + sub import { + my ($class, $name) = @_; + $name = $ENV{PERL_ENCODING} if @_ < 2; + my $enc = find_encoding($name); + unless (defined $enc) { + require Carp; + Carp::croak "Unknown encoding '$name'"; + } + ${^ENCODING} = $enc; + } + + =pod + + =head1 NAME + + encoding - pragma to control the conversion of legacy data into Unicode + + =head1 SYNOPSIS + + use encoding "iso 8859-7"; + + $a = "\xDF"; + $b = "\x{100}"; + + $c = $a . $b; + + # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". + # The \xDF of ISO 8859-7 is \x{3af} in Unicode. + + =head1 DESCRIPTION + + Normally when legacy 8-bit data is converted to Unicode the data is + expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the + encoding pragma you can change this default. + + The pragma is a per script, not a per block lexical. Only the last -C<use encoding> matters, and it affects B<the whole script>. ++'use encoding' seen matters. + + =head1 FUTURE POSSIBILITIES + -The C<\x..> and C<\0...> in regular expressions are not -affected by this pragma. They probably should. - -Also chr(), ord(), and C<\N{...}> might become affected. - -=head1 KNOWN PROBLEMS - -Cannot be combined with C<use utf8>. Note that this is a problem -B<only> if you would like to have Unicode identifiers in your scripts. -You should not need C<use utf8> for anything else these days -(since Perl 5.8.0) ++The C<\x..> and C<\0...> in literals and regular expressions are not ++affected by this pragma. They probably should. Ditto C<\N{...}>. 
+ + =head1 SEE ALSO + + L<perlunicode> + + =cut + + 1; diff --cc lib/encoding.t index 0000000,2be0312..40d97a2 mode 000000,100644..100644 --- a/lib/encoding.t +++ b/lib/encoding.t @@@ -1,0 -1,31 +1,24 @@@ -print "1..5\n"; ++print "1..3\n"; + + use encoding "latin1"; # ignored (overwritten by the next line) + use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...) + -# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is -# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS), -# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S) - + $a = "\xDF"; + $b = "\x{100}"; + -print "not " unless ord($a) == 0x3af; -print "ok 1\n"; - -print "not " unless ord($b) == 0x100; -print "ok 2\n"; - -my $c; ++my $c = $a . $b; + -$c = $a . $b; ++# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is ++# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS), ++# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S) + + print "not " unless ord($c) == 0x3af; -print "ok 3\n"; ++print "ok 1\n"; + + print "not " unless length($c) == 2; -print "ok 4\n"; ++print "ok 2\n"; + + print "not " unless ord(substr($c, 1, 1)) == 0x100; -print "ok 5\n"; ++print "ok 3\n"; ++ +