sub import {
my ($class, $name) = @_;
$name = $ENV{PERL_ENCODING} if @_ < 2;
+ $name = "latin1" unless defined $name;
my $enc = find_encoding($name);
unless (defined $enc) {
require Carp;
use encoding "iso 8859-7";
- # The \xDF of ISO 8859-7 is \x{3af} in Unicode.
+ # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
$a = "\xDF";
$b = "\x{100}";
# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
+ # chr() is affected, and ...
+
+ print "mega\n" if ord(chr(0xdf)) == 0x3af;
+
+ # ... ord() is affected by the encoding pragma ...
+
+ print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
+
+ # but pack/unpack C are not, in case you still
+ # want to get back to your native encoding
+
+ print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
+
=head1 DESCRIPTION
Normally when legacy 8-bit data is converted to Unicode the data is
C<use encoding> matters, and it affects B<the whole script>.
If no encoding is specified, the environment variable L<PERL_ENCODING>
-is consulted. If no encoding can be found, C<Unknown encoding '...'>
-error will be thrown.
+is consulted. If that fails, "latin1" (ISO 8859-1) is assumed.
+If the encoding cannot be found, an C<Unknown encoding '...'> error is thrown.
=head1 FUTURE POSSIBILITIES
-The C<\x..> and C<\0...> in regular expressions are not
-affected by this pragma. They probably should.
+The C<\x..> and C<\0...> in regular expressions are not affected by
+this pragma. They probably should be.
-Also chr(), ord(), and C<\N{...}> might become affected.
+The charnames construct C<\N{...}> does not work with this pragma.
=head1 KNOWN PROBLEMS
Cannot be combined with C<use utf8>. Note that this is a problem
B<only> if you would like to have Unicode identifiers in your scripts.
You should not need C<use utf8> for anything else these days
-(since Perl 5.8.0)
+(since Perl 5.8.0).
=head1 SEE ALSO
-L<perlunicode>, L<encode>
+L<perlunicode>, L<Encode>
=cut
-print "1..5\n";
+print "1..9\n";
use encoding "latin1"; # ignored (overwritten by the next line)
use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...)
print "not " unless ord(substr($c, 1, 1)) == 0x100;
print "ok 5\n";
+print "not " unless ord(chr(0xdf)) == 0x3af; # spooky
+print "ok 6\n";
+
+print "not " unless ord(pack("C", 0xdf)) == 0x3af;
+print "ok 7\n";
+
+# we didn't break pack/unpack, I hope
+
+print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
+print "ok 8\n";
+
+# the first octet of UTF-8 encoded 0x3af
+print "not " unless unpack("C", chr(0xdf)) == 0xce;
+print "ok 9\n";
-g File has setgid bit set.
-k File has sticky bit set.
- -T File is an ASCII text file.
+ -T File is an ASCII text file (heuristic guess).
-B File is a "binary" file (opposite of -T).
-M Age of file in days when script started.
Returns the character represented by that NUMBER in the character set.
For example, C<chr(65)> is C<"A"> in either ASCII or Unicode, and
-chr(0x263a) is a Unicode smiley face. Note that characters from
-127 to 255 (inclusive) are not encoded in Unicode for backward
-compatibility reasons.
+chr(0x263a) is a Unicode smiley face. Note that characters from 128
+to 255 (inclusive) are by default not encoded in Unicode for backward
+compatibility reasons (but see L<encoding>).
For the reverse, use L</ord>.
-See L<utf8> for more about Unicode.
+See L<perlunicode> and L<encoding> for more about Unicode.
If NUMBER is omitted, uses C<$_>.
=item ord
-Returns the numeric (ASCII or Unicode) value of the first character of EXPR. If
-EXPR is omitted, uses C<$_>. For the reverse, see L</chr>.
-See L<utf8> for more about Unicode.
+Returns the numeric value (in the native 8-bit encoding, like ASCII or
+EBCDIC, or in Unicode) of the first character of EXPR. If EXPR is
+omitted, uses C<$_>.
+
+For the reverse, see L</chr>.
+See L<perlunicode> and L<encoding> for more about Unicode.
=item our EXPR
follows:
a A string with arbitrary binary data, will be null padded.
- A An ASCII string, will be space padded.
- Z A null terminated (asciz) string, will be null padded.
+ A A text (ASCII) string, will be space padded.
+ Z A null terminated (ASCIZ) string, will be null padded.
b A bit string (ascending bit order inside each byte, like vec()).
B A bit string (descending bit order inside each byte).
SV *argsv = POPs;
STRLEN len;
U8 *s = (U8*)SvPVx(argsv, len);
+ SV *tmpsv;
+
+ if (PL_encoding && !DO_UTF8(argsv)) {
+ tmpsv = sv_2mortal(newSVsv(argsv));
+ s = (U8*)Perl_sv_recode_to_utf8(aTHX_ tmpsv, PL_encoding);
+ argsv = tmpsv;
+ }
XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff));
+
RETURN;
}
*tmps++ = value;
*tmps = '\0';
(void)SvPOK_only(TARG);
+ if (PL_encoding)
+ Perl_sv_recode_to_utf8(aTHX_ TARG, PL_encoding);
XPUSHs(TARG);
RETURN;
}
char *
Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding)
{
- if (SvPOK(sv) && !SvUTF8(sv) && SvROK(encoding)) {
+ if (SvPOK(sv) && !DO_UTF8(sv) && SvROK(encoding)) {
SV *uni;
STRLEN len;
char *s;