From: Jarkko Hietaniemi Date: Wed, 31 Oct 2001 14:44:33 +0000 (+0000) Subject: Extend the effect of the encoding pragma to chr() and ord(). X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=121910a497e33cc9235ecb1b0488ff5200159bc4;p=p5sagit%2Fp5-mst-13.2.git Extend the effect of the encoding pragma to chr() and ord(). TODO: regex literals, reentrancy problems with the utf8 and charnames pragmas. p4raw-id: //depot/perl@12790 --- diff --git a/lib/encoding.pm b/lib/encoding.pm index be0fd73..33c5113 100644 --- a/lib/encoding.pm +++ b/lib/encoding.pm @@ -5,6 +5,7 @@ use Encode; sub import { my ($class, $name) = @_; $name = $ENV{PERL_ENCODING} if @_ < 2; + $name = "latin1" unless defined $name; my $enc = find_encoding($name); unless (defined $enc) { require Carp; @@ -23,7 +24,7 @@ encoding - pragma to control the conversion of legacy data into Unicode use encoding "iso 8859-7"; - # The \xDF of ISO 8859-7 is \x{3af} in Unicode. + # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. $a = "\xDF"; $b = "\x{100}"; @@ -34,6 +35,19 @@ encoding - pragma to control the conversion of legacy data into Unicode # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". + # chr() is affected, and ... + + print "mega\n" if ord(chr(0xdf)) == 0x3af; + + # ... ord() is affected by the encoding pragma ... + + print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; + + # but pack/unpack C are not, in case you still + # want back to your native encoding + + print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; + =head1 DESCRIPTION Normally when legacy 8-bit data is converted to Unicode the data is @@ -44,26 +58,26 @@ The pragma is a per script, not a per block lexical. Only the last C matters, and it affects B. If no encoding is specified, the environment variable L -is consulted. If no encoding can be found, C -error will be thrown. +is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. +If no encoding can be found, C error will be thrown. =head1 FUTURE POSSIBILITIES -The C<\x..> and C<\0...> in regular expressions are not -affected by this pragma. They probably should. +The C<\x..> and C<\0...> in regular expressions are not affected by +this pragma. They probably should. -Also chr(), ord(), and C<\N{...}> might become affected. +The charnames "\N{...}" does not work with this pragma. =head1 KNOWN PROBLEMS Cannot be combined with C. Note that this is a problem B if you would like to have Unicode identifiers in your scripts. You should not need C for anything else these days -(since Perl 5.8.0) +(since Perl 5.8.0). =head1 SEE ALSO -L, L +L, L =cut diff --git a/lib/encoding.t b/lib/encoding.t index 2be0312..4dee08e6 100644 --- a/lib/encoding.t +++ b/lib/encoding.t @@ -1,4 +1,4 @@ -print "1..5\n"; +print "1..9\n"; use encoding "latin1"; # ignored (overwritten by the next line) use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...) @@ -29,3 +29,17 @@ print "ok 4\n"; print "not " unless ord(substr($c, 1, 1)) == 0x100; print "ok 5\n"; +print "not " unless ord(chr(0xdf)) == 0x3af; # spooky +print "ok 6\n"; + +print "not " unless ord(pack("C", 0xdf)) == 0x3af; +print "ok 7\n"; + +# we didn't break pack/unpack, I hope + +print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf; +print "ok 8\n"; + +# the first octet of UTF-8 encoded 0x3af +print "not " unless unpack("C", chr(0xdf)) == 0xce; +print "ok 9\n"; diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index ca48470..8722105 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -291,7 +291,7 @@ X<-S>X<-b>X<-c>X<-t>X<-u>X<-g>X<-k>X<-T>X<-B>X<-M>X<-A>X<-C> -g File has setgid bit set. -k File has sticky bit set. - -T File is an ASCII text file. + -T File is an ASCII text file (heuristic guess). -B File is a "binary" file (opposite of -T). -M Age of file in days when script started. @@ -682,12 +682,12 @@ On POSIX systems, you can detect this condition this way: Returns the character represented by that NUMBER in the character set. For example, C is C<"A"> in either ASCII or Unicode, and -chr(0x263a) is a Unicode smiley face. Note that characters from -127 to 255 (inclusive) are not encoded in Unicode for backward -compatibility reasons. +chr(0x263a) is a Unicode smiley face. Note that characters from 127 +to 255 (inclusive) are by default not encoded in Unicode for backward +compatibility reasons (but see L). For the reverse, use L. -See L for more about Unicode. +See L and L for more about Unicode. If NUMBER is omitted, uses C<$_>. @@ -2970,9 +2970,12 @@ DIRHANDLEs have their own namespace separate from FILEHANDLEs. =item ord -Returns the numeric (ASCII or Unicode) value of the first character of EXPR. If -EXPR is omitted, uses C<$_>. For the reverse, see L. -See L for more about Unicode. +Returns the numeric (the native 8-bit encoding, like ASCII or EBCDIC, +or Unicode) value of the first character of EXPR. If EXPR is omitted, +uses C<$_>. + +For the reverse, see L. +See L and L for more about Unicode. =item our EXPR @@ -3051,8 +3054,8 @@ sequence of characters that give the order and type of values, as follows: a A string with arbitrary binary data, will be null padded. - A An ASCII string, will be space padded. - Z A null terminated (asciz) string, will be null padded. + A A text (ASCII) string, will be space padded. + Z A null terminated (ASCIZ) string, will be null padded. b A bit string (ascending bit order inside each byte, like vec()). B A bit string (descending bit order inside each byte). diff --git a/pp.c b/pp.c index 1c323f7..d6d0534 100644 --- a/pp.c +++ b/pp.c @@ -3033,8 +3033,16 @@ PP(pp_ord) SV *argsv = POPs; STRLEN len; U8 *s = (U8*)SvPVx(argsv, len); + SV *tmpsv; + + if (PL_encoding && !DO_UTF8(argsv)) { + tmpsv = sv_2mortal(newSVsv(argsv)); + s = (U8*)Perl_sv_recode_to_utf8(aTHX_ tmpsv, PL_encoding); + argsv = tmpsv; + } XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff)); + RETURN; } @@ -3063,6 +3071,8 @@ PP(pp_chr) *tmps++ = value; *tmps = '\0'; (void)SvPOK_only(TARG); + if (PL_encoding) + Perl_sv_recode_to_utf8(aTHX_ TARG, PL_encoding); XPUSHs(TARG); RETURN; } diff --git a/sv.c b/sv.c index 553267a..afd2aad 100644 --- a/sv.c +++ b/sv.c @@ -10378,7 +10378,7 @@ The PV of the sv is returned. char * Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding) { - if (SvPOK(sv) && !SvUTF8(sv) && SvROK(encoding)) { + if (SvPOK(sv) && !DO_UTF8(sv) && SvROK(encoding)) { SV *uni; STRLEN len; char *s;