From: Gurusamy Sarathy Date: Mon, 7 Feb 2000 07:08:15 +0000 (+0000) Subject: remove $^U dependent behaviors in runtime; chr() and sprintf('%c',...) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3969a89698ec7136fcf0eb1062fdf63f0e7726f4;p=p5sagit%2Fp5-mst-13.2.git remove $^U dependent behaviors in runtime; chr() and sprintf('%c',...) now return bytes all the way to 255, they will be transparently coerced (in future) to UTF-8 when they are used in operations involving other UTF-8 strings; C<use utf8> doesn't set $^U anymore p4raw-id: //depot/perl@5013 --- diff --git a/lib/byte.pm b/lib/byte.pm index 569fa66..0424e17 100644 --- a/lib/byte.pm +++ b/lib/byte.pm @@ -38,9 +38,7 @@ the effect of C<use byte> within the current lexical scope. Perl normally assumes character semantics in the presence of character data (i.e. data that has come from a source that has -been marked as being of a particular character encoding) or when -the global $^U flag is enabled. [XXX: implement -C command line -switch and mention that instead of $^U?] +been marked as being of a particular character encoding). To understand the implications and differences between character semantics and byte semantics, see L<perlunicode>. diff --git a/lib/utf8.pm b/lib/utf8.pm index be7cc0b..d9e9bec 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -1,8 +1,5 @@ package utf8; -$^U = 1 if caller and caller eq 'main'; # they are unicode aware - # XXX split this out? - sub import { $^H |= 0x00800000; $enc{caller()} = $_[1] if $_[1]; @@ -60,15 +57,6 @@ and package names. =item * -As a side effect, when this pragma is used within the main package, -it also enables Unicode character semantics for the entire program. -See L<perlunicode> for more on that. - -[XXX: split this out into separate "pragma" and/or -C command-line -switch?] - -=item * - In the absence of inputs marked as UTF-8, regular expressions within the scope of this pragma will default to using character semantics instead of byte semantics. 
@@ -80,9 +68,6 @@ of byte semantics. @chars = split //, $data; # splits characters } -[XXX: Should this should be enabled like chr()/sprintf("%c") by looking -at $^U instead?] - =head1 SEE ALSO L<perlunicode>, L<byte> diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 5a73d4e..bebf7aa 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -31,12 +31,9 @@ or from literals and constants in the source text. Later, in L</"Character encodings for input and output">, we'll see how such inputs may be marked as being Unicode character data sources. -One particular condition will enable character semantics on the entire -program, bypassing the compatibility mode: if the C<$^U> global flag is -set to C<1>, nearly all operations will use character semantics by -default. As an added convenience, if the C<utf8> pragma is used in the -C<main>
package, C<$^U> is enabled automatically. [XXX: Should there -be a -C switch to enable $^U?] +If the C<$^U> global flag is set to C<1>, all system calls will use the +corresponding wide character APIs. This is currently only implemented +on Windows. [XXX: Should there be a -C switch to enable $^U?] Regardless of the above, the C<byte> pragma can always be used to force byte semantics in a particular lexical scope. See L<byte>. diff --git a/pod/perlvar.pod b/pod/perlvar.pod index dca9cc0..79ec7f9 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -875,37 +875,15 @@ and B<-C> filetests are based on this value. =item $^U -Global flag that switches on Unicode character support in the Perl -interpreter. The initial value is usually C<0> for compatibility -with Perl versions earlier than 5.6, but may be automatically set -to C<1> by Perl if the system provides a user-settable default -(e.g., C<$ENV{LC_CTYPE}>). It is also implicitly set to C<1> -whenever the utf8 pragma is loaded. +Global flag that enables system calls made by Perl to use wide character +APIs native to the system, if available. This is currently only implemented +on the Windows platform. -Setting it to C<1> has the following effects: +The initial value is typically C<0> for compatibility with Perl versions +earlier than 5.6, but may be automatically set to C<1> by Perl if the system +provides a user-settable default (e.g., C<$ENV{LC_CTYPE}>). -=over - -=item * - -C<chr()> produces UTF-8 encoded Unicode characters. These are the same -as the corresponding ASCII characters if the argument is less than 128. - -=item * - -The C<%c> format in C<sprintf()> generates a UTF-8 encoded Unicode -character. This is the same as the corresponding ASCII character -if the argument is less than 128. - -=item * - -Any system calls made by Perl will use wide character APIs native to -the system, if available. This is currently only implemented on the -Windows platform. 
- -=back - -The C<byte> pragma overrides the value of this flag in the current +The C<byte> pragma always overrides the effect of this flag in the current lexical scope. See L<byte>. =item $^V @@ -914,7 +892,7 @@ The revision, version, and subversion of the Perl interpreter, represented as a "version tuple". Version tuples have both a numeric value and a string value. The numeric value is a floating point number that amounts to revision + version/1000 + subversion/1000000, and the string value -is made of utf8 characters: +is made of characters possibly in the UTF-8 range: C<chr($revision) . chr($version) . chr($subversion)>. This can be used to determine whether the Perl interpreter executing a diff --git a/pp.c b/pp.c index 9406a0b..eb05228 100644 --- a/pp.c +++ b/pp.c @@ -2199,10 +2199,9 @@ PP(pp_chr) char *tmps; U32 value = POPu; - SvUTF8_off(TARG); /* decontaminate */ (void)SvUPGRADE(TARG,SVt_PV); - if (value >= 128 && PL_bigchar && !IN_BYTE) { + if (value > 255 && !IN_BYTE) { SvGROW(TARG,8); tmps = SvPVX(TARG); tmps = (char*)uv_to_utf8((U8*)tmps, (UV)value); @@ -2219,6 +2218,7 @@ PP(pp_chr) tmps = SvPVX(TARG); *tmps++ = value; *tmps = '\0'; + SvUTF8_off(TARG); /* decontaminate */ (void)SvPOK_only(TARG); XPUSHs(TARG); RETURN; diff --git a/sv.c b/sv.c index 0cf3b4c..29bf2fb 100644 --- a/sv.c +++ b/sv.c @@ -5830,7 +5830,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV uv = va_arg(*args, int); else uv = (svix < svmax) ? SvIVx(svargs[svix++]) : 0; - if (uv >= 128 && PL_bigchar && !IN_BYTE) { + if ((uv > 255 || (uv > 127 && SvUTF8(sv))) && !IN_BYTE) { eptr = (char*)utf8buf; elen = uv_to_utf8((U8*)eptr, uv) - utf8buf; is_utf = TRUE;