From: Jarkko Hietaniemi Date: Thu, 16 Jan 2003 01:58:39 +0000 (+0000) Subject: Make the locale-induced UTF-8-ification of STD fhs X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=fde18df140d5f64815bdd632a127ecd5ce3d97fa;p=p5sagit%2Fp5-mst-13.2.git Make the locale-induced UTF-8-ification of STD fhs and the default file open layer explicit (either -C or PERL_UTF8_LOCALE), instead of implicit (and unasked-for). p4raw-id: //depot/perl@18490 --- diff --git a/embedvar.h b/embedvar.h index cc70926..202cea0 100644 --- a/embedvar.h +++ b/embedvar.h @@ -413,10 +413,10 @@ #define PL_utf8_toupper (vTHX->Iutf8_toupper) #define PL_utf8_upper (vTHX->Iutf8_upper) #define PL_utf8_xdigit (vTHX->Iutf8_xdigit) +#define PL_utf8locale (vTHX->Iutf8locale) #define PL_uudmap (vTHX->Iuudmap) #define PL_wantutf8 (vTHX->Iwantutf8) #define PL_warnhook (vTHX->Iwarnhook) -#define PL_widesyscalls (vTHX->Iwidesyscalls) #define PL_xiv_arenaroot (vTHX->Ixiv_arenaroot) #define PL_xiv_root (vTHX->Ixiv_root) #define PL_xnv_arenaroot (vTHX->Ixnv_arenaroot) @@ -702,10 +702,10 @@ #define PL_Iutf8_toupper PL_utf8_toupper #define PL_Iutf8_upper PL_utf8_upper #define PL_Iutf8_xdigit PL_utf8_xdigit +#define PL_Iutf8locale PL_utf8locale #define PL_Iuudmap PL_uudmap #define PL_Iwantutf8 PL_wantutf8 #define PL_Iwarnhook PL_warnhook -#define PL_Iwidesyscalls PL_widesyscalls #define PL_Ixiv_arenaroot PL_xiv_arenaroot #define PL_Ixiv_root PL_xiv_root #define PL_Ixnv_arenaroot PL_xnv_arenaroot diff --git a/gv.c b/gv.c index 08dd7c3..8dfa932 100644 --- a/gv.c +++ b/gv.c @@ -974,9 +974,15 @@ Perl_gv_fetchpv(pTHX_ const char *nambeg, I32 add, I32 sv_type) goto ro_magicalize; else break; + case '\025': + if (len > 1 && strNE(name, "\025TF8_LOCALE")) + break; + goto ro_magicalize; + case '\027': /* $^W & $^WARNING_BITS */ - if (len > 1 && strNE(name, "\027ARNING_BITS") - && strNE(name, "\027IDE_SYSTEM_CALLS")) + if (len > 1 + && strNE(name, "\027ARNING_BITS") + ) break; goto magicalize; @@ 
-1793,10 +1799,13 @@ Perl_is_gv_magical(pTHX_ char *name, STRLEN len, U32 flags) goto yes; } break; + case '\025': + if (len > 1 && strEQ(name, "\025TF8_LOCALE")) + goto yes; case '\027': /* $^W & $^WARNING_BITS */ if (len == 1 || (len == 12 && strEQ(name, "\027ARNING_BITS")) - || (len == 17 && strEQ(name, "\027IDE_SYSTEM_CALLS"))) + ) { goto yes; } diff --git a/intrpvar.h b/intrpvar.h index f44ccce..7320725 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -48,7 +48,7 @@ The C variable which corresponds to Perl's $^W warning variable. */ PERLVAR(Idowarn, U8) -PERLVAR(Iwidesyscalls, bool) /* wide system calls */ +PERLVAR(Iutf8locale, bool) /* utf8 locale detected */ PERLVAR(Idoextract, bool) PERLVAR(Isawampersand, bool) /* must save all match strings */ PERLVAR(Iunsafe, bool) diff --git a/locale.c b/locale.c index c03451b..9d52244 100644 --- a/locale.c +++ b/locale.c @@ -475,7 +475,7 @@ Perl_init_i18nl10n(pTHX_ int printwarn) #ifdef USE_PERLIO { - /* Set PL_wantutf8 to TRUE if using PerlIO _and_ + /* Set PL_utf8locale to TRUE if using PerlIO _and_ any of the following are true: - nl_langinfo(CODESET) contains /^utf-?8/i - $ENV{LC_ALL} contains /^utf-?8/i @@ -487,37 +487,44 @@ Perl_init_i18nl10n(pTHX_ int printwarn) it overrides LC_MESSAGES for GNU gettext, and it also can have more than one locale, separated by spaces, in case you need to know.) - If PL_wantutf8 is true, perl.c:S_parse_body() - will turn on the PerlIO :utf8 discipline on STDIN, STDOUT, - STDERR, _and_ the default open discipline. + If PL_utf8locale and PL_wantutf8 (set by -C) are true, + perl.c:S_parse_body() will turn on the PerlIO :utf8 layer + on STDIN, STDOUT, STDERR, _and_ the default open discipline. 
*/ - bool wantutf8 = FALSE; + bool utf8locale = FALSE; char *codeset = NULL; #if defined(HAS_NL_LANGINFO) && defined(CODESET) codeset = nl_langinfo(CODESET); #endif if (codeset) - wantutf8 = (ibcmp(codeset, "UTF-8", 5) == 0 || - ibcmp(codeset, "UTF8", 4) == 0); + utf8locale = (ibcmp(codeset, "UTF-8", 5) == 0 || + ibcmp(codeset, "UTF8", 4) == 0); #if defined(USE_LOCALE) else { /* nl_langinfo(CODESET) is supposed to correctly * interpret the locale environment variables, * but just in case it fails, let's do this manually. */ if (lang) - wantutf8 = (ibcmp(lang, "UTF-8", 5) == 0 || - ibcmp(lang, "UTF8", 4) == 0); + utf8locale = (ibcmp(lang, "UTF-8", 5) == 0 || + ibcmp(lang, "UTF8", 4) == 0); #ifdef USE_LOCALE_CTYPE if (curctype) - wantutf8 = (ibcmp(curctype, "UTF-8", 5) == 0 || - ibcmp(curctype, "UTF8", 4) == 0); + utf8locale = (ibcmp(curctype, "UTF-8", 5) == 0 || + ibcmp(curctype, "UTF8", 4) == 0); #endif if (lc_all) - wantutf8 = (ibcmp(lc_all, "UTF-8", 5) == 0 || - ibcmp(lc_all, "UTF8", 4) == 0); -#endif /* USE_LOCALE */ + utf8locale = (ibcmp(lc_all, "UTF-8", 5) == 0 || + ibcmp(lc_all, "UTF8", 4) == 0); } - if (wantutf8) - PL_wantutf8 = TRUE; +#endif /* USE_LOCALE */ + if (utf8locale) + PL_utf8locale = TRUE; + } + /* Set PL_wantutf8 to $ENV{PERL_UTF8_LOCALE} if using PerlIO. + This is an alternative to using the -C command line switch + (the -C if present will override this). */ + { + char *p = PerlEnv_getenv("PERL_UTF8_LOCALE"); + PL_wantutf8 = p ? (bool) atoi(p) : FALSE; } #endif diff --git a/mg.c b/mg.c index bdf204b..72c8fdf 100644 --- a/mg.c +++ b/mg.c @@ -662,7 +662,11 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) ? (PL_taint_warn || PL_unsafe ? 
-1 : 1) : 0); break; - case '\027': /* ^W & $^WARNING_BITS & ^WIDE_SYSTEM_CALLS */ + case '\025': /* $^UTF8_LOCALE */ + if (strEQ(mg->mg_ptr, "\025TF8_LOCALE")) + sv_setiv(sv, (IV) (PL_wantutf8 && PL_utf8locale)); + break; + case '\027': /* ^W & $^WARNING_BITS */ if (*(mg->mg_ptr+1) == '\0') sv_setiv(sv, (IV)((PL_dowarn & G_WARN_ON) ? TRUE : FALSE)); else if (strEQ(mg->mg_ptr+1, "ARNING_BITS")) { @@ -679,8 +683,6 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) } SvPOK_only(sv); } - else if (strEQ(mg->mg_ptr+1, "IDE_SYSTEM_CALLS")) - sv_setiv(sv, (IV)PL_widesyscalls); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '&': @@ -1925,7 +1927,13 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg) PL_basetime = (Time_t)(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv)); #endif break; - case '\027': /* ^W & $^WARNING_BITS & ^WIDE_SYSTEM_CALLS */ + case '\025': /* $^UTF8_LOCALE */ + if (SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv)) + PL_wantutf8 = PL_utf8locale; + else + PL_wantutf8 = FALSE; + break; + case '\027': /* ^W & $^WARNING_BITS */ if (*(mg->mg_ptr+1) == '\0') { if ( ! (PL_dowarn & G_WARN_ALL_MASK)) { i = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv); @@ -1967,8 +1975,6 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg) } } } - else if (strEQ(mg->mg_ptr+1, "IDE_SYSTEM_CALLS")) - PL_widesyscalls = (bool)SvTRUE(sv); break; case '.': if (PL_localizing) { diff --git a/perl.c b/perl.c index 8b73d25..3493cd8 100644 --- a/perl.c +++ b/perl.c @@ -1355,10 +1355,11 @@ print \" \\@INC:\\n @INC\\n\";"); if (!PL_do_undump) init_postdump_symbols(argc,argv,env); - /* PL_wantutf8 is conditionally turned on by + /* PL_utf8locale is conditionally turned on by * locale.c:Perl_init_i18nl10n() if the environment - * look like the user wants to use UTF-8. */ - if (PL_wantutf8) { /* Requires init_predump_symbols(). */ + * look like the user wants to use UTF-8. + * PL_wantutf8 is turned on by -C or by $ENV{PERL_UTF8_LOCALE}. 
*/ + if (PL_utf8locale && PL_wantutf8) { /* Requires init_predump_symbols(). */ IO* io; PerlIO* fp; SV* sv; @@ -2156,7 +2157,7 @@ Perl_moreswitches(pTHX_ char *s) return s + numlen; } case 'C': - PL_widesyscalls = TRUE; + PL_wantutf8 = TRUE; /* Can be set earlier by $ENV{PERL_UTF8_LOCALE}. */ s++; return s; case 'F': @@ -3397,7 +3398,7 @@ Perl_init_argv_symbols(pTHX_ register int argc, register char **argv) for (; argc > 0; argc--,argv++) { SV *sv = newSVpv(argv[0],0); av_push(GvAVn(PL_argvgv),sv); - if (PL_widesyscalls) + if (PL_wantutf8) (void)sv_utf8_decode(sv); } } diff --git a/perlapi.h b/perlapi.h index 451a4d9..ff344ab 100644 --- a/perlapi.h +++ b/perlapi.h @@ -584,14 +584,14 @@ END_EXTERN_C #define PL_utf8_upper (*Perl_Iutf8_upper_ptr(aTHX)) #undef PL_utf8_xdigit #define PL_utf8_xdigit (*Perl_Iutf8_xdigit_ptr(aTHX)) +#undef PL_utf8locale +#define PL_utf8locale (*Perl_Iutf8locale_ptr(aTHX)) #undef PL_uudmap #define PL_uudmap (*Perl_Iuudmap_ptr(aTHX)) #undef PL_wantutf8 #define PL_wantutf8 (*Perl_Iwantutf8_ptr(aTHX)) #undef PL_warnhook #define PL_warnhook (*Perl_Iwarnhook_ptr(aTHX)) -#undef PL_widesyscalls -#define PL_widesyscalls (*Perl_Iwidesyscalls_ptr(aTHX)) #undef PL_xiv_arenaroot #define PL_xiv_arenaroot (*Perl_Ixiv_arenaroot_ptr(aTHX)) #undef PL_xiv_root diff --git a/pod/perlrun.pod b/pod/perlrun.pod index 7251712..46e1849 100644 --- a/pod/perlrun.pod +++ b/pod/perlrun.pod @@ -266,11 +266,21 @@ An alternate delimiter may be specified using B<-F>. =item B<-C> -enables Perl to use the native wide character APIs on the target system. -The magic variable C<${^WIDE_SYSTEM_CALLS}> reflects the state of -this switch. See L. - -This feature is currently only implemented on the Win32 platform. +enables Perl to use the Unicode APIs on the target system. 
+ +As of Perl 5.8.1, if C<-C> is used and the locale settings (the LC_ALL, +LC_CTYPE, and LANG environment variables) indicate a UTF-8 locale, +the STDIN is expected to be in UTF-8, the STDOUT and STDERR are +expected to be in UTF-8, and C<:utf8> is the default file open layer. +See L<perluniintro>, L<perlfunc/open>, and L<open> for more information. +The magic variable C<${^UTF8_LOCALE}> reflects this state, +see L<perlvar/"${^UTF8_LOCALE}">. (Another way of setting this +variable is to set the environment variable PERL_UTF8_LOCALE.) + +(In Perls earlier than 5.8.1 the C<-C> switch was a Win32-only switch +that enabled the use of Unicode-aware "wide system call" Win32 APIs. +This feature was practically unused, however, and the command line +switch was therefore "recycled".) =item B<-c> diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index ee8b6ef..1d3f846 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -67,13 +67,6 @@ character data. Such data may come from filehandles, from calls to external programs, from information provided by the system (such as %ENV), or from literals and constants in the source text. -On Windows platforms, if the C<-C> command line switch is used or the -${^WIDE_SYSTEM_CALLS} global flag is set to C<1>, all system calls -will use the corresponding wide-character APIs. This feature is -available only on Windows to conform to the API standard already -established for that platform--and there are very few non-Windows -platforms that have Unicode-aware APIs. - The C<bytes> pragma will always, regardless of platform, force byte semantics in a particular lexical scope. See L<bytes>. @@ -1050,10 +1043,14 @@ there are a couple of exceptions: =item * -If your locale environment variables (LANGUAGE, LC_ALL, LC_CTYPE, LANG) -contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), -the default encodings of your STDIN, STDOUT, and STDERR, and of -B<any subsequent file open>, are considered to be UTF-8.
+If your locale environment variables (LC_ALL, LC_CTYPE, LANG) +contain the strings 'UTF-8' or 'UTF8' (matched case-insensitively) +B<and> you enable using UTF-8 either by using the C<-C> command line +switch or setting the PERL_UTF8_LOCALE environment variable to a true +value, then the default encodings of your STDIN, STDOUT, and STDERR, +and of B<any subsequent file open>, are considered to be UTF-8. +See L<perluniintro>, L<perlfunc/open>, and L<open> for more +information. The magic variable C<${^UTF8_LOCALE}> will also be set. =item * @@ -1410,6 +1407,6 @@ the UTF-8 flag: =head1 SEE ALSO L<perluniintro>, L<encoding>, L<Encode>, L<open>, L<utf8>, L<bytes>, -L<perlretut>, L<perlvar/"${^WIDE_SYSTEM_CALLS}"> +L<perlretut>, L<perlvar/"${^UTF8_LOCALE}"> =cut diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index 21f0fa7..3a23460 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -172,13 +172,15 @@ To output UTF-8, use the C<:utf8> output layer. Prepending to this sample program ensures that the output is completely UTF-8, and removes the program's warning. -If your locale environment variables (C<LANGUAGE>, C<LC_ALL>, -C<LC_CTYPE>, C<LANG>) contain the strings 'UTF-8' or 'UTF8', -regardless of case, then the default encoding of your STDIN, STDOUT, -and STDERR and of B<any subsequent file open>, is UTF-8. Note that -this means that Perl expects other software to work, too: if Perl has -been led to believe that STDIN should be UTF-8, but then STDIN coming -in from another command is not UTF-8, Perl will complain about the +If your locale environment variables (C<LC_ALL>, C<LC_CTYPE>, C<LANG>) +contain the strings 'UTF-8' or 'UTF8' (matched case-insensitively) +B<and> you enable using UTF-8 either by using the C<-C> command line +switch or by setting the PERL_UTF8_LOCALE environment variable to +a true value, then the default encoding of your STDIN, STDOUT, and +STDERR, and of B<any subsequent file open>, is UTF-8. Note that this +means that Perl expects other software to work, too: if Perl has been +led to believe that STDIN should be UTF-8, but then STDIN coming in +from another command is not UTF-8, Perl will complain about the malformed UTF-8.
All features that combine Unicode and I/O also require using the new diff --git a/pod/perlvar.pod b/pod/perlvar.pod index 08235c2..7621be0 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -1109,6 +1109,16 @@ Reflects if taint mode is on or off. 1 for on (the program was run with B<-T>), 0 for off, -1 when only taint warnings are enabled (i.e. with B<-t> or B<-TU>). This variable is read-only. +=item ${^UTF8_LOCALE} + +Reflects whether the locale settings indicated the use of UTF-8 and that +the use of UTF-8 was enabled either by the C<-C> command line switch or +by setting the PERL_UTF8_LOCALE environment variable to a true value. +This variable is read-only. If true, the STDIN is expected to be in +UTF-8, the STDOUT and STDERR are in UTF-8, and C<:utf8> is the default +file open layer. See L<perluniintro>, L<perlfunc/open>, and L<open> +for more information. + =item $PERL_VERSION =item $^V @@ -1148,21 +1158,6 @@ related to the B<-w> switch.) See also L<warnings>. The current set of warning checks enabled by the C<use warnings> pragma. See the documentation of C<warnings> for more details. -=item ${^WIDE_SYSTEM_CALLS} - -Global flag that enables system calls made by Perl to use wide character -APIs native to the system, if available. This is currently only implemented -on the Windows platform. - -This can also be enabled from the command line using the C<-C> switch. - -The initial value is typically C<0> for compatibility with Perl versions -earlier than 5.6, but may be automatically set to C<1> by Perl if the system -provides a user-settable default (e.g., C<$ENV{LC_CTYPE}>). - -The C<bytes> pragma always overrides the effect of this flag in the current -lexical scope. See L<bytes>. - =item $EXECUTABLE_NAME =item $^X