From: Jarkko Hietaniemi Date: Wed, 20 Mar 2002 00:55:54 +0000 (+0000) Subject: If it looks like UTF-8 (either nl_langinfo or locale variables), X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b310b0538cc1a7948587a9e5ff30683fec2a3ece;p=p5sagit%2Fp5-mst-13.2.git If it looks like UTF-8 (either nl_langinfo or locale variables), think UTF-8, embrace your inner UTF-8, as suggested by Larry. (And as suggested by Markus Kuhn.) While we are at it, document also the case of mixed hash keys as a known potential troublemaker. (Since it's locale-related, sometimes.) p4raw-id: //depot/perl@15350 --- diff --git a/embedvar.h b/embedvar.h index 965f265..8eccfa2 100644 --- a/embedvar.h +++ b/embedvar.h @@ -439,6 +439,7 @@ #define PL_utf8_upper (PERL_GET_INTERP->Iutf8_upper) #define PL_utf8_xdigit (PERL_GET_INTERP->Iutf8_xdigit) #define PL_uudmap (PERL_GET_INTERP->Iuudmap) +#define PL_wantutf8 (PERL_GET_INTERP->Iwantutf8) #define PL_warnhook (PERL_GET_INTERP->Iwarnhook) #define PL_widesyscalls (PERL_GET_INTERP->Iwidesyscalls) #define PL_xiv_arenaroot (PERL_GET_INTERP->Ixiv_arenaroot) @@ -737,6 +738,7 @@ #define PL_utf8_upper (vTHX->Iutf8_upper) #define PL_utf8_xdigit (vTHX->Iutf8_xdigit) #define PL_uudmap (vTHX->Iuudmap) +#define PL_wantutf8 (vTHX->Iwantutf8) #define PL_warnhook (vTHX->Iwarnhook) #define PL_widesyscalls (vTHX->Iwidesyscalls) #define PL_xiv_arenaroot (vTHX->Ixiv_arenaroot) @@ -1038,6 +1040,7 @@ #define PL_Iutf8_upper PL_utf8_upper #define PL_Iutf8_xdigit PL_utf8_xdigit #define PL_Iuudmap PL_uudmap +#define PL_Iwantutf8 PL_wantutf8 #define PL_Iwarnhook PL_warnhook #define PL_Iwidesyscalls PL_widesyscalls #define PL_Ixiv_arenaroot PL_xiv_arenaroot diff --git a/intrpvar.h b/intrpvar.h index 31d6449..94125c5 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -515,6 +515,8 @@ PERLVARI(IOpSpace,I32,0) PERLVAR(IOpSlab,I32 *) #endif +PERLVAR(Iwantutf8, bool) /* want utf8 as the default discipline */ + /* New variables must be added to the very end for 
binary compatibility. * XSUB.h provides wrapper functions via perlapi.h that make this * irrelevant, but not all code may be expected to #include XSUB.h. */ diff --git a/lib/open.pm b/lib/open.pm index 7d59d9a..363a005 100644 --- a/lib/open.pm +++ b/lib/open.pm @@ -242,6 +242,11 @@ pragma. =back +If your locale environment variables (LANGUAGE, LC_ALL, LC_CTYPE, LANG) +contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), +the default encoding of your STDIN, STDOUT, and STDERR, and of +B, is UTF-8. + Directory handles may also support disciplines in future. =head1 NONPERLIO FUNCTIONALITY diff --git a/locale.c b/locale.c index 22e9030..2c84ab5 100644 --- a/locale.c +++ b/locale.c @@ -25,6 +25,10 @@ # include #endif +#ifdef I_LANGINFO +# include +#endif + /* * Standardize the locale name from a string returned by 'setlocale'. * @@ -462,10 +466,45 @@ Perl_init_i18nl10n(pTHX_ int printwarn) #ifdef USE_LOCALE_NUMERIC new_numeric(curnum); #endif /* USE_LOCALE_NUMERIC */ + } #endif /* USE_LOCALE */ + { + bool wantutf8 = FALSE; + char *codeset = NULL; +#if defined(HAS_NL_LANGINFO) && defined(CODESET) + codeset = nl_langinfo(CODESET); +#endif + if (codeset && + (ibcmp(codeset, "UTF-8", 5) == 0 || + ibcmp(codeset, "UTF8", 4) == 0)) + wantutf8 = TRUE; +#ifdef __GLIBC__ + if (!wantutf8 && language && + (ibcmp(language, "UTF-8", 5) == 0 || + ibcmp(language, "UTF8", 4) == 0)) + wantutf8 = TRUE; +#endif + if (!wantutf8 && lc_all && + (ibcmp(lc_all, "UTF-8", 5) == 0 || + ibcmp(lc_all, "UTF8", 4) == 0)) + wantutf8 = TRUE; +#ifdef USE_LOCALE_CTYPE + if (!wantutf8 && curctype && + (ibcmp(curctype, "UTF-8", 5) == 0 || + ibcmp(curctype, "UTF8", 4) == 0)) + wantutf8 = TRUE; +#endif + if (!wantutf8 && lang && + (ibcmp(lang, "UTF-8", 5) == 0 || + ibcmp(lang, "UTF8", 4) == 0)) + wantutf8 = TRUE; + if (wantutf8) + PL_wantutf8 = TRUE; + } + #ifdef USE_LOCALE_CTYPE if (curctype != NULL) Safefree(curctype); diff --git a/perl.c b/perl.c index ebf5ca1..c8a6370 100644 --- a/perl.c +++ 
b/perl.c @@ -1428,6 +1428,22 @@ print \" \\@INC:\\n @INC\\n\";"); if (!PL_do_undump) init_postdump_symbols(argc,argv,env); + if (PL_wantutf8) { /* Requires init_predump_symbols(). */ + IO* io; + PerlIO* fp; + SV* sv; + if (PL_stdingv && (io = GvIO(PL_stdingv)) && (fp = IoIFP(io))) + PerlIO_binmode(aTHX_ fp, IoTYPE(io), 0, ":utf8"); + if (PL_defoutgv && (io = GvIO(PL_defoutgv)) && (fp = IoOFP(io))) + PerlIO_binmode(aTHX_ fp, IoTYPE(io), 0, ":utf8"); + if (PL_stderrgv && (io = GvIO(PL_stderrgv)) && (fp = IoOFP(io))) + PerlIO_binmode(aTHX_ fp, IoTYPE(io), 0, ":utf8"); + if ((sv = GvSV(gv_fetchpv("\017PEN", TRUE, SVt_PV)))) { + sv_setpvn(sv, ":utf8\0:utf8", 11); + SvSETMAGIC(sv); + } + } + init_lexer(); /* now parse the script */ diff --git a/perlapi.h b/perlapi.h index 82af7c5..5070d1d 100644 --- a/perlapi.h +++ b/perlapi.h @@ -606,6 +606,8 @@ END_EXTERN_C #define PL_utf8_xdigit (*Perl_Iutf8_xdigit_ptr(aTHX)) #undef PL_uudmap #define PL_uudmap (*Perl_Iuudmap_ptr(aTHX)) +#undef PL_wantutf8 +#define PL_wantutf8 (*Perl_Iwantutf8_ptr(aTHX)) #undef PL_warnhook #define PL_warnhook (*Perl_Iwarnhook_ptr(aTHX)) #undef PL_widesyscalls diff --git a/pod/perldelta.pod b/pod/perldelta.pod index f973b31..8ea2a49 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -312,6 +312,13 @@ The list form of C is now implemented for pipes (at least on UNIX): creates a pipe, and runs the equivalent of exec('cat', '/etc/motd') in the child process. +=item * + +If your locale environment variables (LANGUAGE, LC_ALL, LC_CTYPE, LANG) +contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), +the default encoding of your STDIN, STDOUT, and STDERR, and of +B, is UTF-8. + =back =head2 Safe Signals @@ -334,7 +341,8 @@ out from potentially blocking operations should still work, though. Unicode in general should be now much more usable than in Perl 5.6.0 (or even in 5.6.1). 
Unicode can be used in hash keys, Unicode in regular expressions should work now, Unicode in tr/// should work now, -Unicode in I/O should work now. +Unicode in I/O should work now. See L for introduction +and L for details. =over 4 diff --git a/pod/perllocale.pod b/pod/perllocale.pod index 43ffe58..5605428 100644 --- a/pod/perllocale.pod +++ b/pod/perllocale.pod @@ -985,6 +985,15 @@ nations, when we all know that the world can equally well be divided into bankers, bikers, gamers, and so on. But, for now, it's the only standard we've got. This may be construed as a bug. +=head1 Unicode and UTF-8 + +The support of Unicode is new starting from Perl version 5.6, and +more fully implemented in the version 5.8. See L and +L for more details. + +Usually locale settings and Unicode do not affect each other, but +there are exceptions, see L for examples. + =head1 BUGS =head2 Broken systems @@ -1000,7 +1009,8 @@ operating system upgrade. =head1 SEE ALSO -L, L, L, +L, L, L, L, +L, L, L, L, L, L, L, L, L, L, L, diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 518d239..34e00c8 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -828,6 +828,29 @@ are specifically discussed. There is no C pragma or the platform's "natural" 8-bit encoding of Unicode. See L for more discussion of the issues. +=head2 Locales + +Usually locale settings and Unicode do not affect each other, but +there are a couple of exceptions: + +=over 4 + +=item * + +If your locale environment variables (LANGUAGE, LC_ALL, LC_CTYPE, LANG) +contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), +the default encoding of your STDIN, STDOUT, and STDERR, and of +B<any subsequent file open>, is UTF-8. + +=item * + +Perl tries really hard to work both with Unicode and the old byte +oriented world: most often this is nice, but sometimes this causes +problems. See L for example how sometimes using locales +with Unicode can be a good thing. 
+ +=back + =head2 Using Unicode in XS If you want to handle Perl Unicode in XS extensions, you may find @@ -936,7 +959,16 @@ Use of locales with Unicode data may lead to odd results. Currently there is some attempt to apply 8-bit locale info to characters in the range 0..255, but this is demonstrably incorrect for locales that use characters above that range when mapped into Unicode. It will also -tend to run slower. Avoidance of locales is strongly encouraged. +tend to run slower. Avoidance of locales is strongly encouraged, +with one known exception, see the next paragraph. + +If the keys of a hash are "mixed", that is, some keys are Unicode, +while some keys are "byte", the keys may behave differently in regular +expressions since the definition of character classes like C +is different for byte strings and character strings. This problem can +sometimes be helped by using an appropriate locale (see L). +Another way is to force all the strings to be character encoded by +using utf8::upgrade() (see L). Some functions are slower when working on UTF-8 encoded strings than on byte encoded strings. All functions that need to hop over diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index c560723..dd3064f 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -169,6 +169,11 @@ To output UTF-8 always, use the ":utf8" output discipline. Prepending to this sample program ensures the output is completely UTF-8, and of course, removes the warning. +If your locale environment variables (LANGUAGE, LC_ALL, LC_CTYPE, LANG) +contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), +the default encoding of your STDIN, STDOUT, and STDERR, and of +B<any subsequent file open>, is UTF-8. + =head2 Unicode and EBCDIC Perl 5.8.0 also supports Unicode on EBCDIC platforms. There,