From: Jarkko Hietaniemi Date: Sun, 28 Oct 2001 22:55:24 +0000 (+0000) Subject: Add the encoding pragma to control the "upgrade" X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=0a3788024daeeff27b99d9992ad4f1eb40663b1a;p=p5sagit%2Fp5-mst-13.2.git Add the encoding pragma to control the "upgrade" from the native eight bit data to Unicode. TODO: \x.. and \0... literals. \N{}. chr()? ord()? p4raw-id: //depot/perl@12750 --- diff --git a/MANIFEST b/MANIFEST index 957246e..42743f7 100644 --- a/MANIFEST +++ b/MANIFEST @@ -879,6 +879,8 @@ lib/dotsh.pl Code to "dot" in a shell script lib/Dumpvalue.pm Screen dump of perl values lib/Dumpvalue.t See if Dumpvalue works lib/dumpvar.pl A variable dumper +lib/encoding.pm Encoding of legacy data into Unicode +lib/encoding.t Test for the encoding pragma lib/English.pm Readable aliases for short variables lib/English.t See if English works lib/Env.pm Map environment into ordinary variables diff --git a/embedvar.h b/embedvar.h index 2eb5407..95550e6 100644 --- a/embedvar.h +++ b/embedvar.h @@ -231,6 +231,7 @@ #define PL_dowarn (PERL_GET_INTERP->Idowarn) #define PL_e_script (PERL_GET_INTERP->Ie_script) #define PL_egid (PERL_GET_INTERP->Iegid) +#define PL_encoding (PERL_GET_INTERP->Iencoding) #define PL_endav (PERL_GET_INTERP->Iendav) #define PL_envgv (PERL_GET_INTERP->Ienvgv) #define PL_errgv (PERL_GET_INTERP->Ierrgv) @@ -522,6 +523,7 @@ #define PL_dowarn (vTHX->Idowarn) #define PL_e_script (vTHX->Ie_script) #define PL_egid (vTHX->Iegid) +#define PL_encoding (vTHX->Iencoding) #define PL_endav (vTHX->Iendav) #define PL_envgv (vTHX->Ienvgv) #define PL_errgv (vTHX->Ierrgv) @@ -816,6 +818,7 @@ #define PL_Idowarn PL_dowarn #define PL_Ie_script PL_e_script #define PL_Iegid PL_egid +#define PL_Iencoding PL_encoding #define PL_Iendav PL_endav #define PL_Ienvgv PL_envgv #define PL_Ierrgv PL_errgv diff --git a/gv.c b/gv.c index e99b15c..53af8a5 100644 --- a/gv.c +++ b/gv.c @@ -887,7 +887,6 @@ Perl_gv_fetchpv(pTHX_ const char *nambeg, I32 add, I32 sv_type) case '\001': /* $^A */ case '\003': /* $^C */ case '\004': /* $^D */ - case '\005': /* $^E */ case '\006': /* $^F */ case '\010': /* $^H */ case '\011': /* $^I, NOT \t in EBCDIC */ @@ -901,6 +900,11 @@ Perl_gv_fetchpv(pTHX_ const char *nambeg, I32 add, I32 sv_type) break; sv_setiv(GvSV(gv), (IV)(IoFLAGS(GvIOp(PL_defoutgv)) & IOf_FLUSH) != 0); goto magicalize; + case '\005': /* $^E && $^ENCODING */ + if (len > 1 && strNE(name, "\005NCODING")) + break; + goto magicalize; + case '\017': /* $^O & $^OPEN */ if (len > 1 && strNE(name, "\017PEN")) break; diff --git a/intrpvar.h b/intrpvar.h index c224ff7..63c9397 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -499,6 +499,8 @@ PERLVARI(Iknown_layers, PerlIO_list_t *,NULL) PERLVARI(Idef_layerlist, PerlIO_list_t *,NULL) #endif +PERLVARI(Iencoding, SV*, Nullsv) /* character encoding */ + /* New variables must be added to the very end for binary compatibility. * XSUB.h provides wrapper functions via perlapi.h that make this * irrelevant, but not all code may be expected to #include XSUB.h. */ diff --git a/lib/encoding.pm b/lib/encoding.pm new file mode 100644 index 0000000..472a10a --- /dev/null +++ b/lib/encoding.pm @@ -0,0 +1,54 @@ +package encoding; + +use Encode; + +sub import { + my ($class, $name) = @_; + $name = $ENV{PERL_ENCODING} if @_ < 2; + my $enc = find_encoding($name); + unless (defined $enc) { + require Carp; + Carp::croak "Unknown encoding '$name'"; + } + ${^ENCODING} = $enc; +} + +=pod + +=head1 NAME + +encoding - pragma to control the conversion of legacy data into Unicode + +=head1 SYNOPSIS + + use encoding "iso 8859-7"; + + $a = "\xDF"; + $b = "\x{100}"; + + $c = $a . $b; + + # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". + # The \xDF of ISO 8859-7 is \x{3af} in Unicode. + +=head1 DESCRIPTION + +Normally when legacy 8-bit data is converted to Unicode the data is +expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the +encoding pragma you can change this default. + +The pragma is a per script, not a per block lexical. Only the last +'use encoding' seen matters. + +=head1 FUTURE POSSIBILITIES + +The C<\x..> and C<\0...> in literals and regular expressions are not +affected by this pragma. They probably should. + +=head1 SEE ALSO + +L + +=cut + +1; diff --git a/lib/encoding.t b/lib/encoding.t new file mode 100644 index 0000000..6e18d34 --- /dev/null +++ b/lib/encoding.t @@ -0,0 +1,24 @@ +print "1..3\n"; + +use encoding "latin1"; # ignored (overwritten by the next line) +use encoding "greek"; + +$a = "\xDF"; +$b = "\x{100}"; + +my $c = $a . $b; + +# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is +# \x3AF in Unicode (GREEK SMALL LETTER IOTA WITH TONOS), +# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S) + +print "not " unless ord($c) == 0x3af; +print "ok 1\n"; + +print "not " unless length($c) == 2; +print "ok 2\n"; + +print "not " unless ord(substr($c, 1, 1)) == 0x100; +print "ok 3\n"; + + diff --git a/mg.c b/mg.c index 793035d..3608e6a 100644 --- a/mg.c +++ b/mg.c @@ -519,62 +519,66 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) #endif break; case '\005': /* ^E */ + if (*(mg->mg_ptr+1) == '\0') { #ifdef MACOS_TRADITIONAL - { - char msg[256]; + { + char msg[256]; - sv_setnv(sv,(double)gMacPerl_OSErr); - sv_setpv(sv, gMacPerl_OSErr ? GetSysErrText(gMacPerl_OSErr, msg) : ""); - } + sv_setnv(sv,(double)gMacPerl_OSErr); + sv_setpv(sv, gMacPerl_OSErr ? GetSysErrText(gMacPerl_OSErr, msg) : ""); + } #else #ifdef VMS - { -# include -# include - char msg[255]; - $DESCRIPTOR(msgdsc,msg); - sv_setnv(sv,(NV) vaxc$errno); - if (sys$getmsg(vaxc$errno,&msgdsc.dsc$w_length,&msgdsc,0,0) & 1) - sv_setpvn(sv,msgdsc.dsc$a_pointer,msgdsc.dsc$w_length); - else - sv_setpv(sv,""); - } + { +# include +# include + char msg[255]; + $DESCRIPTOR(msgdsc,msg); + sv_setnv(sv,(NV) vaxc$errno); + if (sys$getmsg(vaxc$errno,&msgdsc.dsc$w_length,&msgdsc,0,0) & 1) + sv_setpvn(sv,msgdsc.dsc$a_pointer,msgdsc.dsc$w_length); + else + sv_setpv(sv,""); + } #else #ifdef OS2 - if (!(_emx_env & 0x200)) { /* Under DOS */ - sv_setnv(sv, (NV)errno); - sv_setpv(sv, errno ? Strerror(errno) : ""); - } else { - if (errno != errno_isOS2) { - int tmp = _syserrno(); - if (tmp) /* 2nd call to _syserrno() makes it 0 */ - Perl_rc = tmp; - } - sv_setnv(sv, (NV)Perl_rc); - sv_setpv(sv, os2error(Perl_rc)); - } + if (!(_emx_env & 0x200)) { /* Under DOS */ + sv_setnv(sv, (NV)errno); + sv_setpv(sv, errno ? Strerror(errno) : ""); + } else { + if (errno != errno_isOS2) { + int tmp = _syserrno(); + if (tmp) /* 2nd call to _syserrno() makes it 0 */ + Perl_rc = tmp; + } + sv_setnv(sv, (NV)Perl_rc); + sv_setpv(sv, os2error(Perl_rc)); + } #else #ifdef WIN32 - { - DWORD dwErr = GetLastError(); - sv_setnv(sv, (NV)dwErr); - if (dwErr) - { - PerlProc_GetOSError(sv, dwErr); - } - else - sv_setpv(sv, ""); - SetLastError(dwErr); - } + { + DWORD dwErr = GetLastError(); + sv_setnv(sv, (NV)dwErr); + if (dwErr) + { + PerlProc_GetOSError(sv, dwErr); + } + else + sv_setpv(sv, ""); + SetLastError(dwErr); + } #else - sv_setnv(sv, (NV)errno); - sv_setpv(sv, errno ? Strerror(errno) : ""); + sv_setnv(sv, (NV)errno); + sv_setpv(sv, errno ? Strerror(errno) : ""); #endif #endif #endif #endif - SvNOK_on(sv); /* what a wonderful hack! */ - break; + SvNOK_on(sv); /* what a wonderful hack! */ + } + else if (strEQ(mg->mg_ptr+1, "NCODING")) + sv_setsv(sv, PL_encoding); + break; case '\006': /* ^F */ sv_setiv(sv, (IV)PL_maxsysfd); break; @@ -625,7 +629,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) case '\027': /* ^W & $^WARNING_BITS & ^WIDE_SYSTEM_CALLS */ if (*(mg->mg_ptr+1) == '\0') sv_setiv(sv, (IV)((PL_dowarn & G_WARN_ON) ? TRUE : FALSE)); - else if (strEQ(mg->mg_ptr, "\027ARNING_BITS")) { + else if (strEQ(mg->mg_ptr+1, "ARNING_BITS")) { if (PL_compiling.cop_warnings == pWARN_NONE || PL_compiling.cop_warnings == pWARN_STD) { @@ -639,7 +643,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) } SvPOK_only(sv); } - else if (strEQ(mg->mg_ptr, "\027IDE_SYSTEM_CALLS")) + else if (strEQ(mg->mg_ptr+1, "IDE_SYSTEM_CALLS")) sv_setiv(sv, (IV)PL_widesyscalls); break; case '1': case '2': case '3': case '4': @@ -1742,25 +1746,32 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg) DEBUG_x(dump_all()); break; case '\005': /* ^E */ + if (*(mg->mg_ptr+1) == '\0') { #ifdef MACOS_TRADITIONAL - gMacPerl_OSErr = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv); + gMacPerl_OSErr = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv); #else # ifdef VMS - set_vaxc_errno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv)); + set_vaxc_errno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv)); # else # ifdef WIN32 - SetLastError( SvIV(sv) ); + SetLastError( SvIV(sv) ); # else # ifdef OS2 - os2_setsyserrno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv)); + os2_setsyserrno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv)); # else - /* will anyone ever use this? */ - SETERRNO(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv), 4); + /* will anyone ever use this? */ + SETERRNO(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv), 4); # endif # endif # endif #endif - break; + } + else if (strEQ(mg->mg_ptr+1, "NCODING")) { + if (PL_encoding) + sv_setsv(PL_encoding, sv); + else + PL_encoding = newSVsv(sv); + } case '\006': /* ^F */ PL_maxsysfd = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv); break; @@ -1811,7 +1822,7 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg) | (i ? G_WARN_ON : G_WARN_OFF) ; } } - else if (strEQ(mg->mg_ptr, "\027ARNING_BITS")) { + else if (strEQ(mg->mg_ptr+1, "ARNING_BITS")) { if ( ! (PL_dowarn & G_WARN_ALL_MASK)) { if (!SvPOK(sv) && PL_localizing) { sv_setpvn(sv, WARN_NONEstring, WARNsize); @@ -1845,7 +1856,7 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg) } } } - else if (strEQ(mg->mg_ptr, "\027IDE_SYSTEM_CALLS")) + else if (strEQ(mg->mg_ptr+1, "IDE_SYSTEM_CALLS")) PL_widesyscalls = SvTRUE(sv); break; case '.': diff --git a/perlapi.h b/perlapi.h index 2811a44..0592374 100644 --- a/perlapi.h +++ b/perlapi.h @@ -199,6 +199,8 @@ END_EXTERN_C #define PL_e_script (*Perl_Ie_script_ptr(aTHX)) #undef PL_egid #define PL_egid (*Perl_Iegid_ptr(aTHX)) +#undef PL_encoding +#define PL_encoding (*Perl_Iencoding_ptr(aTHX)) #undef PL_endav #define PL_endav (*Perl_Iendav_ptr(aTHX)) #undef PL_envgv diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 0b52afa..9205fdf 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -102,10 +102,11 @@ literal UTF-8 string constant in the program), character semantics apply; otherwise, byte semantics are in effect. To force byte semantics on Unicode data, the C pragma should be used. -Notice that if you have a string with byte semantics and you then -add character data into it, the bytes will be upgraded I (or if in EBCDIC, after a translation -to ISO 8859-1). +Notice that if you concatenate strings with byte semantics and strings +with Unicode character data, the bytes will by default be upgraded +I (or if in EBCDIC, after a +translation to ISO 8859-1). To change this, use the C +pragma, see L. Under character semantics, many operations that formerly operated on bytes change to operating on characters. For ASCII data this makes no diff --git a/pod/perlvar.pod b/pod/perlvar.pod index 64fc7fd..d34daa6 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -654,6 +654,11 @@ status; see L for details. Also see L. +=item ${^ENCODING} + +The encoding used to interpret native eight-bit encodings to Unicode, +see L. An opaque C object. + =item $OS_ERROR =item $ERRNO diff --git a/sv.c b/sv.c index 5885b8e..520734c 100644 --- a/sv.c +++ b/sv.c @@ -3302,27 +3302,54 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) sv_force_normal(sv); } - /* This function could be much more efficient if we had a FLAG in SVs - * to signal if there are any hibit chars in the PV. - * Given that there isn't make loop fast as possible - */ - s = (U8 *) SvPVX(sv); - e = (U8 *) SvEND(sv); - t = s; - while (t < e) { - U8 ch = *t++; - if ((hibit = !NATIVE_IS_INVARIANT(ch))) - break; - } - if (hibit) { - STRLEN len; - - len = SvCUR(sv) + 1; /* Plus the \0 */ - SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len); - SvCUR(sv) = len - 1; - if (SvLEN(sv) != 0) - Safefree(s); /* No longer using what was there before. */ - SvLEN(sv) = len; /* No longer know the real size. */ + if (PL_encoding) { + SV *uni; + STRLEN len; + char *s; + dSP; + ENTER; + SAVETMPS; + PUSHMARK(sp); + EXTEND(SP, 3); + XPUSHs(PL_encoding); + XPUSHs(sv); + XPUSHs(&PL_sv_yes); + PUTBACK; + call_method("decode", G_SCALAR); + SPAGAIN; + uni = POPs; + PUTBACK; + s = SvPVutf8(uni, len); + if (s != SvPVX(sv)) { + SvGROW(sv, len); + Move(s, SvPVX(sv), len, char); + SvCUR_set(sv, len); + } + FREETMPS; + LEAVE; + } else { /* Assume Latin-1/EBCDIC */ + /* This function could be much more efficient if we + * had a FLAG in SVs to signal if there are any hibit + * chars in the PV. Given that there isn't such a flag + * make the loop as fast as possible. */ + s = (U8 *) SvPVX(sv); + e = (U8 *) SvEND(sv); + t = s; + while (t < e) { + U8 ch = *t++; + if ((hibit = !NATIVE_IS_INVARIANT(ch))) + break; + } + if (hibit) { + STRLEN len; + + len = SvCUR(sv) + 1; /* Plus the \0 */ + SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len); + SvCUR(sv) = len - 1; + if (SvLEN(sv) != 0) + Safefree(s); /* No longer using what was there before. */ + SvLEN(sv) = len; /* No longer know the real size. */ + } } /* Mark as UTF-8 even if no hibit - saves scanning loop */ SvUTF8_on(sv); @@ -9827,6 +9854,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, #ifdef VMS PL_statusvalue_vms = proto_perl->Istatusvalue_vms; #endif + PL_encoding = sv_dup(proto_perl->Iencoding, param); /* Clone the regex array */ PL_regex_padav = newAV();