from the native eight bit data to Unicode.
TODO: \x.. and \0... literals. \N{}. chr()? ord()?
p4raw-id: //depot/perl@12750
lib/Dumpvalue.pm Screen dump of perl values
lib/Dumpvalue.t See if Dumpvalue works
lib/dumpvar.pl A variable dumper
+lib/encoding.pm Encoding of legacy data into Unicode
+lib/encoding.t Test for the encoding pragma
lib/English.pm Readable aliases for short variables
lib/English.t See if English works
lib/Env.pm Map environment into ordinary variables
#define PL_dowarn (PERL_GET_INTERP->Idowarn)
#define PL_e_script (PERL_GET_INTERP->Ie_script)
#define PL_egid (PERL_GET_INTERP->Iegid)
+#define PL_encoding (PERL_GET_INTERP->Iencoding)
#define PL_endav (PERL_GET_INTERP->Iendav)
#define PL_envgv (PERL_GET_INTERP->Ienvgv)
#define PL_errgv (PERL_GET_INTERP->Ierrgv)
#define PL_dowarn (vTHX->Idowarn)
#define PL_e_script (vTHX->Ie_script)
#define PL_egid (vTHX->Iegid)
+#define PL_encoding (vTHX->Iencoding)
#define PL_endav (vTHX->Iendav)
#define PL_envgv (vTHX->Ienvgv)
#define PL_errgv (vTHX->Ierrgv)
#define PL_Idowarn PL_dowarn
#define PL_Ie_script PL_e_script
#define PL_Iegid PL_egid
+#define PL_Iencoding PL_encoding
#define PL_Iendav PL_endav
#define PL_Ienvgv PL_envgv
#define PL_Ierrgv PL_errgv
case '\001': /* $^A */
case '\003': /* $^C */
case '\004': /* $^D */
- case '\005': /* $^E */
case '\006': /* $^F */
case '\010': /* $^H */
case '\011': /* $^I, NOT \t in EBCDIC */
break;
sv_setiv(GvSV(gv), (IV)(IoFLAGS(GvIOp(PL_defoutgv)) & IOf_FLUSH) != 0);
goto magicalize;
+ case '\005': /* $^E && $^ENCODING */
+ if (len > 1 && strNE(name, "\005NCODING"))
+ break;
+ goto magicalize;
+
case '\017': /* $^O & $^OPEN */
if (len > 1 && strNE(name, "\017PEN"))
break;
PERLVARI(Idef_layerlist, PerlIO_list_t *,NULL)
#endif
+PERLVARI(Iencoding, SV*, Nullsv) /* character encoding */
+
/* New variables must be added to the very end for binary compatibility.
* XSUB.h provides wrapper functions via perlapi.h that make this
* irrelevant, but not all code may be expected to #include XSUB.h. */
--- /dev/null
+package encoding;
+
+use Encode;
+
+sub import {
+ my ($class, $name) = @_;
+ $name = $ENV{PERL_ENCODING} if @_ < 2;
+ my $enc = find_encoding($name);
+ unless (defined $enc) {
+ require Carp;
+ Carp::croak "Unknown encoding '$name'";
+ }
+ ${^ENCODING} = $enc;
+}
+
+=pod
+
+=head1 NAME
+
+encoding - pragma to control the conversion of legacy data into Unicode
+
+=head1 SYNOPSIS
+
+ use encoding "iso 8859-7";
+
+ $a = "\xDF";
+ $b = "\x{100}";
+
+ $c = $a . $b;
+
+ # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
+ # The \xDF of ISO 8859-7 is \x{3af} in Unicode.
+
+=head1 DESCRIPTION
+
+Normally when legacy 8-bit data is converted to Unicode the data is
+expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the
+encoding pragma you can change this default.
+
+The pragma is a per script, not a per block lexical. Only the last
+'use encoding' seen matters.
+
+=head1 FUTURE POSSIBILITIES
+
+The C<\x..> and C<\0...> in literals and regular expressions are not
+affected by this pragma. They probably should.
+
+=head1 SEE ALSO
+
+L<perlunicode>
+
+=cut
+
+1;
--- /dev/null
+print "1..3\n";
+
+use encoding "latin1"; # ignored (overwritten by the next line)
+use encoding "greek";
+
+$a = "\xDF";
+$b = "\x{100}";
+
+my $c = $a . $b;
+
+# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
+# \x3AF in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
+# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)
+
+print "not " unless ord($c) == 0x3af;
+print "ok 1\n";
+
+print "not " unless length($c) == 2;
+print "ok 2\n";
+
+print "not " unless ord(substr($c, 1, 1)) == 0x100;
+print "ok 3\n";
+
+
#endif
break;
case '\005': /* ^E */
+ if (*(mg->mg_ptr+1) == '\0') {
#ifdef MACOS_TRADITIONAL
- {
- char msg[256];
+ {
+ char msg[256];
- sv_setnv(sv,(double)gMacPerl_OSErr);
- sv_setpv(sv, gMacPerl_OSErr ? GetSysErrText(gMacPerl_OSErr, msg) : "");
- }
+ sv_setnv(sv,(double)gMacPerl_OSErr);
+ sv_setpv(sv, gMacPerl_OSErr ? GetSysErrText(gMacPerl_OSErr, msg) : "");
+ }
#else
#ifdef VMS
- {
-# include <descrip.h>
-# include <starlet.h>
- char msg[255];
- $DESCRIPTOR(msgdsc,msg);
- sv_setnv(sv,(NV) vaxc$errno);
- if (sys$getmsg(vaxc$errno,&msgdsc.dsc$w_length,&msgdsc,0,0) & 1)
- sv_setpvn(sv,msgdsc.dsc$a_pointer,msgdsc.dsc$w_length);
- else
- sv_setpv(sv,"");
- }
+ {
+# include <descrip.h>
+# include <starlet.h>
+ char msg[255];
+ $DESCRIPTOR(msgdsc,msg);
+ sv_setnv(sv,(NV) vaxc$errno);
+ if (sys$getmsg(vaxc$errno,&msgdsc.dsc$w_length,&msgdsc,0,0) & 1)
+ sv_setpvn(sv,msgdsc.dsc$a_pointer,msgdsc.dsc$w_length);
+ else
+ sv_setpv(sv,"");
+ }
#else
#ifdef OS2
- if (!(_emx_env & 0x200)) { /* Under DOS */
- sv_setnv(sv, (NV)errno);
- sv_setpv(sv, errno ? Strerror(errno) : "");
- } else {
- if (errno != errno_isOS2) {
- int tmp = _syserrno();
- if (tmp) /* 2nd call to _syserrno() makes it 0 */
- Perl_rc = tmp;
- }
- sv_setnv(sv, (NV)Perl_rc);
- sv_setpv(sv, os2error(Perl_rc));
- }
+ if (!(_emx_env & 0x200)) { /* Under DOS */
+ sv_setnv(sv, (NV)errno);
+ sv_setpv(sv, errno ? Strerror(errno) : "");
+ } else {
+ if (errno != errno_isOS2) {
+ int tmp = _syserrno();
+ if (tmp) /* 2nd call to _syserrno() makes it 0 */
+ Perl_rc = tmp;
+ }
+ sv_setnv(sv, (NV)Perl_rc);
+ sv_setpv(sv, os2error(Perl_rc));
+ }
#else
#ifdef WIN32
- {
- DWORD dwErr = GetLastError();
- sv_setnv(sv, (NV)dwErr);
- if (dwErr)
- {
- PerlProc_GetOSError(sv, dwErr);
- }
- else
- sv_setpv(sv, "");
- SetLastError(dwErr);
- }
+ {
+ DWORD dwErr = GetLastError();
+ sv_setnv(sv, (NV)dwErr);
+ if (dwErr)
+ {
+ PerlProc_GetOSError(sv, dwErr);
+ }
+ else
+ sv_setpv(sv, "");
+ SetLastError(dwErr);
+ }
#else
- sv_setnv(sv, (NV)errno);
- sv_setpv(sv, errno ? Strerror(errno) : "");
+ sv_setnv(sv, (NV)errno);
+ sv_setpv(sv, errno ? Strerror(errno) : "");
#endif
#endif
#endif
#endif
- SvNOK_on(sv); /* what a wonderful hack! */
- break;
+ SvNOK_on(sv); /* what a wonderful hack! */
+ }
+ else if (strEQ(mg->mg_ptr+1, "NCODING"))
+ sv_setsv(sv, PL_encoding);
+ break;
case '\006': /* ^F */
sv_setiv(sv, (IV)PL_maxsysfd);
break;
case '\027': /* ^W & $^WARNING_BITS & ^WIDE_SYSTEM_CALLS */
if (*(mg->mg_ptr+1) == '\0')
sv_setiv(sv, (IV)((PL_dowarn & G_WARN_ON) ? TRUE : FALSE));
- else if (strEQ(mg->mg_ptr, "\027ARNING_BITS")) {
+ else if (strEQ(mg->mg_ptr+1, "ARNING_BITS")) {
if (PL_compiling.cop_warnings == pWARN_NONE ||
PL_compiling.cop_warnings == pWARN_STD)
{
}
SvPOK_only(sv);
}
- else if (strEQ(mg->mg_ptr, "\027IDE_SYSTEM_CALLS"))
+ else if (strEQ(mg->mg_ptr+1, "IDE_SYSTEM_CALLS"))
sv_setiv(sv, (IV)PL_widesyscalls);
break;
case '1': case '2': case '3': case '4':
DEBUG_x(dump_all());
break;
case '\005': /* ^E */
+ if (*(mg->mg_ptr+1) == '\0') {
#ifdef MACOS_TRADITIONAL
- gMacPerl_OSErr = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv);
+ gMacPerl_OSErr = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv);
#else
# ifdef VMS
- set_vaxc_errno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv));
+ set_vaxc_errno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv));
# else
# ifdef WIN32
- SetLastError( SvIV(sv) );
+ SetLastError( SvIV(sv) );
# else
# ifdef OS2
- os2_setsyserrno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv));
+ os2_setsyserrno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv));
# else
- /* will anyone ever use this? */
- SETERRNO(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv), 4);
+ /* will anyone ever use this? */
+ SETERRNO(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv), 4);
# endif
# endif
# endif
#endif
- break;
+ }
+ else if (strEQ(mg->mg_ptr+1, "NCODING")) {
+ if (PL_encoding)
+ sv_setsv(PL_encoding, sv);
+ else
+ PL_encoding = newSVsv(sv);
+ }
case '\006': /* ^F */
PL_maxsysfd = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv);
break;
| (i ? G_WARN_ON : G_WARN_OFF) ;
}
}
- else if (strEQ(mg->mg_ptr, "\027ARNING_BITS")) {
+ else if (strEQ(mg->mg_ptr+1, "ARNING_BITS")) {
if ( ! (PL_dowarn & G_WARN_ALL_MASK)) {
if (!SvPOK(sv) && PL_localizing) {
sv_setpvn(sv, WARN_NONEstring, WARNsize);
}
}
}
- else if (strEQ(mg->mg_ptr, "\027IDE_SYSTEM_CALLS"))
+ else if (strEQ(mg->mg_ptr+1, "IDE_SYSTEM_CALLS"))
PL_widesyscalls = SvTRUE(sv);
break;
case '.':
#define PL_e_script (*Perl_Ie_script_ptr(aTHX))
#undef PL_egid
#define PL_egid (*Perl_Iegid_ptr(aTHX))
+#undef PL_encoding
+#define PL_encoding (*Perl_Iencoding_ptr(aTHX))
#undef PL_endav
#define PL_endav (*Perl_Iendav_ptr(aTHX))
#undef PL_envgv
apply; otherwise, byte semantics are in effect. To force byte semantics
on Unicode data, the C<bytes> pragma should be used.
-Notice that if you have a string with byte semantics and you then
-add character data into it, the bytes will be upgraded I<as if they
-were ISO 8859-1 (Latin-1)> (or if in EBCDIC, after a translation
-to ISO 8859-1).
+Notice that if you concatenate strings with byte semantics and strings
+with Unicode character data, the bytes will by default be upgraded
+I<as if they were ISO 8859-1 (Latin-1)> (or if in EBCDIC, after a
+translation to ISO 8859-1). To change this, use the C<encoding>
+pragma, see L<encoding>.
Under character semantics, many operations that formerly operated on
bytes change to operating on characters. For ASCII data this makes no
Also see L<Error Indicators>.
+=item ${^ENCODING}
+
+The encoding used to interpret native eight-bit encodings to Unicode,
+see L<encode>. An opaque C<Encode::XS> object.
+
=item $OS_ERROR
=item $ERRNO
sv_force_normal(sv);
}
- /* This function could be much more efficient if we had a FLAG in SVs
- * to signal if there are any hibit chars in the PV.
- * Given that there isn't make loop fast as possible
- */
- s = (U8 *) SvPVX(sv);
- e = (U8 *) SvEND(sv);
- t = s;
- while (t < e) {
- U8 ch = *t++;
- if ((hibit = !NATIVE_IS_INVARIANT(ch)))
- break;
- }
- if (hibit) {
- STRLEN len;
-
- len = SvCUR(sv) + 1; /* Plus the \0 */
- SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len);
- SvCUR(sv) = len - 1;
- if (SvLEN(sv) != 0)
- Safefree(s); /* No longer using what was there before. */
- SvLEN(sv) = len; /* No longer know the real size. */
+ if (PL_encoding) {
+ SV *uni;
+ STRLEN len;
+ char *s;
+ dSP;
+ ENTER;
+ SAVETMPS;
+ PUSHMARK(sp);
+ EXTEND(SP, 3);
+ XPUSHs(PL_encoding);
+ XPUSHs(sv);
+ XPUSHs(&PL_sv_yes);
+ PUTBACK;
+ call_method("decode", G_SCALAR);
+ SPAGAIN;
+ uni = POPs;
+ PUTBACK;
+ s = SvPVutf8(uni, len);
+ if (s != SvPVX(sv)) {
+ SvGROW(sv, len);
+ Move(s, SvPVX(sv), len, char);
+ SvCUR_set(sv, len);
+ }
+ FREETMPS;
+ LEAVE;
+ } else { /* Assume Latin-1/EBCDIC */
+ /* This function could be much more efficient if we
+ * had a FLAG in SVs to signal if there are any hibit
+ * chars in the PV. Given that there isn't such a flag
+ * make the loop as fast as possible. */
+ s = (U8 *) SvPVX(sv);
+ e = (U8 *) SvEND(sv);
+ t = s;
+ while (t < e) {
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
+ break;
+ }
+ if (hibit) {
+ STRLEN len;
+
+ len = SvCUR(sv) + 1; /* Plus the \0 */
+ SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len);
+ SvCUR(sv) = len - 1;
+ if (SvLEN(sv) != 0)
+ Safefree(s); /* No longer using what was there before. */
+ SvLEN(sv) = len; /* No longer know the real size. */
+ }
}
/* Mark as UTF-8 even if no hibit - saves scanning loop */
SvUTF8_on(sv);
#ifdef VMS
PL_statusvalue_vms = proto_perl->Istatusvalue_vms;
#endif
+ PL_encoding = sv_dup(proto_perl->Iencoding, param);
/* Clone the regex array */
PL_regex_padav = newAV();