From: Jarkko Hietaniemi Date: Wed, 31 Oct 2001 00:57:42 +0000 (+0000) Subject: Enable -Mencoding=foobar also for string literals. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9f4817dbf9f375fea5253c12557f39e3774b891c;p=p5sagit%2Fp5-mst-13.2.git Enable -Mencoding=foobar also for string literals. p4raw-id: //depot/perl@12782 --- diff --git a/embed.h b/embed.h index 4ac3878..71fb041 100644 --- a/embed.h +++ b/embed.h @@ -688,6 +688,7 @@ #define sv_pos_b2u Perl_sv_pos_b2u #define sv_pvutf8n_force Perl_sv_pvutf8n_force #define sv_pvbyten_force Perl_sv_pvbyten_force +#define sv_recode_to_utf8 Perl_sv_recode_to_utf8 #define sv_reftype Perl_sv_reftype #define sv_replace Perl_sv_replace #define sv_report_used Perl_sv_report_used @@ -2201,6 +2202,7 @@ #define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b) #define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b) #define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b) +#define sv_recode_to_utf8(a) Perl_sv_recode_to_utf8(aTHX_ a) #define sv_reftype(a,b) Perl_sv_reftype(aTHX_ a,b) #define sv_replace(a,b) Perl_sv_replace(aTHX_ a,b) #define sv_report_used() Perl_sv_report_used(aTHX) diff --git a/embed.pl b/embed.pl index 345a299..392e16c 100755 --- a/embed.pl +++ b/embed.pl @@ -1779,6 +1779,7 @@ Apd |void |sv_pos_b2u |SV* sv|I32* offsetp Amd |char* |sv_pvn_force |SV* sv|STRLEN* lp Apd |char* |sv_pvutf8n_force|SV* sv|STRLEN* lp Apd |char* |sv_pvbyten_force|SV* sv|STRLEN* lp +Apd |void |sv_recode_to_utf8 |SV* Apd |char* |sv_reftype |SV* sv|int ob Apd |void |sv_replace |SV* sv|SV* nsv Apd |void |sv_report_used diff --git a/global.sym b/global.sym index c2535a9..cf8ec98 100644 --- a/global.sym +++ b/global.sym @@ -438,6 +438,7 @@ Perl_sv_pos_u2b Perl_sv_pos_b2u Perl_sv_pvutf8n_force Perl_sv_pvbyten_force +Perl_sv_recode_to_utf8 Perl_sv_reftype Perl_sv_replace Perl_sv_report_used diff --git a/lib/encoding.pm b/lib/encoding.pm index 7dacd7a..1addeb4 100644 --- a/lib/encoding.pm +++ b/lib/encoding.pm @@ -38,12 +38,14 @@ expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the encoding pragma you can change this default. The pragma is a per script, not a per block lexical. Only the last -'use encoding' seen matters. +C matters, and it affects B. =head1 FUTURE POSSIBILITIES -The C<\x..> and C<\0...> in literals and regular expressions are not -affected by this pragma. They probably should. Ditto C<\N{...}>. +The C<\x..> and C<\0...> in regular expressions are not +affected by this pragma. They probably should. + +Also C<\N{...}> might become affected. =head1 SEE ALSO diff --git a/lib/encoding.t b/lib/encoding.t index 40d97a2..2be0312 100644 --- a/lib/encoding.t +++ b/lib/encoding.t @@ -1,24 +1,31 @@ -print "1..3\n"; +print "1..5\n"; use encoding "latin1"; # ignored (overwritten by the next line) use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...) -$a = "\xDF"; -$b = "\x{100}"; - -my $c = $a . $b; - # "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is # \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS), # instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S) -print "not " unless ord($c) == 0x3af; +$a = "\xDF"; +$b = "\x{100}"; + +print "not " unless ord($a) == 0x3af; print "ok 1\n"; -print "not " unless length($c) == 2; +print "not " unless ord($b) == 0x100; print "ok 2\n"; -print "not " unless ord(substr($c, 1, 1)) == 0x100; +my $c; + +$c = $a . $b; + +print "not " unless ord($c) == 0x3af; print "ok 3\n"; +print "not " unless length($c) == 2; +print "ok 4\n"; + +print "not " unless ord(substr($c, 1, 1)) == 0x100; +print "ok 5\n"; diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 0435058..41d2373 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -1404,17 +1404,6 @@ SV is B incremented. =for hackers Found in file sv.c -=item newSV - -Create a new null SV, or if len > 0, create a new empty SVt_PV type SV -with an initial PV allocation of len+1. Normally accessed via the C -macro. - - SV* newSV(STRLEN len) - -=for hackers -Found in file sv.c - =item NEWSV Creates a new SV. A non-zero C parameter indicates the number of @@ -1428,6 +1417,17 @@ C is an integer id between 0 and 1299 (used to identify leaks). =for hackers Found in file handy.h +=item newSV + +Create a new null SV, or if len > 0, create a new empty SVt_PV type SV +with an initial PV allocation of len+1. Normally accessed via the C +macro. + + SV* newSV(STRLEN len) + +=for hackers +Found in file sv.c + =item newSViv Creates a new SV and copies an integer into it. The reference count for the @@ -2282,22 +2282,22 @@ version which guarantees to evaluate sv only once. =for hackers Found in file sv.h -=item SvIVX +=item SvIVx -Returns the raw value in the SV's IV slot, without checks or conversions. -Only use when you are sure SvIOK is true. See also C. +Coerces the given SV to an integer and returns it. Guarantees to evaluate +sv only once. Use the more efficent C otherwise. - IV SvIVX(SV* sv) + IV SvIVx(SV* sv) =for hackers Found in file sv.h -=item SvIVx +=item SvIVX -Coerces the given SV to an integer and returns it. Guarantees to evaluate -sv only once. Use the more efficent C otherwise. +Returns the raw value in the SV's IV slot, without checks or conversions. +Only use when you are sure SvIOK is true. See also C. - IV SvIVx(SV* sv) + IV SvIVX(SV* sv) =for hackers Found in file sv.h @@ -2606,21 +2606,21 @@ Like C, but converts sv to utf8 first if necessary. =for hackers Found in file sv.h -=item SvPVx +=item SvPVX -A version of C which guarantees to evaluate sv only once. +Returns a pointer to the physical string in the SV. The SV must contain a +string. - char* SvPVx(SV* sv, STRLEN len) + char* SvPVX(SV* sv) =for hackers Found in file sv.h -=item SvPVX +=item SvPVx -Returns a pointer to the physical string in the SV. The SV must contain a -string. +A version of C which guarantees to evaluate sv only once. - char* SvPVX(SV* sv) + char* SvPVx(SV* sv, STRLEN len) =for hackers Found in file sv.h @@ -2827,19 +2827,19 @@ false, defined or undefined. Does not handle 'get' magic. =for hackers Found in file sv.h -=item SvTYPE - -Returns the type of the SV. See C. +=item svtype - svtype SvTYPE(SV* sv) +An enum of flags for Perl types. These are found in the file B +in the C enum. Test these flags with the C macro. =for hackers Found in file sv.h -=item svtype +=item SvTYPE -An enum of flags for Perl types. These are found in the file B -in the C enum. Test these flags with the C macro. +Returns the type of the SV. See C. + + svtype SvTYPE(SV* sv) =for hackers Found in file sv.h @@ -2950,22 +2950,22 @@ for a version which guarantees to evaluate sv only once. =for hackers Found in file sv.h -=item SvUVX +=item SvUVx -Returns the raw value in the SV's UV slot, without checks or conversions. -Only use when you are sure SvIOK is true. See also C. +Coerces the given SV to an unsigned integer and returns it. Guarantees to +evaluate sv only once. Use the more efficent C otherwise. - UV SvUVX(SV* sv) + UV SvUVx(SV* sv) =for hackers Found in file sv.h -=item SvUVx +=item SvUVX -Coerces the given SV to an unsigned integer and returns it. Guarantees to -evaluate sv only once. Use the more efficent C otherwise. +Returns the raw value in the SV's UV slot, without checks or conversions. +Only use when you are sure SvIOK is true. See also C. - UV SvUVx(SV* sv) + UV SvUVX(SV* sv) =for hackers Found in file sv.h @@ -3662,6 +3662,20 @@ instead. =for hackers Found in file sv.c +=item sv_recode_to_utf8 + +If PL_encoding is set you can call this to recode the pv of the sv. +The PL_encoding is assumed to be an Encode object, on entry the pv is assumed +to be octets in that encoding, and the sv will be converted into Unicode +(and UTF-8). + +If PL_encoding is not an Encode object, things will go boom. + + void sv_recode_to_utf8(SV*) + +=for hackers +Found in file sv.c + =item sv_reftype Returns a string describing what the SV is a reference to. diff --git a/proto.h b/proto.h index dc96ebe..7d9bc02 100644 --- a/proto.h +++ b/proto.h @@ -760,6 +760,7 @@ PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV* sv, I32* offsetp); /* PERL_CALLCONV char* sv_pvn_force(pTHX_ SV* sv, STRLEN* lp); */ PERL_CALLCONV char* Perl_sv_pvutf8n_force(pTHX_ SV* sv, STRLEN* lp); PERL_CALLCONV char* Perl_sv_pvbyten_force(pTHX_ SV* sv, STRLEN* lp); +PERL_CALLCONV void Perl_sv_recode_to_utf8(pTHX_ SV*); PERL_CALLCONV char* Perl_sv_reftype(pTHX_ SV* sv, int ob); PERL_CALLCONV void Perl_sv_replace(pTHX_ SV* sv, SV* nsv); PERL_CALLCONV void Perl_sv_report_used(pTHX); diff --git a/sv.c b/sv.c index 520734c..a447517 100644 --- a/sv.c +++ b/sv.c @@ -3302,32 +3302,9 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) sv_force_normal(sv); } - if (PL_encoding) { - SV *uni; - STRLEN len; - char *s; - dSP; - ENTER; - SAVETMPS; - PUSHMARK(sp); - EXTEND(SP, 3); - XPUSHs(PL_encoding); - XPUSHs(sv); - XPUSHs(&PL_sv_yes); - PUTBACK; - call_method("decode", G_SCALAR); - SPAGAIN; - uni = POPs; - PUTBACK; - s = SvPVutf8(uni, len); - if (s != SvPVX(sv)) { - SvGROW(sv, len); - Move(s, SvPVX(sv), len, char); - SvCUR_set(sv, len); - } - FREETMPS; - LEAVE; - } else { /* Assume Latin-1/EBCDIC */ + if (PL_encoding) + Perl_sv_recode_to_utf8(aTHX_ sv); + else { /* Assume Latin-1/EBCDIC */ /* This function could be much more efficient if we * had a FLAG in SVs to signal if there are any hibit * chars in the PV. Given that there isn't such a flag @@ -3350,9 +3327,9 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) Safefree(s); /* No longer using what was there before. */ SvLEN(sv) = len; /* No longer know the real size. */ } + /* Mark as UTF-8 even if no hibit - saves scanning loop */ + SvUTF8_on(sv); } - /* Mark as UTF-8 even if no hibit - saves scanning loop */ - SvUTF8_on(sv); return SvCUR(sv); } @@ -10382,3 +10359,46 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, #endif /* USE_ITHREADS */ +/* +=for apidoc sv_recode_to_utf8 + +If PL_encoding is set you can call this to recode the pv of the sv. +The PL_encoding is assumed to be an Encode object, on entry the pv is assumed +to be octets in that encoding, and the sv will be converted into Unicode +(and UTF-8). + +If PL_encoding is not an Encode object, things will go boom. + +=cut +*/ + +void +Perl_sv_recode_to_utf8(pTHX_ SV *sv) +{ + SV *uni; + STRLEN len; + char *s; + dSP; + ENTER; + SAVETMPS; + PUSHMARK(sp); + EXTEND(SP, 3); + XPUSHs(PL_encoding); + XPUSHs(sv); + XPUSHs(&PL_sv_yes); + PUTBACK; + call_method("decode", G_SCALAR); + SPAGAIN; + uni = POPs; + PUTBACK; + s = SvPVutf8(uni, len); + if (s != SvPVX(sv)) { + SvGROW(sv, len); + Move(s, SvPVX(sv), len, char); + SvCUR_set(sv, len); + } + FREETMPS; + LEAVE; + SvUTF8_on(sv); +} + diff --git a/toke.c b/toke.c index e6d7abc..90f8305 100644 --- a/toke.c +++ b/toke.c @@ -1653,6 +1653,10 @@ S_scan_const(pTHX_ char *start) Perl_croak(aTHX_ "panic: constant overflowed allocated space"); SvPOK_on(sv); + if (PL_encoding && !has_utf8) { + Perl_sv_recode_to_utf8(aTHX_ sv); + has_utf8 = TRUE; + } if (has_utf8) { SvUTF8_on(sv); if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { @@ -7734,3 +7738,4 @@ utf16rev_textfilter(pTHX_ int idx, SV *sv, int maxlen) return count; } #endif +