From: Nick Ing-Simmons Date: Sun, 18 Mar 2001 14:18:12 +0000 (+0000) Subject: UTF-X encoding invariance for Encode: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1b026014ba0f5424fabe070eda050db5e7df518a;p=p5sagit%2Fp5-mst-13.2.git UTF-X encoding invariance for Encode: - move Encode::utf8_encode to utf8::encode (likewise decode,upgrade,downgrade,valid) - move the XS code for those to universal.c (so in miniperl) - add utf8::unicode_to_native and its inverse to allow EBCDIC to work in true unicode. - change ext/Encode/compile to use above. - Fix t/lib/encode.t for above - Teach t/lib/b.t to expect -uutf8 - In utf8.c look for SWASHNEW rather than just utf8:: package to see if utf8.pm is needed. p4raw-id: //depot/perlio@9198 --- diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index b5ba929..fd85520 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -188,14 +188,14 @@ sub from_to sub encode_utf8 { my ($str) = @_; - utf8_encode($str); + utf8::encode($str); return $str; } sub decode_utf8 { my ($str) = @_; - return undef unless utf8_decode($str); + return undef unless utf8::decode($str); return $str; } @@ -226,14 +226,14 @@ package Encode::Unicode; use base 'Encode::Encoding'; # Dummy package that provides the encode interface but leaves data -# as UTF-8 encoded. It is here so that from_to() works. +# as UTF-X encoded. It is here so that from_to() works. __PACKAGE__->Define('Unicode'); sub decode { my ($obj,$str,$chk) = @_; - Encode::utf8_upgrade($str); + utf8::upgrade($str); $_[1] = '' if $chk; return $str; } @@ -717,17 +717,6 @@ As such they are efficient, but may change. =over 4 -=item * - - $num_octets = utf8_upgrade($string); - -Converts internal representation of string to the UTF-8 form. -Returns the number of octets necessary to represent the string as UTF-8. - -=item * utf8_downgrade($string[, CHECK]) - -Converts internal representation of string to be un-encoded bytes. - =item * is_utf8(STRING [, CHECK]) [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index cdb1965..4d62501 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -433,38 +433,6 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check) return dst; } -MODULE = Encode PACKAGE = Encode PREFIX = sv_ - -void -valid_utf8(sv) -SV * sv -CODE: - { - STRLEN len; - char *s = SvPV(sv,len); - if (!SvUTF8(sv) || is_utf8_string((U8*)s,len)) - XSRETURN_YES; - else - XSRETURN_NO; - } - -void -sv_utf8_encode(sv) -SV * sv - -bool -sv_utf8_decode(sv) -SV * sv - -STRLEN -sv_utf8_upgrade(sv) -SV * sv - -bool -sv_utf8_downgrade(sv,failok=0) -SV * sv -bool failok - MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ PROTOTYPES: ENABLE diff --git a/ext/Encode/compile b/ext/Encode/compile index 8201043..d0611f7 100755 --- a/ext/Encode/compile +++ b/ext/Encode/compile @@ -8,23 +8,16 @@ use Getopt::Std; my @orig_ARGV = @ARGV; my $perforce = '$Id$'; - sub encode_U { # UTF-8 encode long hand - only covers part of perl's range my $uv = shift; - if ($uv < 0x80) - { - return chr($uv) - } - if ($uv < 0x800) - { - return chr(($uv >> 6) | 0xC0). - chr(($uv & 0x3F) | 0x80); - } - return chr(($uv >> 12) | 0xE0). - chr((($uv >> 6) & 0x3F) | 0x80). - chr(($uv & 0x3F) | 0x80); + # chr() works in native space so convert value from table + # into that space before using chr(). + my $ch = chr(utf8::unicode_to_native($uv)); + # Now get core perl to encode that the way it likes. + utf8::encode($ch); + return $ch; } sub encode_S diff --git a/lib/utf8.pm b/lib/utf8.pm index f9055b5..7c9a7df 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -52,7 +52,7 @@ source text. Until UTF-8 becomes the default format for source text, this pragma should be used to recognize UTF-8 in the source. When UTF-8 becomes the standard source format, this pragma will effectively become a no-op. This pragma already is a no-op on -EBCDIC platforms (where it is alright to code perl in EBCDIC +EBCDIC platforms (where it is alright to code perl in EBCDIC rather than UTF-8). Enabling the C pragma has the following effects: @@ -81,6 +81,32 @@ of byte semantics. =back +=head2 Utility functions + +The following functions are defined in the C package by the perl core. + +=over 4 + +=item * $num_octets = utf8::upgrade($string); + +Converts internal representation of string to the perls internal UTF-X form. +Returns the number of octets necessary to represent the string as UTF-X. + +=item * utf8::downgrade($string[, CHECK]) + +Converts internal representation of string to be un-encoded bytes. + +=item * utf8::encode($string) + +Converts (in-place) I<$string> from logical characters to octet sequence +representing it in perl's UTF-X encoding. + +=item * $flag = utf8::decode($string) + +Attempts to converts I<$string> in-place from perl's UTF-X encoding into logical characters. + +=back + =head1 SEE ALSO L, L diff --git a/t/lib/b.t b/t/lib/b.t index 397fdba..019a1e8 100755 --- a/t/lib/b.t +++ b/t/lib/b.t @@ -2,11 +2,11 @@ BEGIN { chdir 't' if -d 't'; - if ($^O eq 'MacOS') { - @INC = qw(: ::lib ::macos:lib); - } else { - @INC = '.'; - push @INC, '../lib'; + if ($^O eq 'MacOS') { + @INC = qw(: ::lib ::macos:lib); + } else { + @INC = '.'; + push @INC, '../lib'; } } @@ -141,7 +141,7 @@ $a =~ s/-u(Cwd|File|File::Copy|OS2),//g if $^O eq 'os2'; $a =~ s/-uCwd,// if $^O eq 'cygwin'; if ($Config{static_ext} eq ' ') { $b = '-uCarp,-uCarp::Heavy,-uDB,-uExporter,-uExporter::Heavy,-uattributes,' - . '-umain,-ustrict,-uwarnings'; + . '-umain,-ustrict,-uutf8,-uwarnings'; if (ord('A') == 193) { # EBCDIC sort order is qw(a A) not qw(A a) $b = join ',', sort split /,/, $b; } @@ -157,7 +157,7 @@ if ($is_thread) { $a = `$^X $path "-MO=Showlex" -e "my %one" $redir`; if (ord('A') != 193) { # ASCIIish print "# [$a]\nnot " unless $a =~ /sv_undef.*PVNV.*%one.*sv_undef.*HV/s; - } + } else { # EBCDICish C<1: PVNV (0x1a7ede34) "%\226\225\205"> print "# [$a]\nnot " unless $a =~ /sv_undef.*PVNV.*%\\[0-9].*sv_undef.*HV/s; } @@ -171,6 +171,6 @@ my $foo = $deparse->coderef2text(sub { { 234; }}); print "not " unless $foo =~ /{.*{.*234;.*}.*}/sm; ok; $foo = $deparse->coderef2text(sub { { 234; } continue { 123; } }); -print "not " unless $foo =~ /{.*{.*234;.*}.*continue.*{.*123.*}/sm; +print "not " unless $foo =~ /{.*{.*234;.*}.*continue.*{.*123.*}/sm; ok; } diff --git a/t/lib/encode.t b/t/lib/encode.t index af1f34b..d4a13ee 100644 --- a/t/lib/encode.t +++ b/t/lib/encode.t @@ -104,9 +104,9 @@ for my $i (256,128,129,256) { my $c = chr($i); my $s = "$c\n".sprintf("%02X",$i); - ok(Encode::valid_utf8($s),1,"concat of $i botched"); - Encode::utf8_upgrade($s); - ok(Encode::valid_utf8($s),1,"concat of $i botched"); + ok(utf8::valid($s),1,"concat of $i botched"); + utf8::upgrade($s); + ok(utf8::valid($s),1,"concat of $i botched"); } # Spot check a few points in/out of utf8 diff --git a/universal.c b/universal.c index 12d31e5..3e14a68 100644 --- a/universal.c +++ b/universal.c @@ -130,9 +130,18 @@ Perl_sv_derived_from(pTHX_ SV *sv, const char *name) : FALSE ; } +#include "XSUB.h" + void XS_UNIVERSAL_isa(pTHXo_ CV *cv); void XS_UNIVERSAL_can(pTHXo_ CV *cv); void XS_UNIVERSAL_VERSION(pTHXo_ CV *cv); +XS(XS_utf8_valid); +XS(XS_utf8_encode); +XS(XS_utf8_decode); +XS(XS_utf8_upgrade); +XS(XS_utf8_downgrade); +XS(XS_utf8_unicode_to_native); +XS(XS_utf8_native_to_unicode); void Perl_boot_core_UNIVERSAL(pTHX) @@ -142,9 +151,15 @@ Perl_boot_core_UNIVERSAL(pTHX) newXS("UNIVERSAL::isa", XS_UNIVERSAL_isa, file); newXS("UNIVERSAL::can", XS_UNIVERSAL_can, file); newXS("UNIVERSAL::VERSION", XS_UNIVERSAL_VERSION, file); + newXS("utf8::valid", XS_utf8_valid, file); + newXS("utf8::encode", XS_utf8_encode, file); + newXS("utf8::decode", XS_utf8_decode, file); + newXS("utf8::upgrade", XS_utf8_upgrade, file); + newXS("utf8::downgrade", XS_utf8_downgrade, file); + newXS("utf8::native_to_unicode", XS_utf8_native_to_unicode, file); + newXS("utf8::unicode_to_native", XS_utf8_unicode_to_native, file); } -#include "XSUB.h" XS(XS_UNIVERSAL_isa) { @@ -299,3 +314,107 @@ finish: XSRETURN(1); } +XS(XS_utf8_valid) +{ + dXSARGS; + if (items != 1) + Perl_croak(aTHX_ "Usage: utf8::valid(sv)"); + { + SV * sv = ST(0); + { + STRLEN len; + char *s = SvPV(sv,len); + if (!SvUTF8(sv) || is_utf8_string((U8*)s,len)) + XSRETURN_YES; + else + XSRETURN_NO; + } + } + XSRETURN_EMPTY; +} + +XS(XS_utf8_encode) +{ + dXSARGS; + if (items != 1) + Perl_croak(aTHX_ "Usage: utf8::encode(sv)"); + { + SV * sv = ST(0); + + sv_utf8_encode(sv); + } + XSRETURN_EMPTY; +} + +XS(XS_utf8_decode) +{ + dXSARGS; + if (items != 1) + Perl_croak(aTHX_ "Usage: utf8::decode(sv)"); + { + SV * sv = ST(0); + bool RETVAL; + + RETVAL = sv_utf8_decode(sv); + ST(0) = boolSV(RETVAL); + sv_2mortal(ST(0)); + } + XSRETURN(1); +} + +XS(XS_utf8_upgrade) +{ + dXSARGS; + if (items != 1) + Perl_croak(aTHX_ "Usage: utf8::upgrade(sv)"); + { + SV * sv = ST(0); + STRLEN RETVAL; + dXSTARG; + + RETVAL = sv_utf8_upgrade(sv); + XSprePUSH; PUSHi((IV)RETVAL); + } + XSRETURN(1); +} + +XS(XS_utf8_downgrade) +{ + dXSARGS; + if (items < 1 || items > 2) + Perl_croak(aTHX_ "Usage: utf8::downgrade(sv, failok=0)"); + { + SV * sv = ST(0); + bool failok; + bool RETVAL; + + if (items < 2) + failok = 0; + else { + failok = (int)SvIV(ST(1)); + } + + RETVAL = sv_utf8_downgrade(sv, failok); + ST(0) = boolSV(RETVAL); + sv_2mortal(ST(0)); + } + XSRETURN(1); +} + +XS(XS_utf8_native_to_unicode) +{ + dXSARGS; + UV uv = SvUV(ST(0)); + ST(0) = sv_2mortal(newSViv(NATIVE_TO_UNI(uv))); + XSRETURN(1); +} + +XS(XS_utf8_unicode_to_native) +{ + dXSARGS; + UV uv = SvUV(ST(0)); + ST(0) = sv_2mortal(newSViv(UNI_TO_NATIVE(uv))); + XSRETURN(1); +} + + diff --git a/utf8.c b/utf8.c index 7ca3cc7..81fb44d 100644 --- a/utf8.c +++ b/utf8.c @@ -1282,8 +1282,9 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none) SV* retval; SV* tokenbufsv = sv_2mortal(NEWSV(0,0)); dSP; + HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE); - if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */ + if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */ ENTER; Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv); LEAVE;