- move Encode::utf8_encode to utf8::encode (likewise decode,upgrade,downgrade,valid)
- move the XS code for those to universal.c (so in miniperl)
- add utf8::unicode_to_native and its inverse to allow EBCDIC to work in true unicode.
- change ext/Encode/compile to use above.
- Fix t/lib/encode.t for above
- Teach t/lib/b.t to expect -uutf8
- In utf8.c look for SWASHNEW rather than just utf8:: package to see if
utf8.pm is needed.
p4raw-id: //depot/perlio@9198
sub encode_utf8
{
my ($str) = @_;
- utf8_encode($str);
+ utf8::encode($str);
return $str;
}
sub decode_utf8
{
my ($str) = @_;
- return undef unless utf8_decode($str);
+ return undef unless utf8::decode($str);
return $str;
}
use base 'Encode::Encoding';
# Dummy package that provides the encode interface but leaves data
-# as UTF-8 encoded. It is here so that from_to() works.
+# as UTF-X encoded. It is here so that from_to() works.
__PACKAGE__->Define('Unicode');
sub decode
{
my ($obj,$str,$chk) = @_;
- Encode::utf8_upgrade($str);
+ utf8::upgrade($str);
$_[1] = '' if $chk;
return $str;
}
=over 4
-=item *
-
- $num_octets = utf8_upgrade($string);
-
-Converts internal representation of string to the UTF-8 form.
-Returns the number of octets necessary to represent the string as UTF-8.
-
-=item * utf8_downgrade($string[, CHECK])
-
-Converts internal representation of string to be un-encoded bytes.
-
=item * is_utf8(STRING [, CHECK])
[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
return dst;
}
-MODULE = Encode PACKAGE = Encode PREFIX = sv_
-
-void
-valid_utf8(sv)
-SV * sv
-CODE:
- {
- STRLEN len;
- char *s = SvPV(sv,len);
- if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
- XSRETURN_YES;
- else
- XSRETURN_NO;
- }
-
-void
-sv_utf8_encode(sv)
-SV * sv
-
-bool
-sv_utf8_decode(sv)
-SV * sv
-
-STRLEN
-sv_utf8_upgrade(sv)
-SV * sv
-
-bool
-sv_utf8_downgrade(sv,failok=0)
-SV * sv
-bool failok
-
MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
PROTOTYPES: ENABLE
my @orig_ARGV = @ARGV;
my $perforce = '$Id$';
-
sub encode_U
{
# UTF-8 encode long hand - only covers part of perl's range
my $uv = shift;
- if ($uv < 0x80)
- {
- return chr($uv)
- }
- if ($uv < 0x800)
- {
- return chr(($uv >> 6) | 0xC0).
- chr(($uv & 0x3F) | 0x80);
- }
- return chr(($uv >> 12) | 0xE0).
- chr((($uv >> 6) & 0x3F) | 0x80).
- chr(($uv & 0x3F) | 0x80);
+ # chr() works in native space so convert value from table
+ # into that space before using chr().
+ my $ch = chr(utf8::unicode_to_native($uv));
+ # Now get core perl to encode that the way it likes.
+ utf8::encode($ch);
+ return $ch;
}
sub encode_S
text, this pragma should be used to recognize UTF-8 in the source.
When UTF-8 becomes the standard source format, this pragma will
effectively become a no-op. This pragma already is a no-op on
-EBCDIC platforms (where it is alright to code perl in EBCDIC
+EBCDIC platforms (where it is alright to code perl in EBCDIC
rather than UTF-8).
Enabling the C<utf8> pragma has the following effects:
=back
+=head2 Utility functions
+
+The following functions are defined in the C<utf8::> package by the perl core.
+
+=over 4
+
+=item * $num_octets = utf8::upgrade($string);
+
+Converts internal representation of string to the perls internal UTF-X form.
+Returns the number of octets necessary to represent the string as UTF-X.
+
+=item * utf8::downgrade($string[, CHECK])
+
+Converts internal representation of string to be un-encoded bytes.
+
+=item * utf8::encode($string)
+
+Converts (in-place) I<$string> from logical characters to octet sequence
+representing it in perl's UTF-X encoding.
+
+=item * $flag = utf8::decode($string)
+
+Attempts to converts I<$string> in-place from perl's UTF-X encoding into logical characters.
+
+=back
+
=head1 SEE ALSO
L<perlunicode>, L<bytes>
BEGIN {
chdir 't' if -d 't';
- if ($^O eq 'MacOS') {
- @INC = qw(: ::lib ::macos:lib);
- } else {
- @INC = '.';
- push @INC, '../lib';
+ if ($^O eq 'MacOS') {
+ @INC = qw(: ::lib ::macos:lib);
+ } else {
+ @INC = '.';
+ push @INC, '../lib';
}
}
$a =~ s/-uCwd,// if $^O eq 'cygwin';
if ($Config{static_ext} eq ' ') {
$b = '-uCarp,-uCarp::Heavy,-uDB,-uExporter,-uExporter::Heavy,-uattributes,'
- . '-umain,-ustrict,-uwarnings';
+ . '-umain,-ustrict,-uutf8,-uwarnings';
if (ord('A') == 193) { # EBCDIC sort order is qw(a A) not qw(A a)
$b = join ',', sort split /,/, $b;
}
$a = `$^X $path "-MO=Showlex" -e "my %one" $redir`;
if (ord('A') != 193) { # ASCIIish
print "# [$a]\nnot " unless $a =~ /sv_undef.*PVNV.*%one.*sv_undef.*HV/s;
- }
+ }
else { # EBCDICish C<1: PVNV (0x1a7ede34) "%\226\225\205">
print "# [$a]\nnot " unless $a =~ /sv_undef.*PVNV.*%\\[0-9].*sv_undef.*HV/s;
}
print "not " unless $foo =~ /{.*{.*234;.*}.*}/sm;
ok;
$foo = $deparse->coderef2text(sub { { 234; } continue { 123; } });
-print "not " unless $foo =~ /{.*{.*234;.*}.*continue.*{.*123.*}/sm;
+print "not " unless $foo =~ /{.*{.*234;.*}.*continue.*{.*123.*}/sm;
ok;
}
{
my $c = chr($i);
my $s = "$c\n".sprintf("%02X",$i);
- ok(Encode::valid_utf8($s),1,"concat of $i botched");
- Encode::utf8_upgrade($s);
- ok(Encode::valid_utf8($s),1,"concat of $i botched");
+ ok(utf8::valid($s),1,"concat of $i botched");
+ utf8::upgrade($s);
+ ok(utf8::valid($s),1,"concat of $i botched");
}
# Spot check a few points in/out of utf8
: FALSE ;
}
+#include "XSUB.h"
+
void XS_UNIVERSAL_isa(pTHXo_ CV *cv);
void XS_UNIVERSAL_can(pTHXo_ CV *cv);
void XS_UNIVERSAL_VERSION(pTHXo_ CV *cv);
+XS(XS_utf8_valid);
+XS(XS_utf8_encode);
+XS(XS_utf8_decode);
+XS(XS_utf8_upgrade);
+XS(XS_utf8_downgrade);
+XS(XS_utf8_unicode_to_native);
+XS(XS_utf8_native_to_unicode);
void
Perl_boot_core_UNIVERSAL(pTHX)
newXS("UNIVERSAL::isa", XS_UNIVERSAL_isa, file);
newXS("UNIVERSAL::can", XS_UNIVERSAL_can, file);
newXS("UNIVERSAL::VERSION", XS_UNIVERSAL_VERSION, file);
+ newXS("utf8::valid", XS_utf8_valid, file);
+ newXS("utf8::encode", XS_utf8_encode, file);
+ newXS("utf8::decode", XS_utf8_decode, file);
+ newXS("utf8::upgrade", XS_utf8_upgrade, file);
+ newXS("utf8::downgrade", XS_utf8_downgrade, file);
+ newXS("utf8::native_to_unicode", XS_utf8_native_to_unicode, file);
+ newXS("utf8::unicode_to_native", XS_utf8_unicode_to_native, file);
}
-#include "XSUB.h"
XS(XS_UNIVERSAL_isa)
{
XSRETURN(1);
}
+XS(XS_utf8_valid)
+{
+ dXSARGS;
+ if (items != 1)
+ Perl_croak(aTHX_ "Usage: utf8::valid(sv)");
+ {
+ SV * sv = ST(0);
+ {
+ STRLEN len;
+ char *s = SvPV(sv,len);
+ if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
+ XSRETURN_YES;
+ else
+ XSRETURN_NO;
+ }
+ }
+ XSRETURN_EMPTY;
+}
+
+XS(XS_utf8_encode)
+{
+ dXSARGS;
+ if (items != 1)
+ Perl_croak(aTHX_ "Usage: utf8::encode(sv)");
+ {
+ SV * sv = ST(0);
+
+ sv_utf8_encode(sv);
+ }
+ XSRETURN_EMPTY;
+}
+
+XS(XS_utf8_decode)
+{
+ dXSARGS;
+ if (items != 1)
+ Perl_croak(aTHX_ "Usage: utf8::decode(sv)");
+ {
+ SV * sv = ST(0);
+ bool RETVAL;
+
+ RETVAL = sv_utf8_decode(sv);
+ ST(0) = boolSV(RETVAL);
+ sv_2mortal(ST(0));
+ }
+ XSRETURN(1);
+}
+
+XS(XS_utf8_upgrade)
+{
+ dXSARGS;
+ if (items != 1)
+ Perl_croak(aTHX_ "Usage: utf8::upgrade(sv)");
+ {
+ SV * sv = ST(0);
+ STRLEN RETVAL;
+ dXSTARG;
+
+ RETVAL = sv_utf8_upgrade(sv);
+ XSprePUSH; PUSHi((IV)RETVAL);
+ }
+ XSRETURN(1);
+}
+
+XS(XS_utf8_downgrade)
+{
+ dXSARGS;
+ if (items < 1 || items > 2)
+ Perl_croak(aTHX_ "Usage: utf8::downgrade(sv, failok=0)");
+ {
+ SV * sv = ST(0);
+ bool failok;
+ bool RETVAL;
+
+ if (items < 2)
+ failok = 0;
+ else {
+ failok = (int)SvIV(ST(1));
+ }
+
+ RETVAL = sv_utf8_downgrade(sv, failok);
+ ST(0) = boolSV(RETVAL);
+ sv_2mortal(ST(0));
+ }
+ XSRETURN(1);
+}
+
+XS(XS_utf8_native_to_unicode)
+{
+ dXSARGS;
+ UV uv = SvUV(ST(0));
+ ST(0) = sv_2mortal(newSViv(NATIVE_TO_UNI(uv)));
+ XSRETURN(1);
+}
+
+XS(XS_utf8_unicode_to_native)
+{
+ dXSARGS;
+ UV uv = SvUV(ST(0));
+ ST(0) = sv_2mortal(newSViv(UNI_TO_NATIVE(uv)));
+ XSRETURN(1);
+}
+
+
SV* retval;
SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
dSP;
+ HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
- if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
+ if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ENTER;
Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
LEAVE;