ext/Encode/t/perlio.t test script
ext/Encode/t/rt.pl test script
ext/Encode/t/unibench.pl benchmark script
+ext/Encode/t/utf8strict.t test script
ext/Encode/t/Unicode.t test script
ext/Encode/TW/Makefile.PL Encode extension
ext/Encode/TW/TW.pm Encode extension
# $Id: Changes,v 2.9 2004/12/03 19:16:53 dankogai Exp dankogai $
#
$Revision: 2.9 $ $Date: 2004/12/03 19:16:53 $
+! Encode.pm
+ New Pod section: "UTF-8 vs utf8"; explains utf-8-strict
++ t/utf8strict.t
+ Tests utf-8-strict, accordingly to
+ UTF-8 decoder capability and stress test" by Markus Kuhn
+ http://smontagu.damowmow.com/utf8test.html
+ Note that malformed and overlong sequences are not test here
+ because perl already does that for you, utf-8-strict or not.
+! Encode.pm Encode/encode.h t/fallback.t
+ Addressed "encode(..., Encode::LEAVE_SRC) does not work".
+ Now FB_(PERLQQ|HTMLCREF|XMLCREF) implies LEAVE_SRC so
+ you can (en|de)code constant strings with these fallbacks.
+ http://rt.cpan.org/NoAuth/Bug.html?id=8736
+! Encode.pm Encode.xs lib/Encode/Alias.pm t/Aliases.t
+ Make Encode.pm support the real UTF-8, by GAAS
+ Message-Id: <lrfz2mcngd.fsf@caliper.activestate.com>
+ Message-Id: <lr4qizbvvm.fsf@caliper.activestate.com>
+! Encode.pm Encode.xs
+ post-2.09 comment patches from GAAS applied.
+ Message-Id: <lroehacz6q.fsf@caliper.activestate.com>
+ Message-Id: <lrk6rycymu.fsf@caliper.activestate.com>
+
+2.09 2004/12/03 19:16:53
! Encode.pm Encode.xs
Addressed " :encoding(utf8) broken in perl-5.8.6".
Message-Id: <lrllcfeank.fsf_-_@caliper.activestate.com>
#
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 2.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+# our $VERSION = do { my @r = (q$Revision: 2.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = '2.0902';
sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);
Carp::croak("Unknown encoding '$name'");
}
my $octets = $enc->encode($string,$check);
- $_[1] = $string if $check;
+ $_[1] = $string if $check and !($check & LEAVE_SRC());
return $octets;
}
Carp::croak("Unknown encoding '$name'");
}
my $string = $enc->decode($octets,$check);
- $_[1] = $octets if $check;
+ $_[1] = $octets if $check and !($check & LEAVE_SRC());
return $string;
}
};
$Encode::Encoding{utf8} =
bless {Name => "utf8"} => "Encode::utf8";
+ $Encode::Encoding{"utf-8-strict"} =
+ bless {Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8";
}
}
the result is always off, even when it contains completely valid utf8
string. See L</"The UTF-8 flag"> below.
-If the $string is C<undef> or a reference then C<undef> is returned.
+If the $string is C<undef> then C<undef> is returned.
=item $string = decode(ENCODING, $octets [, CHECK])
ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
below.
-If the $string is C<undef> or a reference then C<undef> is returned.
+If the $string is C<undef> then C<undef> is returned.
=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
+In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
+
=item The bitmask
These modes are actually set via a bitmask. Here is how the FB_XX
DIE_ON_ERR 0x0001 X
WARN_ON_ERR 0x0002 X
RETURN_ON_ERR 0x0004 X X
- LEAVE_SRC 0x0008
+ LEAVE_SRC 0x0008 X
PERLQQ 0x0100 X
HTMLCREF 0x0200
XMLCREF 0x0400
=back
+=head1 UTF-8 vs. utf8
+
+ ....We now view strings not as sequences of bytes, but as sequences
+ of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
+ computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
+
+That has been the perl's notion of UTF-8 but official UTF-8 is more
+strict; Its ranges is much narrower (0 .. 10FFFF), some sequences are
+not allowed (i.e. Those used in the surrogate pair, 0xFFFE, et al).
+
+Now that is overruled by Larry Wall himself.
+
+ From: Larry Wall <larry@wall.org>
+ Date: December 04, 2004 11:51:58 JST
+ To: perl-unicode@perl.org
+ Subject: Re: Make Encode.pm support the real UTF-8
+ Message-Id: <20041204025158.GA28754@wall.org>
+
+ On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
+ : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
+ : but "UTF-8" is the name of the standard and should give the
+ : corresponding behaviour.
+
+ For what it's worth, that's how I've always kept them straight in my
+ head.
+
+ Also for what it's worth, Perl 6 will mostly default to strict but
+ make it easy to switch back to lax.
+
+ Larry
+
+Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
+while B<utf8> means liberal, lax, version thereof. And Encode version
+2.10 or later thus groks the difference between C<UTF-8> and C"utf8".
+
+ encode("utf8", "\x{FFFF_FFFF}", 1); # okay
+ encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
+
+C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>.
+Yes, the hyphen between "UTF" and "8" is important. Without it Encode
+goes "liberal"
+
+ find_encoding("UTF-8")->name # is 'utf-8-strict'
+ find_encoding("utf-8")->name # ditto. names are case insensitive
+ find_encoding("utf8")->name # ditto. "_" are treated as "-"
+ find_encoding("UTF8")->name # is 'utf8'.
+
+
=head1 SEE ALSO
L<Encode::Encoding>,
UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
+#define UTF8_ALLOW_STRICT 0
+#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \
+ ~(UTF8_ALLOW_CONTINUATION | \
+ UTF8_ALLOW_NON_CONTINUATION | \
+ UTF8_ALLOW_LONG))
+
void
Encode_XSEncoding(pTHX_ encode_t * enc)
{
return dst;
}
+static bool
+strict_utf8(pTHX_ SV* sv)
+{
+ HV* hv;
+ SV** svp;
+ sv = SvRV(sv);
+ if (!sv || SvTYPE(sv) != SVt_PVHV)
+ return 0;
+ hv = (HV*)sv;
+ svp = hv_fetch(hv, "strict_utf8", 11, 0);
+ if (!svp)
+ return 0;
+ return SvTRUE(*svp);
+}
+
+static U8*
+process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check,
+ bool encode, bool strict, bool stop_at_partial)
+{
+ UV uv;
+ STRLEN ulen;
+
+ SvPOK_only(dst);
+ SvCUR_set(dst,0);
+
+ while (s < e) {
+ if (UTF8_IS_INVARIANT(*s)) {
+ sv_catpvn(dst, (char *)s, 1);
+ s++;
+ continue;
+ }
+
+ if (UTF8_IS_START(*s)) {
+ U8 skip = UTF8SKIP(s);
+ if ((s + skip) > e) {
+ /* Partial character */
+ /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */
+ if (stop_at_partial)
+ break;
+
+ goto malformed_byte;
+ }
+
+ uv = utf8n_to_uvuni(s, e - s, &ulen,
+ UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT :
+ UTF8_ALLOW_NONSTRICT)
+ );
+#if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */
+ if (strict && uv > PERL_UNICODE_MAX)
+ ulen = -1;
+#endif
+ if (ulen == -1) {
+ if (strict) {
+ uv = utf8n_to_uvuni(s, e - s, &ulen,
+ UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT);
+ if (ulen == -1)
+ goto malformed_byte;
+ goto malformed;
+ }
+ goto malformed_byte;
+ }
+
+
+ /* Whole char is good */
+ sv_catpvn(dst,(char *)s,skip);
+ s += skip;
+ continue;
+ }
+
+ /* If we get here there is something wrong with alleged UTF-8 */
+ malformed_byte:
+ uv = (UV)*s;
+ ulen = 1;
+
+ malformed:
+ if (check & ENCODE_DIE_ON_ERR){
+ if (encode)
+ Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8");
+ else
+ Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv);
+ }
+ if (check & ENCODE_WARN_ON_ERR){
+ if (encode)
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ ERR_ENCODE_NOMAP, uv, "utf8");
+ else
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ ERR_DECODE_NOMAP, "utf8", uv);
+ }
+ if (check & ENCODE_RETURN_ON_ERR) {
+ break;
+ }
+ if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
+ SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}"):
+ check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
+ "&#x%" UVxf ";", uv);
+ sv_catsv(dst, subchar);
+ SvREFCNT_dec(subchar);
+ } else {
+ sv_catpv(dst, FBCHAR_UTF8);
+ }
+ s += ulen;
+ }
+ *SvEND(dst) = '\0';
+
+ return s;
+}
+
+
MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_
PROTOTYPES: DISABLE
SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */
/*
- * PerlO check -- we assume the object is of PerlIO if renewed
- * and if so, we set RETURN_ON_ERR for partial character
+ * PerlIO check -- we assume the object is of PerlIO if renewed
*/
int renewed = 0;
dSP; ENTER; SAVETMPS;
FREETMPS; LEAVE;
/* end PerlIO check */
- SvPOK_only(dst);
- SvCUR_set(dst,0);
if (SvUTF8(src)) {
s = utf8_to_bytes(s,&slen);
if (s) {
croak("Cannot decode string with wide characters");
}
}
- while (s < e) {
- if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) {
- U8 skip = UTF8SKIP(s);
- if ((s + skip) > e) {
- /* Partial character - done */
- if (renewed)
- break;
- goto decode_utf8_fallback;
- }
- else if (is_utf8_char(s)) {
- /* Whole char is good */
- sv_catpvn(dst,(char *)s,skip);
- s += skip;
- continue;
- }
- else {
- /* starts ok but isn't "good" */
- }
- }
- else {
- /* Invalid start byte */
- }
- /* If we get here there is something wrong with alleged UTF-8 */
- decode_utf8_fallback:
- if (check & ENCODE_DIE_ON_ERR){
- Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", (UV)*s);
- XSRETURN(0);
- }
- if (check & ENCODE_WARN_ON_ERR){
- Perl_warner(aTHX_ packWARN(WARN_UTF8),
- ERR_DECODE_NOMAP, "utf8", (UV)*s);
- }
- if (check & ENCODE_RETURN_ON_ERR) {
- break;
- }
- if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
- SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? "\\x%02" UVXf :
- check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
- "&#x%" UVxf ";", (UV)*s);
- sv_catsv(dst, subchar);
- SvREFCNT_dec(subchar);
- } else {
- sv_catpv(dst, FBCHAR_UTF8);
- }
- s++;
- }
- *SvEND(dst) = '\0';
+
+ s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed);
/* Clear out translated part of source unless asked not to */
if (check && !(check & ENCODE_LEAVE_SRC)){
U8 *e = (U8 *) SvEND(src);
SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */
if (SvUTF8(src)) {
- /* Already encoded - trust it and just copy the octets */
- sv_setpvn(dst,(char *)s,(e-s));
- s = e;
+ /* Already encoded */
+ if (strict_utf8(aTHX_ obj)) {
+ s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0);
+ }
+ else {
+ /* trust it and just copy the octets */
+ sv_setpvn(dst,(char *)s,(e-s));
+ s = e;
+ }
}
else {
/* Native bytes - can always encode */
#define ENCODE_FB_CROAK 0x0001
#define ENCODE_FB_QUIET ENCODE_RETURN_ON_ERR
#define ENCODE_FB_WARN (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR)
-#define ENCODE_FB_PERLQQ ENCODE_PERLQQ
-#define ENCODE_FB_HTMLCREF ENCODE_HTMLCREF
-#define ENCODE_FB_XMLCREF ENCODE_XMLCREF
+#define ENCODE_FB_PERLQQ (ENCODE_PERLQQ|ENCODE_LEAVE_SRC)
+#define ENCODE_FB_HTMLCREF (ENCODE_HTMLCREF|ENCODE_LEAVE_SRC)
+#define ENCODE_FB_XMLCREF (ENCODE_XMLCREF|ENCODE_LEAVE_SRC)
#endif /* ENCODE_H */
t/perlio.t test script
t/rt.pl even more test script
t/unibench.pl benchmark script
+t/utf8strict.t test script
ucm/8859-1.ucm Unicode Character Map
ucm/8859-10.ucm Unicode Character Map
ucm/8859-11.ucm Unicode Character Map
# http://module-build.sourceforge.net/META-spec.html
#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#
name: Encode
-version: 2.09
+version: 2.0902
version_from: Encode.pm
installdirs: perl
requires:
# define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
# Standardize on the dashed versions.
- # define_alias( qr/\butf8$/i => '"utf-8"' );
define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
unless ($Encode::ON_EBCDIC){
define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
}
# utf8 is blessed :)
- define_alias( qr/^UTF-8$/i => '"utf8"',);
+ define_alias( qr/^UTF-8$/i => '"utf-8-strict"');
# At last, Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
}
%a2c = (
'US-ascii' => 'ascii',
'ISO-646-US' => 'ascii',
- 'UTF-8' => 'utf8',
+ 'UTF-8' => 'utf-8-strict',
'UCS-2' => 'UCS-2BE',
'UCS2' => 'UCS-2BE',
'iso-10646-1' => 'UCS-2BE',
$src = $uo;
$dst = $ascii->encode($src, FB_PERLQQ);
is($dst, $ap, "FB_PERLQQ ascii");
-is($src, '', "FB_PERLQQ residue ascii");
+is($src, $uo, "FB_PERLQQ residue ascii");
$src = $ao;
$dst = $utf8->decode($src, FB_PERLQQ);
is($dst, $up, "FB_PERLQQ utf8");
-is($src, '', "FB_PERLQQ residue utf8");
+is($src, $ao, "FB_PERLQQ residue utf8");
$src = $uo;
$dst = $ascii->encode($src, FB_HTMLCREF);
is($dst, $ah, "FB_HTMLCREF ascii");
-is($src, '', "FB_HTMLCREF residue ascii");
+is($src, $uo, "FB_HTMLCREF residue ascii");
$src = $ao;
$dst = $utf8->decode($src, FB_HTMLCREF);
is($dst, $uh, "FB_HTMLCREF utf8");
-is($src, '', "FB_HTMLCREF residue utf8");
+is($src, $ao, "FB_HTMLCREF residue utf8");
$src = $uo;
$dst = $ascii->encode($src, FB_XMLCREF);
is($dst, $ax, "FB_XMLCREF ascii");
-is($src, '', "FB_XMLCREF residue ascii");
+is($src, $uo, "FB_XMLCREF residue ascii");
$src = $ao;
$dst = $utf8->decode($src, FB_XMLCREF);
is($dst, $ax, "FB_XMLCREF utf8");
-is($src, '', "FB_XMLCREF residue utf8");
+is($src, $ao, "FB_XMLCREF residue utf8");
--- /dev/null
+#!../perl
+our $DEBUG = @ARGV;
+our (%ORD, %SEQ, $NTESTS);
+BEGIN {
+ if ($ENV{'PERL_CORE'}){
+ chdir 't';
+ unshift @INC, '../lib';
+ }
+ require Config; import Config;
+ if ($Config{'extensions'} !~ /\bEncode\b/) {
+ print "1..0 # Skip: Encode was not built\n";
+ exit 0;
+ }
+ if ($] <= 5.008 and !$Config{perl_patchlevel}){
+ print "1..0 # Skip: Perl 5.8.1 or later required\n";
+ exit 0;
+ }
+ # http://smontagu.damowmow.com/utf8test.html
+ %ORD = (
+ 0x00000080 => 0, # 2.1.2
+ 0x00000800 => 0, # 2.1.3
+ 0x00010000 => 0, # 2.1.4
+ 0x00200000 => 1, # 2.1.5
+ 0x00400000 => 1, # 2.1.6
+ 0x0000007F => 0, # 2.2.1 -- unmapped okay
+ 0x000007FF => 0, # 2.2.2
+ 0x0000FFFF => 1, # 2.2.3
+ 0x001FFFFF => 1, # 2.2.4
+ 0x03FFFFFF => 1, # 2.2.5
+ 0x7FFFFFFF => 1, # 2.2.6
+ 0x0000D800 => 1, # 5.1.1
+ 0x0000DB7F => 1, # 5.1.2
+ 0x0000D880 => 1, # 5.1.3
+ 0x0000DBFF => 1, # 5.1.4
+ 0x0000DC00 => 1, # 5.1.5
+ 0x0000DF80 => 1, # 5.1.6
+ 0x0000DFFF => 1, # 5.1.7
+ # 5.2 "Paird UTF-16 surrogates skipped
+ # because utf-8-strict raises exception at the first one
+ 0x0000FFFF => 1, # 5.3.1
+ );
+ $NTESTS += scalar keys %ORD;
+ %SEQ = (
+ qq/ed 9f bf/ => 0, # 2.3.1
+ qq/ee 80 80/ => 0, # 2.3.2
+ qq/f4 8f bf bf/ => 0, # 2.3.3
+ qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG
+ # "3 Malformed sequences" are checked by perl.
+ # "4 Overlong sequences" are checked by perl.
+ );
+ $NTESTS += scalar keys %SEQ;
+}
+use strict;
+use Encode;
+use utf8;
+use Test::More tests => $NTESTS;
+
+local($SIG{__WARN__}) = sub { $DEBUG and $@ and print STDERR $@ };
+
+my $d = find_encoding("utf-8-strict");
+for my $u (sort keys %ORD){
+ my $c = chr($u);
+ eval { $d->encode($c,1) };
+ $DEBUG and $@ and warn $@;
+ my $t = $@ ? 1 : 0;
+ is($t, $ORD{$u}, sprintf "U+%04X", $u);
+}
+for my $s (sort keys %SEQ){
+ my $o = pack "C*" => map {hex} split /\s+/, $s;
+ eval { $d->decode($o,1) };
+ $DEBUG and $@ and warn $@;
+ my $t = $@ ? 1 : 0;
+ is($t, $SEQ{$s}, $s);
+}
+
+__END__
+
+