From: Steve Peters Date: Tue, 27 Sep 2005 02:45:50 +0000 (+0000) Subject: Upgrade to Encode 2.12 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=8e180e821340074db62baf476e6c5d624c9aee27;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 2.12 p4raw-id: //depot/perl@25609 --- diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS index a42e4d6..edb016c 100644 --- a/ext/Encode/AUTHORS +++ b/ext/Encode/AUTHORS @@ -44,6 +44,7 @@ Paul Marquess Peter Prymmer Philip Newton Piotr Fusik +Rafael Garcia-Suarez Robin Barker SADAHIRO Tomoyuki SUGAWARA Hajime diff --git a/ext/Encode/Changes b/ext/Encode/Changes index dd9a6b5..acdead6 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,8 +1,21 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 2.11 2005/08/05 10:58:25 dankogai Exp dankogai $ +# $Id: Changes,v 2.12 2005/09/08 14:17:17 dankogai Exp dankogai $ # -$Revision: 2.11 $ $Date: 2005/08/05 10:58:25 $ +$Revision: 2.12 $ $Date: 2005/09/08 14:17:17 $ +! Encode.xs Encode.pm t/fallback.t + Now accepts coderef for CHECK! +! ucm/8859-7.ucm + Updated to newer version at unicode.org + http://rt.cpan.org/NoAuth/Bug.html?id=14222 +! lib/Encode/Supported.pod + More POD typo fixed. + <42F5E243.80500@gmail.com> +! encoding.pm + More POD typo leftover fixed. + Message-Id: + +2.11 2005/08/05 10:58:25 ! AUTHORS CHANGES To reflect changes below ! Encode.pm encoding.pm diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 9b45b7b..ac0123c 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,9 +1,9 @@ # -# $Id: Encode.pm,v 2.11 2005/08/05 10:58:25 dankogai Exp dankogai $ +# $Id: Encode.pm,v 2.12 2005/09/08 14:17:17 dankogai Exp dankogai $ # package Encode; use strict; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.11 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g; sub DEBUG () { 0 } use XSLoader (); XSLoader::load(__PACKAGE__, $VERSION); @@ -557,8 +557,11 @@ L and L. =head1 Handling Malformed Data -The optional I argument is used as follows. When you omit it, -Encode::FB_DEFAULT ( == 0 ) is assumed. +The optional I argument tells Encode what to do when it +encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 ) +is assumed. + +As of version 2.12 Encode supports coderef values for CHECK. See below. =over 2 @@ -648,12 +651,16 @@ constants via C. =back -=head2 Unimplemented fallback schemes +=head2 coderef for CHECK + +As of Encode 2.12 CHECK can also be a code reference which takes the +ord value of unmapped caharacter as an argument and returns a string +that represents the fallback character. For instance, -In the future, you will be able to use a code reference to a callback -function for the value of I but its API is still undecided. + $ascii = encode("ascii", $utf8, sub{ sprintf "", shift }); -The fallback scheme does not work on EBCDIC platforms. +Acts like FB_PERLQQ but EU+IE is used instead of +\x{I}. =head1 Defining Encodings @@ -799,7 +806,7 @@ Now that is overruled by Larry Wall himself. For what it's worth, that's how I've always kept them straight in my head. - + Also for what it's worth, Perl 6 will mostly default to strict but make it easy to switch back to lax. diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index cc5fe3b..8e225cd 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 2.5 2005/08/05 10:58:25 dankogai Exp dankogai $ + $Id: Encode.xs,v 2.6 2005/09/08 14:17:17 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -35,6 +35,8 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) UTF8_ALLOW_NON_CONTINUATION | \ UTF8_ALLOW_LONG)) +static SV* fallback_cb = (SV*)NULL ; + void Encode_XSEncoding(pTHX_ encode_t * enc) { @@ -64,6 +66,29 @@ call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" static SV * +do_fallback_cb(pTHX_ UV ch) +{ + dSP; + int argc; + SV* retval; + ENTER; + SAVETMPS; + PUSHMARK(sp); + XPUSHs(sv_2mortal(newSVnv((UV)ch))); + PUTBACK; + argc = call_sv(fallback_cb, G_SCALAR); + SPAGAIN; + if (argc != 1){ + croak("fallback sub must return scalar!"); + } + retval = newSVsv(POPs); + PUTBACK; + FREETMPS; + LEAVE; + return retval; +} + +static SV * encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, int check, STRLEN * offset, SV * term, int * retcode) { @@ -167,6 +192,7 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ SV* subchar = + (fallback_cb != (SV*)NULL) ? do_fallback_cb(aTHX_ ch) : newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : "&#x%" UVxf ";", (UV)ch); @@ -199,7 +225,10 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ - SV* subchar = newSVpvf("\\x%02" UVXf, (UV)s[slen]); + SV* subchar = + (fallback_cb != (SV*)NULL) ? + do_fallback_cb(aTHX_ (UV)s[slen]) : + newSVpvf("\\x%02" UVXf, (UV)s[slen]); sdone += slen + 1; ddone += dlen + SvCUR(subchar); sv_catsv(dst, subchar); @@ -536,31 +565,57 @@ CODE: } void -Method_decode(obj,src,check = 0) +Method_decode(obj,src,check_sv = &PL_sv_no) SV * obj SV * src -int check +SV * check_sv CODE: { + int check; encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); if (SvUTF8(src)) { sv_utf8_downgrade(src, FALSE); } + if (SvROK(check_sv)){ + if (fallback_cb == (SV*)NULL){ + fallback_cb = newSVsv(check_sv); /* First time */ + }else{ + SvSetSV(fallback_cb, check_sv); /* Been here before */ + } + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + }else{ + fallback_cb = (SV*)NULL; + check = SvIV(check_sv); + } ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, NULL, Nullsv, NULL); SvUTF8_on(ST(0)); XSRETURN(1); } + + void -Method_encode(obj,src,check = 0) +Method_encode(obj,src,check_sv = &PL_sv_no) SV * obj SV * src -int check +SV * check_sv CODE: { + int check; encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); sv_utf8_upgrade(src); + if (SvROK(check_sv)){ + if (fallback_cb == (SV*)NULL){ + fallback_cb = newSVsv(check_sv); /* First time */ + }else{ + SvSetSV(fallback_cb, check_sv); /* Been here before */ + } + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + }else{ + fallback_cb = (SV*)NULL; + check = SvIV(check_sv); + } ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, NULL, Nullsv, NULL); XSRETURN(1); diff --git a/ext/Encode/META.yml b/ext/Encode/META.yml index e17c3de..9373f60 100644 --- a/ext/Encode/META.yml +++ b/ext/Encode/META.yml @@ -1,7 +1,7 @@ # http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: Encode -version: 2.11 +version: 2.12 version_from: Encode.pm installdirs: perl requires: diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index df87582..d0b083a 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,6 +1,6 @@ -# $Id: encoding.pm,v 2.1 2004/10/19 04:55:01 dankogai Exp $ +# $Id: encoding.pm,v 2.2 2005/09/08 14:17:17 dankogai Exp dankogai $ package encoding; -our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use strict; diff --git a/ext/Encode/t/fallback.t b/ext/Encode/t/fallback.t index 4a04f54..76e1e8c 100644 --- a/ext/Encode/t/fallback.t +++ b/ext/Encode/t/fallback.t @@ -17,17 +17,17 @@ BEGIN { use strict; #use Test::More qw(no_plan); -use Test::More tests => 40; +use Test::More tests => 44; use Encode q(:all); my $uo = ''; my $nf = ''; -my ($af, $aq, $ap, $ah, $ax, $uf, $uq, $up, $uh, $ux); +my ($af, $aq, $ap, $ah, $ax, $uf, $uq, $up, $uh, $ux, $ac, $uc); for my $i (0x20..0x7e){ $uo .= chr($i); } -$af = $aq = $ap = $ah = $ax = -$uf = $uq = $up = $uh = $ux = +$af = $aq = $ap = $ah = $ax = $ac = +$uf = $uq = $up = $uh = $ux = $uc = $nf = $uo; my $residue = ''; @@ -39,9 +39,11 @@ for my $i (0x80..0xff){ $ap .= sprintf("\\x{%04x}", $i); $up .= sprintf("\\x%02X", $i); $ah .= sprintf("&#%d;", $i); - $uh .= sprintf("&#%d;", $i); + $uh .= sprintf("\\x%02X", $i); $ax .= sprintf("&#x%x;", $i); - $ux .= sprintf("&#x%x;", $i); + $ux .= sprintf("\\x%02X", $i); + $ac .= sprintf("", $i); + $uc .= sprintf("[%02X]", $i); } my $ao = $uo; @@ -124,30 +126,40 @@ is($src, $residue, "FB_QUIET residue utf8"); $src = $uo; $dst = $ascii->encode($src, FB_PERLQQ); -is($dst, $ap, "FB_PERLQQ ascii"); -is($src, $uo, "FB_PERLQQ residue ascii"); +is($dst, $ap, "FB_PERLQQ encode"); +is($src, $uo, "FB_PERLQQ residue encode"); $src = $ao; -$dst = $utf8->decode($src, FB_PERLQQ); -is($dst, $up, "FB_PERLQQ utf8"); -is($src, $ao, "FB_PERLQQ residue utf8"); +$dst = $ascii->decode($src, FB_PERLQQ); +is($dst, $up, "FB_PERLQQ decode"); +is($src, $ao, "FB_PERLQQ residue decode"); $src = $uo; $dst = $ascii->encode($src, FB_HTMLCREF); -is($dst, $ah, "FB_HTMLCREF ascii"); -is($src, $uo, "FB_HTMLCREF residue ascii"); +is($dst, $ah, "FB_HTMLCREF encode"); +is($src, $uo, "FB_HTMLCREF residue encode"); $src = $ao; -$dst = $utf8->decode($src, FB_HTMLCREF); -is($dst, $uh, "FB_HTMLCREF utf8"); -is($src, $ao, "FB_HTMLCREF residue utf8"); +$dst = $ascii->decode($src, FB_HTMLCREF); +is($dst, $uh, "FB_HTMLCREF decode"); +is($src, $ao, "FB_HTMLCREF residue decode"); $src = $uo; $dst = $ascii->encode($src, FB_XMLCREF); -is($dst, $ax, "FB_XMLCREF ascii"); -is($src, $uo, "FB_XMLCREF residue ascii"); +is($dst, $ax, "FB_XMLCREF encode"); +is($src, $uo, "FB_XMLCREF residue encode"); $src = $ao; -$dst = $utf8->decode($src, FB_XMLCREF); -is($dst, $ax, "FB_XMLCREF utf8"); -is($src, $ao, "FB_XMLCREF residue utf8"); +$dst = $ascii->decode($src, FB_XMLCREF); +is($dst, $ux, "FB_XMLCREF decode"); +is($src, $ao, "FB_XMLCREF residue decode"); + +$src = $uo; +$dst = $ascii->encode($src, sub{ sprintf "", shift }); +is($dst, $ac, "coderef encode"); +is($src, $uo, "coderef residue encode"); + +$src = $ao; +$dst = $ascii->decode($src, sub{ sprintf "[%02X]", shift }); +is($dst, $uc, "coderef decode"); +is($src, $ao, "coderef residue decode"); diff --git a/ext/Encode/ucm/8859-7.ucm b/ext/Encode/ucm/8859-7.ucm index 6ca7c86..69eab84 100644 --- a/ext/Encode/ucm/8859-7.ucm +++ b/ext/Encode/ucm/8859-7.ucm @@ -1,5 +1,5 @@ # -# $Id: 8859-7.ucm,v 2.0 2004/05/16 20:55:19 dankogai Exp $ +# $Id: 8859-7.ucm,v 2.1 2005/09/08 14:17:17 dankogai Exp dankogai $ # # Original table can be obtained at # http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT @@ -185,6 +185,7 @@ CHARMAP \xB7 |0 # MIDDLE DOT \xBB |0 # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK \xBD |0 # VULGAR FRACTION ONE HALF + \xAA |0 # GREEK YPOGEGRAMMENI \xB4 |0 # GREEK TONOS \xB5 |0 # GREEK DIALYTIKA TONOS \xB6 |0 # GREEK CAPITAL LETTER ALPHA WITH TONOS @@ -259,4 +260,6 @@ CHARMAP \xAF |0 # HORIZONTAL BAR \xA1 |0 # LEFT SINGLE QUOTATION MARK \xA2 |0 # RIGHT SINGLE QUOTATION MARK + \xA4 |0 # EURO SIGN + \xA5 |0 # DRACHMA SIGN END CHARMAP