Peter Prymmer <pvhp@best.com>
Philip Newton <pne@cpan.org>
Piotr Fusik <pfusik@op.pl>
+Rafael Garcia-Suarez <rgarciasuarez@mandriva.com>
Robin Barker <rmb1@cise.npl.co.uk>
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
SUGAWARA Hajime <sugawara@hdt.co.jp>
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 2.11 2005/08/05 10:58:25 dankogai Exp dankogai $
+# $Id: Changes,v 2.12 2005/09/08 14:17:17 dankogai Exp dankogai $
#
-$Revision: 2.11 $ $Date: 2005/08/05 10:58:25 $
+$Revision: 2.12 $ $Date: 2005/09/08 14:17:17 $
+! Encode.xs Encode.pm t/fallback.t
+ Now accepts coderef for CHECK!
+! ucm/8859-7.ucm
+ Updated to newer version at unicode.org
+ http://rt.cpan.org/NoAuth/Bug.html?id=14222
+! lib/Encode/Supported.pod
+ More POD typo fixed.
+ <42F5E243.80500@gmail.com>
+! encoding.pm
+ More POD typo leftover fixed.
+ Message-Id: <b77c1dce05080615487f95314@mail.gmail.com>
+
+2.11 2005/08/05 10:58:25
! AUTHORS CHANGES
To reflect changes below
! Encode.pm encoding.pm
#
-# $Id: Encode.pm,v 2.11 2005/08/05 10:58:25 dankogai Exp dankogai $
+# $Id: Encode.pm,v 2.12 2005/09/08 14:17:17 dankogai Exp dankogai $
#
package Encode;
use strict;
-our $VERSION = sprintf "%d.%02d", q$Revision: 2.11 $ =~ /(\d+)/g;
+our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g;
sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);
=head1 Handling Malformed Data
-The optional I<CHECK> argument is used as follows. When you omit it,
-Encode::FB_DEFAULT ( == 0 ) is assumed.
+The optional I<CHECK> argument tells Encode what to do when it
+encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 )
+is assumed.
+
+As of version 2.12 Encode supports coderef values for CHECK. See below.
=over 2
=back
-=head2 Unimplemented fallback schemes
+=head2 coderef for CHECK
+
+As of Encode 2.12 CHECK can also be a code reference which takes the
+ord value of unmapped caharacter as an argument and returns a string
+that represents the fallback character. For instance,
-In the future, you will be able to use a code reference to a callback
-function for the value of I<CHECK> but its API is still undecided.
+ $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
-The fallback scheme does not work on EBCDIC platforms.
+Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
+\x{I<XXXX>}.
=head1 Defining Encodings
For what it's worth, that's how I've always kept them straight in my
head.
-
+
Also for what it's worth, Perl 6 will mostly default to strict but
make it easy to switch back to lax.
/*
- $Id: Encode.xs,v 2.5 2005/08/05 10:58:25 dankogai Exp dankogai $
+ $Id: Encode.xs,v 2.6 2005/09/08 14:17:17 dankogai Exp dankogai $
*/
#define PERL_NO_GET_CONTEXT
UTF8_ALLOW_NON_CONTINUATION | \
UTF8_ALLOW_LONG))
+static SV* fallback_cb = (SV*)NULL ;
+
void
Encode_XSEncoding(pTHX_ encode_t * enc)
{
#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
static SV *
+do_fallback_cb(pTHX_ UV ch)
+{
+ dSP;
+ int argc;
+ SV* retval;
+ ENTER;
+ SAVETMPS;
+ PUSHMARK(sp);
+ XPUSHs(sv_2mortal(newSVnv((UV)ch)));
+ PUTBACK;
+ argc = call_sv(fallback_cb, G_SCALAR);
+ SPAGAIN;
+ if (argc != 1){
+ croak("fallback sub must return scalar!");
+ }
+ retval = newSVsv(POPs);
+ PUTBACK;
+ FREETMPS;
+ LEAVE;
+ return retval;
+}
+
+static SV *
encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
int check, STRLEN * offset, SV * term, int * retcode)
{
}
if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
SV* subchar =
+ (fallback_cb != (SV*)NULL) ? do_fallback_cb(aTHX_ ch) :
newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" :
check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
"&#x%" UVxf ";", (UV)ch);
}
if (check &
(ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
- SV* subchar = newSVpvf("\\x%02" UVXf, (UV)s[slen]);
+ SV* subchar =
+ (fallback_cb != (SV*)NULL) ?
+ do_fallback_cb(aTHX_ (UV)s[slen]) :
+ newSVpvf("\\x%02" UVXf, (UV)s[slen]);
sdone += slen + 1;
ddone += dlen + SvCUR(subchar);
sv_catsv(dst, subchar);
}
void
-Method_decode(obj,src,check = 0)
+Method_decode(obj,src,check_sv = &PL_sv_no)
SV * obj
SV * src
-int check
+SV * check_sv
CODE:
{
+ int check;
encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
if (SvUTF8(src)) {
sv_utf8_downgrade(src, FALSE);
}
+ if (SvROK(check_sv)){
+ if (fallback_cb == (SV*)NULL){
+ fallback_cb = newSVsv(check_sv); /* First time */
+ }else{
+ SvSetSV(fallback_cb, check_sv); /* Been here before */
+ }
+ check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */
+ }else{
+ fallback_cb = (SV*)NULL;
+ check = SvIV(check_sv);
+ }
ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check,
NULL, Nullsv, NULL);
SvUTF8_on(ST(0));
XSRETURN(1);
}
+
+
void
-Method_encode(obj,src,check = 0)
+Method_encode(obj,src,check_sv = &PL_sv_no)
SV * obj
SV * src
-int check
+SV * check_sv
CODE:
{
+ int check;
encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
sv_utf8_upgrade(src);
+ if (SvROK(check_sv)){
+ if (fallback_cb == (SV*)NULL){
+ fallback_cb = newSVsv(check_sv); /* First time */
+ }else{
+ SvSetSV(fallback_cb, check_sv); /* Been here before */
+ }
+ check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */
+ }else{
+ fallback_cb = (SV*)NULL;
+ check = SvIV(check_sv);
+ }
ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check,
NULL, Nullsv, NULL);
XSRETURN(1);
# http://module-build.sourceforge.net/META-spec.html
#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#
name: Encode
-version: 2.11
+version: 2.12
version_from: Encode.pm
installdirs: perl
requires:
-# $Id: encoding.pm,v 2.1 2004/10/19 04:55:01 dankogai Exp $
+# $Id: encoding.pm,v 2.2 2005/09/08 14:17:17 dankogai Exp dankogai $
package encoding;
-our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use strict;
use strict;
#use Test::More qw(no_plan);
-use Test::More tests => 40;
+use Test::More tests => 44;
use Encode q(:all);
my $uo = '';
my $nf = '';
-my ($af, $aq, $ap, $ah, $ax, $uf, $uq, $up, $uh, $ux);
+my ($af, $aq, $ap, $ah, $ax, $uf, $uq, $up, $uh, $ux, $ac, $uc);
for my $i (0x20..0x7e){
$uo .= chr($i);
}
-$af = $aq = $ap = $ah = $ax =
-$uf = $uq = $up = $uh = $ux =
+$af = $aq = $ap = $ah = $ax = $ac =
+$uf = $uq = $up = $uh = $ux = $uc =
$nf = $uo;
my $residue = '';
$ap .= sprintf("\\x{%04x}", $i);
$up .= sprintf("\\x%02X", $i);
$ah .= sprintf("&#%d;", $i);
- $uh .= sprintf("&#%d;", $i);
+ $uh .= sprintf("\\x%02X", $i);
$ax .= sprintf("&#x%x;", $i);
- $ux .= sprintf("&#x%x;", $i);
+ $ux .= sprintf("\\x%02X", $i);
+ $ac .= sprintf("<U+%04X>", $i);
+ $uc .= sprintf("[%02X]", $i);
}
my $ao = $uo;
$src = $uo;
$dst = $ascii->encode($src, FB_PERLQQ);
-is($dst, $ap, "FB_PERLQQ ascii");
-is($src, $uo, "FB_PERLQQ residue ascii");
+is($dst, $ap, "FB_PERLQQ encode");
+is($src, $uo, "FB_PERLQQ residue encode");
$src = $ao;
-$dst = $utf8->decode($src, FB_PERLQQ);
-is($dst, $up, "FB_PERLQQ utf8");
-is($src, $ao, "FB_PERLQQ residue utf8");
+$dst = $ascii->decode($src, FB_PERLQQ);
+is($dst, $up, "FB_PERLQQ decode");
+is($src, $ao, "FB_PERLQQ residue decode");
$src = $uo;
$dst = $ascii->encode($src, FB_HTMLCREF);
-is($dst, $ah, "FB_HTMLCREF ascii");
-is($src, $uo, "FB_HTMLCREF residue ascii");
+is($dst, $ah, "FB_HTMLCREF encode");
+is($src, $uo, "FB_HTMLCREF residue encode");
$src = $ao;
-$dst = $utf8->decode($src, FB_HTMLCREF);
-is($dst, $uh, "FB_HTMLCREF utf8");
-is($src, $ao, "FB_HTMLCREF residue utf8");
+$dst = $ascii->decode($src, FB_HTMLCREF);
+is($dst, $uh, "FB_HTMLCREF decode");
+is($src, $ao, "FB_HTMLCREF residue decode");
$src = $uo;
$dst = $ascii->encode($src, FB_XMLCREF);
-is($dst, $ax, "FB_XMLCREF ascii");
-is($src, $uo, "FB_XMLCREF residue ascii");
+is($dst, $ax, "FB_XMLCREF encode");
+is($src, $uo, "FB_XMLCREF residue encode");
$src = $ao;
-$dst = $utf8->decode($src, FB_XMLCREF);
-is($dst, $ax, "FB_XMLCREF utf8");
-is($src, $ao, "FB_XMLCREF residue utf8");
+$dst = $ascii->decode($src, FB_XMLCREF);
+is($dst, $ux, "FB_XMLCREF decode");
+is($src, $ao, "FB_XMLCREF residue decode");
+
+$src = $uo;
+$dst = $ascii->encode($src, sub{ sprintf "<U+%04X>", shift });
+is($dst, $ac, "coderef encode");
+is($src, $uo, "coderef residue encode");
+
+$src = $ao;
+$dst = $ascii->decode($src, sub{ sprintf "[%02X]", shift });
+is($dst, $uc, "coderef decode");
+is($src, $ao, "coderef residue decode");
#
-# $Id: 8859-7.ucm,v 2.0 2004/05/16 20:55:19 dankogai Exp $
+# $Id: 8859-7.ucm,v 2.1 2005/09/08 14:17:17 dankogai Exp dankogai $
#
# Original table can be obtained at
# http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT
<U00B7> \xB7 |0 # MIDDLE DOT
<U00BB> \xBB |0 # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
<U00BD> \xBD |0 # VULGAR FRACTION ONE HALF
+<U037A> \xAA |0 # GREEK YPOGEGRAMMENI
<U0384> \xB4 |0 # GREEK TONOS
<U0385> \xB5 |0 # GREEK DIALYTIKA TONOS
<U0386> \xB6 |0 # GREEK CAPITAL LETTER ALPHA WITH TONOS
<U2015> \xAF |0 # HORIZONTAL BAR
<U2018> \xA1 |0 # LEFT SINGLE QUOTATION MARK
<U2019> \xA2 |0 # RIGHT SINGLE QUOTATION MARK
+<U20AC> \xA4 |0 # EURO SIGN
+<U20AF> \xA5 |0 # DRACHMA SIGN
END CHARMAP