#include "encode.h"
#include "def_t.h"
+#define FBCHAR 0xFFFd
+#define BOM_BE 0xFeFF
+#define BOM16LE 0xFFFe
+#define BOM32LE 0xFFFe0000
+
+#define valid_ucs2(x) ((0 <= (x) && (x) < 0xD800) || (0xDFFF < (x) && (x) <= 0xFFFF))
+
+#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
+#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
+#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
+
+static UV
+enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
+{
+ U8 *s = *sp;
+ UV v = 0;
+ if (s+size > e) {
+ croak("Partial character %c",(char) endian);
+ }
+ switch(endian) {
+ case 'N':
+ v = *s++;
+ v = (v << 8) | *s++;
+ case 'n':
+ v = (v << 8) | *s++;
+ v = (v << 8) | *s++;
+ break;
+ case 'V':
+ case 'v':
+ v |= *s++;
+ v |= (*s++ << 8);
+ if (endian == 'v')
+ break;
+ v |= (*s++ << 16);
+ v |= (*s++ << 24);
+ break;
+ default:
+ croak("Unknown endian %c",(char) endian);
+ break;
+ }
+ *sp = s;
+ return v;
+}
+
+void
+enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
+{
+ U8 *d = SvGROW(result,SvCUR(result)+size);
+ switch(endian) {
+ case 'v':
+ case 'V':
+ d += SvCUR(result);
+ SvCUR_set(result,SvCUR(result)+size);
+ while (size--) {
+ *d++ = value & 0xFF;
+ value >>= 8;
+ }
+ break;
+ case 'n':
+ case 'N':
+ SvCUR_set(result,SvCUR(result)+size);
+ d += SvCUR(result);
+ while (size--) {
+ *--d = value & 0xFF;
+ value >>= 8;
+ }
+ break;
+ default:
+ croak("Unknown endian %c",(char) endian);
+ break;
+ }
+}
+
#define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
t/encoding.t dumps core because of
Perl_warner and PerlIO don't work well */
XSRETURN(1);
}
+MODULE = Encode PACKAGE = Encode::Unicode
+
+void
+decode_xs(obj, str, chk = &PL_sv_undef)
+SV * obj
+SV * str
+SV * chk
+CODE:
+{
+ int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
+ U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
+ int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
+ SV *result = newSVpvn("",0);
+ STRLEN ulen;
+ U8 *s = SvPVbyte(str,ulen);
+ U8 *e = SvEND(str);
+ ST(0) = sv_2mortal(result);
+ SvUTF8_on(result);
+
+ if (!endian && s+size <= e) {
+ UV bom;
+ endian = (size == 4) ? 'N' : 'n';
+ bom = enc_unpack(aTHX_ &s,e,size,endian);
+ if (bom != BOM_BE) {
+ if (bom == BOM16LE) {
+ endian = 'v';
+ }
+ else if (bom == BOM32LE) {
+ endian = 'V';
+ }
+ else {
+ croak("%s:Unregognised BOM %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom);
+ }
+ }
+#if 0
+ /* Update endian for this sequence */
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#endif
+ }
+ while (s < e && s+size <= e) {
+ UV ord = enc_unpack(aTHX_ &s,e,size,endian);
+ U8 *d;
+ if (size != 4 && !valid_ucs2(ord)) {
+ if (ucs2) {
+ if (SvTRUE(chk)) {
+ croak("%s:no surrogates allowed %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+ }
+ if (s+size <= e) {
+ enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */
+ }
+ ord = FBCHAR;
+ }
+ else {
+ UV lo;
+ if (!isHiSurrogate(ord)) {
+ croak("%s:Malformed HI surrogate %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+ }
+ if (s+size > e) {
+ /* Partial character */
+ s -= size; /* back up to 1st half */
+ break; /* And exit loop */
+ }
+ lo = enc_unpack(aTHX_ &s,e,size,endian);
+ if (!isLoSurrogate(lo)){
+ croak("%s:Malformed LO surrogate %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+ }
+ ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
+ }
+ }
+ d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
+ d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
+ SvCUR_set(result,d - (U8 *)SvPVX(result));
+ }
+ if (SvTRUE(chk)) {
+ if (s < e) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
+ Move(s,SvPVX(str),e-s,U8);
+ SvCUR_set(str,(e-s));
+ }
+ else {
+ SvCUR_set(str,0);
+ }
+ *SvEND(str) = '\0';
+ }
+ XSRETURN(1);
+}
+
+void
+encode_xs(obj, utf8, chk = &PL_sv_undef)
+SV * obj
+SV * utf8
+SV * chk
+CODE:
+{
+ int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
+ U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
+ int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
+ SV *result = newSVpvn("",0);
+ STRLEN ulen;
+ U8 *s = SvPVutf8(utf8,ulen);
+ U8 *e = SvEND(utf8);
+ ST(0) = sv_2mortal(result);
+ if (!endian) {
+ endian = (size == 4) ? 'N' : 'n';
+ enc_pack(aTHX_ result,size,endian,BOM_BE);
+#if 0
+ /* Update endian for this sequence */
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#endif
+ }
+ while (s < e && s+UTF8SKIP(s) <= e) {
+ STRLEN len;
+ UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
+ s += len;
+ if (size != 4 && !valid_ucs2(ord)) {
+ if (!issurrogate(ord)){
+ if (ucs2) {
+ if (SvTRUE(chk)) {
+ croak("%s:code point \"\\x{"UVxf"}\" too high",
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+ }
+ enc_pack(aTHX_ result,size,endian,FBCHAR);
+ }else{
+ UV hi = ((ord - 0x10000) >> 10) + 0xD800;
+ UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
+ enc_pack(aTHX_ result,size,endian,hi);
+ enc_pack(aTHX_ result,size,endian,lo);
+ }
+ }
+ else {
+ /* not supposed to happen */
+ enc_pack(aTHX_ result,size,endian,FBCHAR);
+ }
+ }
+ else {
+ enc_pack(aTHX_ result,size,endian,ord);
+ }
+ }
+ if (SvTRUE(chk)) {
+ if (s < e) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
+ Move(s,SvPVX(utf8),e-s,U8);
+ SvCUR_set(utf8,(e-s));
+ }
+ else {
+ SvCUR_set(utf8,0);
+ }
+ *SvEND(utf8) = '\0';
+ }
+ XSRETURN(1);
+}
+
MODULE = Encode PACKAGE = Encode
PROTOTYPES: ENABLE
sub BOM32LE(){ 0xFFFe0000 }
sub valid_ucs2($){
- return
- (0 <= $_[0] && $_[0] < 0xD800)
+ return
+ (0 <= $_[0] && $_[0] < 0xD800)
|| ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
}
}
sub name { shift->{'Name'} }
-sub new_sequence { $_[0] };
+
+sub new_sequence
+{
+ my $self = shift;
+ # Return the original if endian known
+ return $self if ($self->{endian});
+ # Return a clone
+ return bless {%$self},ref($self);
+}
#
-# two implementation of (en|de)code exist. *_modern use
+# Three implementation of (en|de)code exist. *_modern use
# an array and *_classic stick with substr. *_classic is much
-# slower but more memory conservative. *_modern is default.
+# slower but more memory conservative.
+# *_xs is C code in Encode.xs
+# *_xs is the default.
sub set_transcoder{
no warnings qw(redefine);
}elsif($type eq "classic"){
*decode = \&decode_classic;
*encode = \&encode_classic;
+ }elsif($type eq "xs"){
+ *decode = \&decode_xs;
+ *encode = \&encode_xs;
}else{
- require Carp;
- Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
+ require Carp;
+ Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
}
}
-set_transcoder("modern");
+set_transcoder("xs");
#
# *_modern are much faster but guzzle more memory
my $ord = shift @ord;
unless ($size == 4 or valid_ucs2($ord &= $mask)){
if ($ucs2){
- $chk and
+ $chk and
poisoned2death($obj, "no surrogates allowed", $ord);
shift @ord; # skip the next one as well
$ord = FBCHAR;
unless ($size == 4 or valid_ucs2($ord)) {
unless(issurrogate($ord)){
if ($ucs2){
- $chk and
+ $chk and
poisoned2death($obj, "code point too high", $ord);
push @str, FBCHAR;
}else{
-
+
push @str, ensurrogate($ord);
}
}else{ # not supposed to happen
my $ord = unpack($endian, substr($str, 0, $size, ''));
unless ($size == 4 or valid_ucs2($ord &= $mask)){
if ($ucs2){
- $chk and
+ $chk and
poisoned2death($obj, "no surrogates allowed", $ord);
substr($str,0,$size,''); # skip the next one as well
$ord = FBCHAR;
unless ($size == 4 or valid_ucs2($ord)) {
unless(issurrogate($ord)){
if ($ucs2){
- $chk and
+ $chk and
poisoned2death($obj, "code point too high", $ord);
$str .= pack($endian, FBCHAR);
}else{
my ($size, $bom) = @_;
my $N = $size == 2 ? 'n' : 'N';
my $ord = unpack($N, $bom);
- return ($ord eq BOM_BE) ? $N :
+ return ($ord eq BOM_BE) ? $N :
($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
}
=head1 SYNOPSIS
- use Encode qw/encode decode/;
+ use Encode qw/encode decode/;
$ucs2 = encode("UCS-2BE", $utf8);
$utf8 = decode("UCS-2BE", $ucs2);
-------------------------
=back
-
+
This modules handles BOM as follows.
=over 4
When BE or LE is omitted during decode(), it checks if BOM is in the
beginning of the string and if found endianness is set to what BOM
-says. If not found, dies.
+says. If not found, dies.
=item *
$uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
-perl does not prohibit the use of characters within this range. To perl,
+perl does not prohibit the use of characters within this range. To perl,
every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
(*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
L<http://www.unicode.org/unicode/faq/utf_bom.html>
Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
-by Larry Wall, Tom Christiansen, Jon Orwant;
+by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8
=cut