From: Nick Ing-Simmons Date: Tue, 27 Mar 2001 19:38:50 +0000 (+0000) Subject: Various EBCDIC fixes: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=5ad8ef521b3ffc4e6bbbb9941bc4940d442b56b2;p=p5sagit%2Fp5-mst-13.2.git Various EBCDIC fixes: - major revelation that swash code is encoding aware, (or thought it was) - now it is ;-) - With that out of the way fix a slab of tr/// cases. - Fix Encode 'Unicode' to be true Unicode so tests pass. - As anticipated Base64.xs needed tweaks. - Until tr/// works right avoid old_encode64 in MIME tests. p4raw-id: //depot/perlio@9384 --- diff --git a/doop.c b/doop.c index 823c88d..266411a 100644 --- a/doop.c +++ b/doop.c @@ -36,7 +36,7 @@ S_do_trans_simple(pTHX_ SV *sv) tbl = (short*)cPVOP->op_pv; if (!tbl) - Perl_croak(aTHX_ "panic: do_trans_simple"); + Perl_croak(aTHX_ "panic: do_trans_simple line %d",__LINE__); s = (U8*)SvPV(sv, len); send = s + len; @@ -103,7 +103,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ tbl = (short*)cPVOP->op_pv; if (!tbl) - Perl_croak(aTHX_ "panic: do_trans_count"); + Perl_croak(aTHX_ "panic: do_trans_count line %d",__LINE__); s = (U8*)SvPV(sv, len); send = s + len; @@ -147,7 +147,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ tbl = (short*)cPVOP->op_pv; if (!tbl) - Perl_croak(aTHX_ "panic: do_trans_complex"); + Perl_croak(aTHX_ "panic: do_trans_complex line %d",__LINE__); s = (U8*)SvPV(sv, len); isutf8 = SvUTF8(sv); @@ -346,7 +346,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ if ((uv = swash_fetch(rv, s)) < none) { s += UTF8SKIP(s); matches++; - d = uvchr_to_utf8(d, uv); + d = uvuni_to_utf8(d, uv); } else if (uv == none) { int i = UTF8SKIP(s); @@ -358,7 +358,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ int i = UTF8SKIP(s); s += i; matches++; - d = uvchr_to_utf8(d, final); + d = uvuni_to_utf8(d, final); } else s += UTF8SKIP(s); @@ -367,7 +367,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ STRLEN clen = d - dstart; STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; if (!grows) - Perl_croak(aTHX_ "panic: do_trans_complex_utf8"); + Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__); Renew(dstart, nlen+UTF8_MAXLEN, U8); d = dstart + clen; dend = dstart + nlen; @@ -496,7 +496,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ STRLEN clen = d - dstart; STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; if (!grows) - Perl_croak(aTHX_ "panic: do_trans_complex_utf8"); + Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__); Renew(dstart, nlen+UTF8_MAXLEN, U8); d = dstart + clen; dend = dstart + nlen; @@ -505,7 +505,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ matches++; s += UTF8SKIP(s); if (uv != puv) { - d = uvchr_to_utf8(d, uv); + d = uvuni_to_utf8(d, uv); puv = uv; } continue; @@ -523,13 +523,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ if (havefinal) { s += UTF8SKIP(s); if (puv != final) { - d = uvchr_to_utf8(d, final); + d = uvuni_to_utf8(d, final); puv = final; } } else { STRLEN len; - uv = utf8_to_uvchr(s, &len); + uv = utf8_to_uvuni(s, &len); if (uv != puv) { Copy(s, d, len, U8); d += len; @@ -550,7 +550,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ STRLEN clen = d - dstart; STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; if (!grows) - Perl_croak(aTHX_ "panic: do_trans_complex_utf8"); + Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__); Renew(dstart, nlen+UTF8_MAXLEN, U8); d = dstart + clen; dend = dstart + nlen; @@ -558,7 +558,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ if (uv < none) { matches++; s += UTF8SKIP(s); - d = uvchr_to_utf8(d, uv); + d = uvuni_to_utf8(d, uv); continue; } else if (uv == none) { /* "none" is unmapped character */ @@ -571,7 +571,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ else if (uv == extra && !del) { matches++; s += UTF8SKIP(s); - d = uvchr_to_utf8(d, final); + d = uvuni_to_utf8(d, final); continue; } matches++; /* "none+1" is delete character */ diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 6501806..fde3891 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -60,6 +60,7 @@ sub findAlias my $alias = $alias[$i]; my $val = $alias[$i+1]; my $new; + if (ref($alias) eq 'Regexp' && $_ =~ $alias) { $new = eval $val; @@ -68,7 +69,7 @@ sub findAlias { $new = &{$alias}($val) } - elsif (lc($_) eq $alias) + elsif (lc($_) eq lc($alias)) { $new = $val; } @@ -222,13 +223,15 @@ sub new_sequence { return $_[0] } package Encode::XS; use base 'Encode::Encoding'; -package Encode::Unicode; +package Encode::Internal; use base 'Encode::Encoding'; # Dummy package that provides the encode interface but leaves data # as UTF-X encoded. It is here so that from_to() works. -__PACKAGE__->Define('Unicode'); +__PACKAGE__->Define('Internal'); + +Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65; sub decode { @@ -240,6 +243,36 @@ sub decode *encode = \&decode; +package Encoding::Unicode; +use base 'Encode::Encoding'; + +__PACKAGE__->Define('Unicode') unless ord('A') == 65; + +sub decode +{ + my ($obj,$str,$chk) = @_; + my $res = ''; + for (my $i = 0; $i < length($str); $i++) + { + $res .= chr(utf8::unicode_to_native(ord(substr($str,$i,1)))); + } + $_[1] = '' if $chk; + return $res; +} + +sub encode +{ + my ($obj,$str,$chk) = @_; + my $res = ''; + for (my $i = 0; $i < length($str); $i++) + { + $res .= chr(utf8::native_to_unicode(ord(substr($str,$i,1)))); + } + $_[1] = '' if $chk; + return $res; +} + + package Encode::utf8; use base 'Encode::Encoding'; # package to allow long-hand diff --git a/ext/MIME/Base64/Base64.xs b/ext/MIME/Base64/Base64.xs index 118d170..f77ba14 100644 --- a/ext/MIME/Base64/Base64.xs +++ b/ext/MIME/Base64/Base64.xs @@ -11,15 +11,15 @@ metamail, which comes with this message: Copyright (c) 1991 Bell Communications Research, Inc. (Bellcore) - Permission to use, copy, modify, and distribute this material - for any purpose and without fee is hereby granted, provided - that the above copyright notice and this permission notice - appear in all copies, and that the name of Bellcore not be - used in advertising or publicity pertaining to this - material without the specific, prior written permission - of an authorized representative of Bellcore. BELLCORE - MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY - OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS", + Permission to use, copy, modify, and distribute this material + for any purpose and without fee is hereby granted, provided + that the above copyright notice and this permission notice + appear in all copies, and that the name of Bellcore not be + used in advertising or publicity pertaining to this + material without the specific, prior written permission + of an authorized representative of Bellcore. BELLCORE + MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY + OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. */ @@ -160,7 +160,7 @@ decode_base64(sv) PREINIT: STRLEN len; - register unsigned char *str = (unsigned char*)SvPV(sv, len); + register unsigned char *str = (unsigned char*)SvPVbyte(sv, len); unsigned char const* end = str + len; char *r; unsigned char c[4]; @@ -177,7 +177,7 @@ decode_base64(sv) while (str < end) { int i = 0; do { - unsigned char uc = index_64[*str++]; + unsigned char uc = index_64[NATIVE_TO_ASCII(*str++)]; if (uc != INVALID) c[i++] = uc; @@ -192,7 +192,7 @@ decode_base64(sv) break; } } while (i < 4); - + if (c[0] == EQ || c[1] == EQ) { if (PL_dowarn) warn("Premature padding of base64 data"); break; diff --git a/t/lib/encode.t b/t/lib/encode.t index d4a13ee..ceeb422 100644 --- a/t/lib/encode.t +++ b/t/lib/encode.t @@ -30,10 +30,10 @@ $cpy = $str; ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong"); my $sym = Encode->getEncoding('symbol'); -my $uni = $sym->decode('a'); +my $uni = $sym->decode(encode(ascii => 'a')); ok("\N{alpha}",substr($uni,0,1),"alpha does not map to symbol 'a'"); $str = $sym->encode("\N{Beta}"); -ok("B",substr($str,0,1),"Symbol 'B' does not map to Beta"); +ok("B",decode(ascii => substr($str,0,1)),"Symbol 'B' does not map to Beta"); foreach my $enc (qw(symbol dingbats ascii),@encodings) { diff --git a/t/lib/mimeb64.t b/t/lib/mimeb64.t index 5bb78b1..b122552 100644 --- a/t/lib/mimeb64.t +++ b/t/lib/mimeb64.t @@ -9,6 +9,16 @@ print "1..282\n"; print "# Testing MIME::Base64-", $MIME::Base64::VERSION, "\n"; +BEGIN { + if (ord('A') != 193) { + *ASCII = sub { return $_[0] }; + } + else { + require Encode; + *ASCII = sub { Encode::encode('ascii',$_[0]) }; + } +} + $testno = 1; encodeTest(); @@ -23,27 +33,6 @@ sub encodeTest print "# encode test\n"; my @encode_tests = ( - ['' => ''], - ['a' => 'YQ=='], - ['aa' => 'YWE='], - ['aaa' => 'YWFh'], - - ['aaa' => 'YWFh'], - ['aaa' => 'YWFh'], - ['aaa' => 'YWFh'], - - ["\000\377" => "AP8="], - ["\377\000" => "/wA="], - ["\000\000\000" => "AAAA"], - - # from HTTP spec - ['Aladdin:open sesame' => 'QWxhZGRpbjpvcGVuIHNlc2FtZQ=='], - - ['a' x 100 => 'YWFh' x 33 . 'YQ=='], - - ['Multipurpose Internet Mail Extensions: The Base64 Content-Transfer-Encoding is designed to represent sequences of octets in a form that is not humanly readable. ' - => "TXVsdGlwdXJwb3NlIEludGVybmV0IE1haWwgRXh0ZW5zaW9uczogVGhlIEJhc2U2NCBDb250ZW50LVRyYW5zZmVyLUVuY29kaW5nIGlzIGRlc2lnbmVkIHRvIHJlcHJlc2VudCBzZXF1ZW5jZXMgb2Ygb2N0ZXRzIGluIGEgZm9ybSB0aGF0IGlzIG5vdCBodW1hbmx5IHJlYWRhYmxlLiA="], - # All values ["\000" => "AA=="], ["\001" => "AQ=="], @@ -301,6 +290,29 @@ sub encodeTest ["\375" => "/Q=="], ["\376" => "/g=="], ["\377" => "/w=="], + + ["\000\377" => "AP8="], + ["\377\000" => "/wA="], + ["\000\000\000" => "AAAA"], + + ['' => ''], + [ASCII('a') => 'YQ=='], + [ASCII('aa') => 'YWE='], + [ASCII('aaa') => 'YWFh'], + + [ASCII('aaa') => 'YWFh'], + [ASCII('aaa') => 'YWFh'], + [ASCII('aaa') => 'YWFh'], + + + # from HTTP spec + [ASCII('Aladdin:open sesame') => 'QWxhZGRpbjpvcGVuIHNlc2FtZQ=='], + + [ASCII('a') x 100 => 'YWFh' x 33 . 'YQ=='], + + [ASCII('Multipurpose Internet Mail Extensions: The Base64 Content-Transfer-Encoding is designed to represent sequences of octets in a form that is not humanly readable. ') + => "TXVsdGlwdXJwb3NlIEludGVybmV0IE1haWwgRXh0ZW5zaW9uczogVGhlIEJhc2U2NCBDb250ZW50LVRyYW5zZmVyLUVuY29kaW5nIGlzIGRlc2lnbmVkIHRvIHJlcHJlc2VudCBzZXF1ZW5jZXMgb2Ygb2N0ZXRzIGluIGEgZm9ybSB0aGF0IGlzIG5vdCBodW1hbmx5IHJlYWRhYmxlLiA="], + ); for $test (@encode_tests) { @@ -313,11 +325,12 @@ sub encodeTest } my $decoded = decode_base64($encoded); if ($decoded ne $plain) { - print "test $testno ($plain): expected $expected, got $encoded\n"; + print "test $testno ($encoded): expected $plain, got $decoded\n"; print "not "; } - # Try the old C versions too + if (ord('A') != 193) { # perl versions broken on EBCDIC + # Try the old Perl versions too if ($encoded ne MIME::Base64::old_encode_base64($plain, '')) { print "old_encode_base64 give different result.\n"; print "not "; @@ -326,6 +339,7 @@ sub encodeTest print "old_decode_base64 give different result.\n"; print "not "; } + } print "ok $testno\n"; $testno++; @@ -339,17 +353,17 @@ sub decodeTest local $SIG{__WARN__} = sub { print $_[0] }; # avoid warnings on stderr my @decode_tests = ( - ['YWE=' => 'aa'], - [' YWE=' => 'aa'], - ['Y WE=' => 'aa'], - ['YWE= ' => 'aa'], - ["Y\nW\r\nE=" => 'aa'], + ['YWE=' => ASCII('aa')], + [' YWE=' => ASCII('aa')], + ['Y WE=' => ASCII('aa')], + ['YWE= ' => ASCII('aa')], + ["Y\nW\r\nE=" => ASCII('aa')], # These will generate some warnings - ['YWE=====' => 'aa'], # extra padding - ['YWE' => 'aa'], # missing padding - ['YWFh====' => 'aaa'], - ['YQ' => 'a'], + ['YWE=====' => ASCII('aa')], # extra padding + ['YWE' => ASCII('aa')], # missing padding + ['YWFh====' => ASCII('aaa')], + ['YQ' => ASCII('a')], ['Y' => ''], ['' => ''], [undef() => ''], diff --git a/utf8.c b/utf8.c index b95c7ad..66d3fec 100644 --- a/utf8.c +++ b/utf8.c @@ -1280,14 +1280,34 @@ UV Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) { HV* hv = (HV*)SvRV(sv); + /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ + then the "swatch" is a vec() for al the chars which start + with 0xAA..0xYY + So the key in the hash is length of encoded char -1 + */ U32 klen = UTF8SKIP(ptr) - 1; - U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */ + U32 off = ptr[klen]; STRLEN slen; - STRLEN needents = (klen ? 64 : 128); + STRLEN needents; U8 *tmps; U32 bit; SV *retval; + if (klen == 0) + { + /* If char in invariant then swatch is for all the invariant chars + * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK + */ + needents = UTF_CONTINUATION_MARK; + off = NATIVE_TO_UTF(ptr[klen]); + } + else + { + /* If char is encoded then swatch is for the prefix */ + needents = (1 << UTF_ACCUMULATION_SHIFT); + off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK; + } + /* * This single-entry cache saves about 1/3 of the utf8 overhead in test * suite. (That is, only 7-8% overall over just a hash cache. Still, @@ -1337,7 +1357,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) svp = hv_store(hv, (char*)ptr, klen, retval, 0); - if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8) + if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents) Perl_croak(aTHX_ "SWASHGET didn't return result of proper length"); }