From: Steve Hay Date: Thu, 22 Jan 2009 09:51:13 +0000 (+0000) Subject: Upgrade to Encode-2.27 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=64bc6d54463906fbe8a23ff2c2507dc2d91d0df0;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode-2.27 --- diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS index 647b356..022a3c4 100644 --- a/ext/Encode/AUTHORS +++ b/ext/Encode/AUTHORS @@ -9,7 +9,8 @@ # # This list is in alphabetical order. -- -Andreas J. Koenig +Alex Davies +Andreas J. Koenig Anton Tagunov Autrijus Tang Benjamin Goldberg diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 3039058..a65c47e 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,13 +1,36 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 2.26 2008/07/01 20:56:17 dankogai Exp dankogai $ +# $Id: Changes,v 2.27 2009/01/21 22:55:07 dankogai Exp dankogai $ # -$Revision: 2.26 $ $Date: 2008/07/01 20:56:17 $ +$Revision: 2.27 $ $Date: 2009/01/21 22:55:07 $ +! lib/Encode/MIME/Header.pm t/mime-header.t + Addressed: Encode::MIME::Header MIME-Q encoding truncates + trailing zeros in some circumstances + http://rt.cpan.org/Ticket/Display.html?id=342627 +! lib/Encode/Alias.pm + Added alias: unicode-1-1-utf-7 + http://rt.cpan.org/Ticket/Display.html?id=38558 +! Encode.pm + Documented: _utf8_on() does not work for tainted values + http://rt.cpan.org/Ticket/Display.html?id=41163 +! bin/enc2xs + s[oss.software.ibm.com/icu][www.icu-project.org]g + http://rt.cpan.org/Ticket/Display.html?id=40245 +! lib/Encode/Guess.pm t/guess.t + Addressed:Empty file should produce an error message + http://rt.cpan.org/Ticket/Display.html?id=38652 + + +2.26 2008/07/01 20:56:17 +! Unicode/Unicode.xs AUTHORS + Refactored by Alex Davies + http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2007-10/msg00745.html + Message-Id: <7637669B2E3D46B187591747DA27F4C8@Amelie> ! Encode.pm Absense of Encode::ConfigLocal no longer carps no matter what. - https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2 - https://rt.cpan.org/Ticket/Display.html?id=28638 - https://rt.cpan.org/Ticket/Display.html?id=11511 + http://bugzilla.redhat.com/show_bug.cgi?id=435505#c2 + http://rt.cpan.org/Ticket/Display.html?id=28638 + http://rt.cpan.org/Ticket/Display.html?id=11511 ! lib/Encode/JIS7.pm use encoding 'utf8' and 'iso-2022-jp' glitches on perl 5.10 Thanks, MIYAGAWA diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 0209257..878536b 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,10 +1,10 @@ # -# $Id: Encode.pm,v 2.26 2008/07/01 20:56:17 dankogai Exp dankogai $ +# $Id: Encode.pm,v 2.27 2009/01/21 22:55:07 dankogai Exp dankogai $ # package Encode; use strict; use warnings; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.26 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.27 $ =~ /(\d+)/g; sub DEBUG () { 0 } use XSLoader (); XSLoader::load( __PACKAGE__, $VERSION ); @@ -863,6 +863,8 @@ B that the STRING is well-formed UTF-8. Returns the previous state of the UTF8 flag (so please don't treat the return value as indicating success or failure), or C if STRING is not a string. +This function does not work on tainted values. + =item _utf8_off(STRING) [INTERNAL] Turns off the UTF8 flag in STRING. Do not use frivolously. @@ -870,6 +872,8 @@ Returns the previous state of the UTF8 flag (so please don't treat the return value as indicating success or failure), or C if STRING is not a string. +This function does not work on tainted values. + =back =head1 UTF-8 vs. utf8 vs. UTF8 diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL index 7a78d11..5b8f832 100644 --- a/ext/Encode/Makefile.PL +++ b/ext/Encode/Makefile.PL @@ -1,5 +1,5 @@ # -# $Id: Makefile.PL,v 2.7 2008/07/01 20:56:17 dankogai Exp dankogai $ +# $Id: Makefile.PL,v 2.7 2008/07/01 20:56:17 dankogai Exp $ # use 5.007003; use strict; diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs index 9efead6..da069c1 100644 --- a/ext/Encode/Unicode/Unicode.xs +++ b/ext/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 2.3 2006/05/03 18:24:10 dankogai Exp $ + $Id: Unicode.xs,v 2.4 2009/01/21 22:55:07 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -18,65 +18,84 @@ #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) +#define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ + +/* Avoid wasting too much space in the result buffer */ +static void +shrink_buffer(SV *result) +{ + if (SvLEN(result) > 42 + SvCUR(result)) { + char *buf; + STRLEN datalen = 1 + SvCUR(result); /* include the NUL byte */ + STRLEN buflen = PERL_STRLEN_ROUNDUP(datalen); + Newx(buf, buflen, char); + Copy(SvPVX(result), buf, datalen, char); + Safefree(SvPVX(result)); + SvPV_set(result, buf); + SvLEN_set(result, buflen); + } +} + static UV -enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) +enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) { U8 *s = *sp; UV v = 0; if (s+size > e) { - croak("Partial character %c",(char) endian); + croak("Partial character %c",(char) endian); } switch(endian) { case 'N': - v = *s++; - v = (v << 8) | *s++; + v = *s++; + v = (v << 8) | *s++; case 'n': - v = (v << 8) | *s++; - v = (v << 8) | *s++; - break; + v = (v << 8) | *s++; + v = (v << 8) | *s++; + break; case 'V': case 'v': - v |= *s++; - v |= (*s++ << 8); - if (endian == 'v') - break; - v |= (*s++ << 16); - v |= (*s++ << 24); - break; + v |= *s++; + v |= (*s++ << 8); + if (endian == 'v') + break; + v |= (*s++ << 16); + v |= (*s++ << 24); + break; default: - croak("Unknown endian %c",(char) endian); - break; + croak("Unknown endian %c",(char) endian); + break; } *sp = s; return v; } void -enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) +enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) { - U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1); + U8 *d = (U8 *) SvPV_nolen(result); + switch(endian) { case 'v': case 'V': - d += SvCUR(result); - SvCUR_set(result,SvCUR(result)+size); - while (size--) { - *d++ = (U8)(value & 0xFF); - value >>= 8; - } - break; + d += SvCUR(result); + SvCUR_set(result,SvCUR(result)+size); + while (size--) { + *d++ = (U8)(value & 0xFF); + value >>= 8; + } + break; case 'n': case 'N': - SvCUR_set(result,SvCUR(result)+size); - d += SvCUR(result); - while (size--) { - *--d = (U8)(value & 0xFF); - value >>= 8; - } - break; + SvCUR_set(result,SvCUR(result)+size); + d += SvCUR(result); + while (size--) { + *--d = (U8)(value & 0xFF); + value >>= 8; + } + break; default: - croak("Unknown endian %c",(char) endian); - break; + croak("Unknown endian %c",(char) endian); + break; } } @@ -94,124 +113,162 @@ SV * str IV check CODE: { - U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); - int size = SvIV(attr("size", 4)); - int ucs2 = SvTRUE(attr("ucs2", 4)); - int renewed = SvTRUE(attr("renewed", 7)); - SV *result = newSVpvn("",0); + U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); + int size = SvIV(attr("size", 4)); + int ucs2 = -1; /* only needed in the event of surrogate pairs */ + SV *result = newSVpvn("",0); + STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ STRLEN ulen; + STRLEN resultbuflen; + U8 *resultbuf; U8 *s = (U8 *)SvPVbyte(str,ulen); U8 *e = (U8 *)SvEND(str); + /* Optimise for the common case of being called from PerlIOEncode_fill() + with a standard length buffer. In this case the result SV's buffer is + only used temporarily, so we can afford to allocate the maximum needed + and not care about unused space. */ + const bool temp_result = (ulen == PERLIO_BUFSIZ); + ST(0) = sv_2mortal(result); SvUTF8_on(result); if (!endian && s+size <= e) { - UV bom; - endian = (size == 4) ? 'N' : 'n'; - bom = enc_unpack(aTHX_ &s,e,size,endian); - if (bom != BOM_BE) { - if (bom == BOM16LE) { - endian = 'v'; - } - else if (bom == BOM32LE) { - endian = 'V'; - } - else { - croak("%"SVf":Unrecognised BOM %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - bom); - } - } + UV bom; + endian = (size == 4) ? 'N' : 'n'; + bom = enc_unpack(aTHX_ &s,e,size,endian); + if (bom != BOM_BE) { + if (bom == BOM16LE) { + endian = 'v'; + } + else if (bom == BOM32LE) { + endian = 'V'; + } + else { + croak("%"SVf":Unrecognised BOM %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + bom); + } + } #if 1 - /* Update endian for next sequence */ - if (renewed) { - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); - } + /* Update endian for next sequence */ + if (SvTRUE(attr("renewed", 7))) { + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + } #endif } - while (s < e && s+size <= e) { - UV ord = enc_unpack(aTHX_ &s,e,size,endian); - U8 *d; - if (issurrogate(ord)) { - if (ucs2 || size == 4) { - if (check) { - croak("%"SVf":no surrogates allowed %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } - if (s+size <= e) { - /* skip the next one as well */ - enc_unpack(aTHX_ &s,e,size,endian); - } - ord = FBCHAR; - } - else { - UV lo; - if (!isHiSurrogate(ord)) { - if (check) { - croak("%"SVf":Malformed HI surrogate %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } - else { - ord = FBCHAR; - } - } - else { - if (s+size > e) { - /* Partial character */ - s -= size; /* back up to 1st half */ - break; /* And exit loop */ - } - lo = enc_unpack(aTHX_ &s,e,size,endian); - if (!isLoSurrogate(lo)){ - if (check) { - croak("%"SVf":Malformed LO surrogate %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } - else { - ord = FBCHAR; - } - } - else { - ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); - } - } - } - } - if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { - if (check) { - croak("%"SVf":Unicode character %"UVxf" is illegal", - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } else { - ord = FBCHAR; - } + if (temp_result) { + resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; + } else { + /* Preallocate the buffer to the minimum possible space required. */ + resultbuflen = ulen/usize + UTF8_MAXLEN + 1; } + resultbuf = (U8 *) SvGROW(result, resultbuflen); - d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); - d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); - SvCUR_set(result,d - (U8 *)SvPVX(result)); - } - if (s < e) { - /* unlikely to happen because it's fixed-length -- dankogai */ - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", - *hv_fetch((HV *)SvRV(obj),"Name",4,0)); - } + while (s < e && s+size <= e) { + UV ord = enc_unpack(aTHX_ &s,e,size,endian); + U8 *d; + if (issurrogate(ord)) { + if (ucs2 == -1) { + ucs2 = SvTRUE(attr("ucs2", 4)); + } + if (ucs2 || size == 4) { + if (check) { + croak("%"SVf":no surrogates allowed %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } + if (s+size <= e) { + /* skip the next one as well */ + enc_unpack(aTHX_ &s,e,size,endian); + } + ord = FBCHAR; + } + else { + UV lo; + if (!isHiSurrogate(ord)) { + if (check) { + croak("%"SVf":Malformed HI surrogate %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } + else { + ord = FBCHAR; + } + } + else { + if (s+size > e) { + /* Partial character */ + s -= size; /* back up to 1st half */ + break; /* And exit loop */ + } + lo = enc_unpack(aTHX_ &s,e,size,endian); + if (!isLoSurrogate(lo)) { + if (check) { + croak("%"SVf":Malformed LO surrogate %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } + else { + ord = FBCHAR; + } + } + else { + ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); + } + } + } + } + + if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { + if (check) { + croak("%"SVf":Unicode character %"UVxf" is illegal", + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } else { + ord = FBCHAR; + } + } + + if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { + /* Do not allocate >8Mb more than the minimum needed. + This prevents allocating too much in the rogue case of a large + input consisting initially of long sequence uft8-byte unicode + chars followed by single utf8-byte chars. */ + STRLEN remaining = (e - s)/usize; + STRLEN max_alloc = remaining + (8*1024*1024); + STRLEN est_alloc = remaining * UTF8_MAXLEN; + STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ + (est_alloc > max_alloc ? max_alloc : est_alloc); + resultbuf = (U8 *) SvGROW(result, newlen); + resultbuflen = SvLEN(result); + } + + d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0); + SvCUR_set(result, d - (U8 *)SvPVX(result)); } - if (check && !(check & ENCODE_LEAVE_SRC)){ + if (s < e) { - Move(s,SvPVX(str),e-s,U8); - SvCUR_set(str,(e-s)); + /* unlikely to happen because it's fixed-length -- dankogai */ + if (check & ENCODE_WARN_ON_ERR) { + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", + *hv_fetch((HV *)SvRV(obj),"Name",4,0)); + } } - else { - SvCUR_set(str,0); - } - *SvEND(str) = '\0'; + if (check && !(check & ENCODE_LEAVE_SRC)) { + if (s < e) { + Move(s,SvPVX(str),e-s,U8); + SvCUR_set(str,(e-s)); + } + else { + SvCUR_set(str,0); + } + *SvEND(str) = '\0'; } + + if (!temp_result) + shrink_buffer(result); + XSRETURN(1); } @@ -222,75 +279,92 @@ SV * utf8 IV check CODE: { - U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); - int size = SvIV(attr("size", 4)); - int ucs2 = SvTRUE(attr("ucs2", 4)); - int renewed = SvTRUE(attr("renewed", 7)); - SV *result = newSVpvn("",0); + U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); + const int size = SvIV(attr("size", 4)); + int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ + const STRLEN usize = (size > 0 ? size : 1); + SV *result = newSVpvn("", 0); STRLEN ulen; - U8 *s = (U8 *)SvPVutf8(utf8,ulen); - U8 *e = (U8 *)SvEND(utf8); + U8 *s = (U8 *) SvPVutf8(utf8, ulen); + const U8 *e = (U8 *) SvEND(utf8); + /* Optimise for the common case of being called from PerlIOEncode_flush() + with a standard length buffer. In this case the result SV's buffer is + only used temporarily, so we can afford to allocate the maximum needed + and not care about unused space. */ + const bool temp_result = (ulen == PERLIO_BUFSIZ); + ST(0) = sv_2mortal(result); + + /* Preallocate the result buffer to the maximum possible size. + ie. assume each UTF8 byte is 1 character. + Then shrink the result's buffer if necesary at the end. */ + SvGROW(result, ((ulen+1) * usize)); + if (!endian) { - endian = (size == 4) ? 'N' : 'n'; - enc_pack(aTHX_ result,size,endian,BOM_BE); + endian = (size == 4) ? 'N' : 'n'; + enc_pack(aTHX_ result,size,endian,BOM_BE); #if 1 - /* Update endian for next sequence */ - if (renewed){ - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); - } + /* Update endian for next sequence */ + if (SvTRUE(attr("renewed", 7))) { + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + } #endif } while (s < e && s+UTF8SKIP(s) <= e) { - STRLEN len; - UV ord = utf8n_to_uvuni(s, e-s, &len, 0); - s += len; - if (size != 4 && invalid_ucs2(ord)) { - if (!issurrogate(ord)){ - if (ucs2) { - if (check) { - croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", - *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); - } - enc_pack(aTHX_ result,size,endian,FBCHAR); - }else{ - UV hi = ((ord - 0x10000) >> 10) + 0xD800; - UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; - enc_pack(aTHX_ result,size,endian,hi); - enc_pack(aTHX_ result,size,endian,lo); - } - } - else { - /* not supposed to happen */ - enc_pack(aTHX_ result,size,endian,FBCHAR); - } - } - else { - enc_pack(aTHX_ result,size,endian,ord); - } + STRLEN len; + UV ord = utf8n_to_uvuni(s, e-s, &len, 0); + s += len; + if (size != 4 && invalid_ucs2(ord)) { + if (!issurrogate(ord)) { + if (ucs2 == -1) { + ucs2 = SvTRUE(attr("ucs2", 4)); + } + if (ucs2) { + if (check) { + croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", + *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); + } + enc_pack(aTHX_ result,size,endian,FBCHAR); + } else { + UV hi = ((ord - 0x10000) >> 10) + 0xD800; + UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; + enc_pack(aTHX_ result,size,endian,hi); + enc_pack(aTHX_ result,size,endian,lo); + } + } + else { + /* not supposed to happen */ + enc_pack(aTHX_ result,size,endian,FBCHAR); + } + } + else { + enc_pack(aTHX_ result,size,endian,ord); + } } if (s < e) { - /* UTF-8 partial char happens often on PerlIO. - Since this is okay and normal, we do not warn. - But this is critical when you choose to LEAVE_SRC - in which case we die */ - if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){ - Perl_croak(aTHX_ "%"SVf":partial character is not allowed " - "when CHECK = 0x%" UVuf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); + /* UTF-8 partial char happens often on PerlIO. + Since this is okay and normal, we do not warn. + But this is critical when you choose to LEAVE_SRC + in which case we die */ + if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { + Perl_croak(aTHX_ "%"SVf":partial character is not allowed " + "when CHECK = 0x%" UVuf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); + } } - - } - if (check && !(check & ENCODE_LEAVE_SRC)){ - if (s < e) { - Move(s,SvPVX(utf8),e-s,U8); - SvCUR_set(utf8,(e-s)); + if (check && !(check & ENCODE_LEAVE_SRC)) { + if (s < e) { + Move(s,SvPVX(utf8),e-s,U8); + SvCUR_set(utf8,(e-s)); + } + else { + SvCUR_set(utf8,0); + } + *SvEND(utf8) = '\0'; } - else { - SvCUR_set(utf8,0); - } - *SvEND(utf8) = '\0'; - } + + if (!temp_result) + shrink_buffer(result); + XSRETURN(1); } - diff --git a/ext/Encode/bin/enc2xs b/ext/Encode/bin/enc2xs index c5cf8ee..233ca54 100644 --- a/ext/Encode/bin/enc2xs +++ b/ext/Encode/bin/enc2xs @@ -10,7 +10,7 @@ use warnings; use Getopt::Std; use Config; my @orig_ARGV = @ARGV; -our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # These may get re-ordered. # RAW is a do_now as inserted by &enter @@ -1351,17 +1351,17 @@ Encode/bin directory. =item * ICU Home Page -L +L =item * ICU Character Mapping Tables -L +L =item * ICU:Conversion Data -L +L =back diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index d02ca39..5fb12e4b 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -2,7 +2,7 @@ package Encode::Alias; use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.11 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; sub DEBUG () { 0 } use base qw(Exporter); @@ -135,7 +135,7 @@ sub init_aliases { define_alias( qr/^(.*)$/ => '"\L$1"' ); # UTF/UCS stuff - define_alias( qr/^UTF-?7$/i => '"UTF-7"' ); + define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' ); define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm index 1bc4df7..1ad7147 100644 --- a/ext/Encode/lib/Encode/Guess.pm +++ b/ext/Encode/lib/Encode/Guess.pm @@ -2,7 +2,7 @@ package Encode::Guess; use strict; use warnings; use Encode qw(:fallbacks find_encoding); -our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; my $Canon = 'Guess'; sub DEBUG () { 0 } @@ -68,7 +68,7 @@ sub guess { my $octet = shift; # sanity check - return unless defined $octet and length $octet; + return "Empty string, empty guess" unless defined $octet and length $octet; # cheat 0: utf8 flag; if ( Encode::is_utf8($octet) ) { diff --git a/ext/Encode/lib/Encode/MIME/Header.pm b/ext/Encode/lib/Encode/MIME/Header.pm index b664d88..4742a72 100644 --- a/ext/Encode/lib/Encode/MIME/Header.pm +++ b/ext/Encode/lib/Encode/MIME/Header.pm @@ -3,7 +3,7 @@ use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(find_encoding encode_utf8 decode_utf8); use MIME::Base64; use Carp; @@ -164,7 +164,7 @@ sub _encode { } $chunk .= $chr; } - $chunk and push @result, SINGLE->{$enc}($chunk); + length($chunk) and push @result, SINGLE->{$enc}($chunk); return @result; } diff --git a/ext/Encode/t/guess.t b/ext/Encode/t/guess.t index 5bfbf4e..707ca85 100644 --- a/ext/Encode/t/guess.t +++ b/ext/Encode/t/guess.t @@ -21,7 +21,7 @@ use File::Spec; use Encode qw(decode encode find_encoding _utf8_off); #use Test::More qw(no_plan); -use Test::More tests => 29; +use Test::More tests => 30; use_ok("Encode::Guess"); { no warnings; @@ -35,6 +35,7 @@ my $utf8off = $utf8on; _utf8_off($utf8off); my $utf16 = encode('UTF-16', $utf8on); my $utf32 = encode('UTF-32', $utf8on); +like(guess_encoding(''), qr/empty string/io, 'empty string'); is(guess_encoding($ascii)->name, 'ascii', 'ascii'); like(guess_encoding($latin1), qr/No appropriate encoding/io, 'no ascii'); is(guess_encoding($latin1, 'latin1')->name, 'iso-8859-1', 'iso-8859-1'); diff --git a/ext/Encode/t/mime-header.t b/ext/Encode/t/mime-header.t index e36e0ba..a69e176 100644 --- a/ext/Encode/t/mime-header.t +++ b/ext/Encode/t/mime-header.t @@ -1,5 +1,5 @@ # -# $Id: mime-header.t,v 2.3 2007/04/06 12:53:41 dankogai Exp $ +# $Id: mime-header.t,v 2.4 2009/01/21 22:55:07 dankogai Exp dankogai $ # This script is written in utf8 # BEGIN { @@ -23,7 +23,7 @@ no utf8; use strict; #use Test::More qw(no_plan); -use Test::More tests => 12; +use Test::More tests => 13; use_ok("Encode::MIME::Header"); my $eheader =<<'EOS'; @@ -119,4 +119,10 @@ is(Encode::encode('MIME-Q', $dheader), $qheader, "Double decode Q"); is(Encode::encode('MIME-Q', "\x{fc}"), '=?UTF-8?Q?=C3=BC?=', 'Encode latin1 characters'); +# RT42627 + +my $rt42627 = Encode::decode_utf8("\x{c2}\x{a3}xxxxxxxxxxxxxxxxxxx0"); +is(Encode::encode('MIME-Q', $rt42627), + '=?UTF-8?Q?=C2=A3xxxxxxxxxxxxxxxxxxx?==?UTF-8?Q?0?=', + 'MIME-Q encoding does not truncate trailing zeros'); __END__;