From: Jarkko Hietaniemi Date: Wed, 24 Apr 2002 20:20:53 +0000 (+0000) Subject: Upgrade to Encode 1.60, from Dan Kogai. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=af1f55d9c3a7b3b07efc4feaa402d004e3fc2106;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 1.60, from Dan Kogai. p4raw-id: //depot/perl@16145 --- diff --git a/MANIFEST b/MANIFEST index 433a728..d15f08b 100644 --- a/MANIFEST +++ b/MANIFEST @@ -240,12 +240,13 @@ ext/Encode/lib/Encode/CN/HZ.pm Encode extension ext/Encode/lib/Encode/Config.pm Encode configuration module ext/Encode/lib/Encode/Encoder.pm OO Encoder ext/Encode/lib/Encode/Encoding.pm Encode extension +ext/Encode/lib/Encode/Guess.pm Encode Extension ext/Encode/lib/Encode/JP/H2Z.pm Encode extension ext/Encode/lib/Encode/JP/JIS7.pm Encode extension ext/Encode/lib/Encode/KR/2022_KR.pm Encode extension +ext/Encode/lib/Encode/MIME/Header.pm Encode extension ext/Encode/lib/Encode/PerlIO.pod Documents for Encode & PerlIO ext/Encode/lib/Encode/Supported.pod Documents for supported encodings -ext/Encode/t/unibench.pl benchmark script ext/Encode/t/Aliases.t test script ext/Encode/t/CJKT.t test script ext/Encode/t/Encode.t test script @@ -262,6 +263,7 @@ ext/Encode/t/fallback.t test script ext/Encode/t/gb2312.enc test data ext/Encode/t/gb2312.utf test data ext/Encode/t/grow.t test script +ext/Encode/t/guess.t test script ext/Encode/t/jisx0201.enc test data ext/Encode/t/jisx0201.utf test data ext/Encode/t/jisx0208.enc test data @@ -271,7 +273,9 @@ ext/Encode/t/jisx0212.utf test data ext/Encode/t/jperl.t test script ext/Encode/t/ksc5601.enc test data ext/Encode/t/ksc5601.utf test data +ext/Encode/t/mime-header.t test script ext/Encode/t/perlio.t test script +ext/Encode/t/unibench.pl benchmark script ext/Encode/ucm/8859-1.ucm Unicode Character Map ext/Encode/ucm/8859-10.ucm Unicode Character Map ext/Encode/ucm/8859-11.ucm Unicode Character Map diff --git a/ext/Encode/CN/Makefile.PL b/ext/Encode/CN/Makefile.PL index 46b262d..775a8f5 100644 --- a/ext/Encode/CN/Makefile.PL +++ b/ext/Encode/CN/Makefile.PL @@ -1,6 +1,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; +use strict; my %tables = (euc_cn_t => ['euc-cn.ucm', 'cp936.ucm', @@ -11,6 +12,20 @@ my %tables = (euc_cn_t => ['euc-cn.ucm', ir_165_t => ['ir-165.ucm'], ); +unless ($ENV{AGGREGATE_TABLES}){ + my @ucm; + for my $k (keys %tables){ + push @ucm, @{$tables{$k}}; + } + %tables = (); + my $seq = 0; + for my $ucm (sort @ucm){ + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; + } +} + my $name = 'CN'; WriteMakefile( diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 77a5f04..314358e 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,34 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.58 2002/04/22 23:54:22 dankogai Exp $ +# $Id: Changes,v 1.60 2002/04/24 20:06:52 dankogai Exp $ # -$Revision: 1.58 $ $Date: 2002/04/22 23:54:22 $ +$Revision: 1.60 $ $Date: 2002/04/24 20:06:52 $ +! Encode.xs + "Thou shalt not assume %x works." -- jhi + Message-Id: <20020424210618.E24347@alpha.hut.fi> +! CN/Makefile.PL JP/Makefile.PL KR/Makefile.PL TW/Makefile.PL To make + low-memory build machines happy, now *.c is created for each *.ucm + (no table aggregation). You can still override this by setting + $ENV{AGGREGATE_TABLES}. + Message-Id: <00B1B3E4-579F-11D6-A441-00039301D480@dan.co.jp> ++ lib/Encode/Guess.pm ++ lib/Encode/JP/JIS7.pm + Encoding-autodetect (mainly for Japanese encoding) added. In a + course of development, JIS7.pm was improved. ++ lib/Encode/HTML/Header.pm ++ lib/Encode/Config.pm + MIME B/Q Header Encoding Added! +! Encode.pm Encode.xs t/fallback.t + new fallbacks; XMLCREF and HTMLCREF upon Bart's request. + Message-Id: <20020424130709.GA14211@tanglefoot> + +1.59 $ 2002/04/22 23:54:22 +! Encode.pm Encode.xs + needs_lines() and perlio_ok() are added to Internal encodings such + as utf8 so XML::SAX is happy. FB_* stub xsubs are now prototyped. + +1.58 2002/04/22 23:54:22 ! TW/TW.pm s/MacChineseSimp/MacChineseTrad/ # ... oops. ! bin/ucm2text @@ -467,7 +492,7 @@ $Revision: 1.58 $ $Date: 2002/04/22 23:54:22 $ Typo fixes and improvements by jhi Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al. -1.11 $Date: 2002/04/22 23:54:22 $ +1.11 $Date: 2002/04/24 20:06:52 $ + t/encoding.t + t/jperl.t ! MANIFEST diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index b03d93d..65ef50b 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,6 +1,6 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.58 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.60 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; use XSLoader (); XSLoader::load 'Encode'; @@ -15,8 +15,10 @@ our @EXPORT = qw( encodings find_encoding ); -our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ); -our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK); +our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC + PERLQQ HTMLCREF XMLCREF); +our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN + FB_PERLQQ FB_HTMLCREF FB_XMLCREF); our @EXPORT_OK = ( @@ -194,6 +196,11 @@ sub predefine_encodings{ package Encode::UTF_EBCDIC; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + }; *decode = sub{ my ($obj,$str,$chk) = @_; my $res = ''; @@ -221,6 +228,11 @@ sub predefine_encodings{ package Encode::Internal; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + }; *decode = sub{ my ($obj,$str,$chk) = @_; utf8::upgrade($str); @@ -237,6 +249,11 @@ sub predefine_encodings{ package Encode::utf8; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + }; *decode = sub{ my ($obj,$octets,$chk) = @_; my $str = Encode::decode_utf8($octets); @@ -539,6 +556,10 @@ you are debugging the mode above. =item perlqq mode (I = Encode::FB_PERLQQ) +=item HTML charref mode (I = Encode::FB_HTMLCREF) + +=item XML charref mode (I = Encode::FB_XMLCREF) + For encodings that are implemented by Encode::XS, CHECK == Encode::FB_PERLQQ turns (en|de)code into C fallback mode. @@ -548,6 +569,10 @@ decoded to utf8. And when you encode, '\x{I}' will be inserted, where I is the Unicode ID of the character that cannot be found in the character repertoire of the encoding. +HTML/XML character reference modes are about the same, in place of +\x{I}, HTML uses &#I<1234>; where I<1234> is a decimal digit and +XML uses &#xI; where I is the hexadecimal digit. + =item The bitmask These modes are actually set via a bitmask. Here is how the FB_XX @@ -561,6 +586,8 @@ constants via C. RETURN_ON_ERR 0x0004 X X LEAVE_SRC 0x0008 PERLQQ 0x0100 X + HTMLCREF 0x0200 + XMLCREF 0x0400 =head2 Unimplemented fallback schemes diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 1476456..a7e7c6a 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.34 2002/04/22 20:27:30 dankogai Exp $ + $Id: Encode.xs,v 1.38 2002/04/24 20:11:14 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -145,6 +145,18 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, sdone += slen + clen; ddone += dlen + SvCUR(perlqq); sv_catsv(dst, perlqq); + }else if (check & ENCODE_HTMLCREF){ + SV* htmlcref = + sv_2mortal(newSVpvf("&#%d;", ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(htmlcref); + sv_catsv(dst, htmlcref); + }else if (check & ENCODE_XMLCREF){ + SV* xmlcref = + sv_2mortal(newSVpvf("&#x%" UVxf ";", ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(xmlcref); + sv_catsv(dst, xmlcref); } else { /* fallback char */ sdone += slen + clen; @@ -168,7 +180,8 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, enc->name[0], (U8) s[slen], code); } goto ENCODE_SET_SRC; - }else if (check & ENCODE_PERLQQ){ + }else if (check & + (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ SV* perlqq = sv_2mortal(newSVpvf("\\x%02X", s[slen])); sdone += slen + 1; @@ -441,9 +454,6 @@ CODE: OUTPUT: RETVAL -PROTOTYPES: DISABLE - - int DIE_ON_ERR() CODE: @@ -480,6 +490,20 @@ OUTPUT: RETVAL int +HTMLCREF() +CODE: + RETVAL = ENCODE_HTMLCREF; +OUTPUT: + RETVAL + +int +XMLCREF() +CODE: + RETVAL = ENCODE_XMLCREF; +OUTPUT: + RETVAL + +int FB_DEFAULT() CODE: RETVAL = ENCODE_FB_DEFAULT; @@ -514,6 +538,20 @@ CODE: OUTPUT: RETVAL +int +FB_HTMLCREF() +CODE: + RETVAL = ENCODE_FB_HTMLCREF; +OUTPUT: + RETVAL + +int +FB_XMLCREF() +CODE: + RETVAL = ENCODE_FB_XMLCREF; +OUTPUT: + RETVAL + BOOT: { #include "def_t.h" diff --git a/ext/Encode/Encode/encode.h b/ext/Encode/Encode/encode.h index 04df7f9..b860578 100644 --- a/ext/Encode/Encode/encode.h +++ b/ext/Encode/Encode/encode.h @@ -94,11 +94,15 @@ extern void Encode_DefineEncoding(encode_t *enc); #define ENCODE_RETURN_ON_ERR 0x0004 /* immediately returns on NOREP */ #define ENCODE_LEAVE_SRC 0x0008 /* $src updated unless set */ #define ENCODE_PERLQQ 0x0100 /* perlqq fallback string */ +#define ENCODE_HTMLCREF 0x0200 /* HTML character ref. fb mode */ +#define ENCODE_XMLCREF 0x0400 /* XML character ref. fb mode */ #define ENCODE_FB_DEFAULT 0x0000 #define ENCODE_FB_CROAK 0x0001 #define ENCODE_FB_QUIET ENCODE_RETURN_ON_ERR #define ENCODE_FB_WARN (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR) #define ENCODE_FB_PERLQQ ENCODE_PERLQQ +#define ENCODE_FB_HTMLCREF ENCODE_HTMLCREF +#define ENCODE_FB_XMLCREF ENCODE_XMLCREF #endif /* ENCODE_H */ diff --git a/ext/Encode/JP/Makefile.PL b/ext/Encode/JP/Makefile.PL index ce47d2f..a1df35d 100644 --- a/ext/Encode/JP/Makefile.PL +++ b/ext/Encode/JP/Makefile.PL @@ -1,6 +1,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; +use strict; my %tables = ( euc_jp_t => ['euc-jp.ucm'], @@ -12,6 +13,20 @@ my %tables = ( ], ); +unless ($ENV{AGGREGATE_TABLES}){ + my @ucm; + for my $k (keys %tables){ + push @ucm, @{$tables{$k}}; + } + %tables = (); + my $seq = 0; + for my $ucm (sort @ucm){ + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; + } +} + my $name = 'JP'; WriteMakefile( diff --git a/ext/Encode/KR/Makefile.PL b/ext/Encode/KR/Makefile.PL index df0eeb6..4ba99ab 100644 --- a/ext/Encode/KR/Makefile.PL +++ b/ext/Encode/KR/Makefile.PL @@ -1,6 +1,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; +use strict; my %tables = (euc_kr_t => ['euc-kr.ucm', 'macKorean.ucm', @@ -10,6 +11,20 @@ my %tables = (euc_kr_t => ['euc-kr.ucm', johab_t => ['johab.ucm'], ); +unless ($ENV{AGGREGATE_TABLES}){ + my @ucm; + for my $k (keys %tables){ + push @ucm, @{$tables{$k}}; + } + %tables = (); + my $seq = 0; + for my $ucm (sort @ucm){ + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; + } +} + my $name = 'KR'; WriteMakefile( diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index 2a35d9f..cc6a141 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -42,12 +42,13 @@ lib/Encode/CN/HZ.pm Encode extension lib/Encode/Config.pm Encode configuration module lib/Encode/Encoder.pm OO Encoder lib/Encode/Encoding.pm Encode extension +lib/Encode/Guess.pm Encode Extension lib/Encode/JP/H2Z.pm Encode extension lib/Encode/JP/JIS7.pm Encode extension lib/Encode/KR/2022_KR.pm Encode extension +lib/Encode/MIME/Header.pm Encode extension lib/Encode/PerlIO.pod Documents for Encode & PerlIO lib/Encode/Supported.pod Documents for supported encodings -t/unibench.pl benchmark script t/Aliases.t test script t/CJKT.t test script t/Encode.t test script @@ -64,6 +65,7 @@ t/fallback.t test script t/gb2312.enc test data t/gb2312.utf test data t/grow.t test script +t/guess.t test script t/jisx0201.enc test data t/jisx0201.utf test data t/jisx0208.enc test data @@ -73,7 +75,9 @@ t/jisx0212.utf test data t/jperl.t test script t/ksc5601.enc test data t/ksc5601.utf test data +t/mime-header.t test script t/perlio.t test script +t/unibench.pl benchmark script ucm/8859-1.ucm Unicode Character Map ucm/8859-10.ucm Unicode Character Map ucm/8859-11.ucm Unicode Character Map diff --git a/ext/Encode/TW/Makefile.PL b/ext/Encode/TW/Makefile.PL index 4fdae9e..8f12a81 100644 --- a/ext/Encode/TW/Makefile.PL +++ b/ext/Encode/TW/Makefile.PL @@ -1,6 +1,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; +use strict; my %tables = (big5_t => ['big5-eten.ucm', 'big5-hkscs.ucm', @@ -8,6 +9,20 @@ my %tables = (big5_t => ['big5-eten.ucm', 'cp950.ucm'], ); +unless ($ENV{AGGREGATE_TABLES}){ + my @ucm; + for my $k (keys %tables){ + push @ucm, @{$tables{$k}}; + } + %tables = (); + my $seq = 0; + for my $ucm (sort @ucm){ + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; + } +} + my $name = 'TW'; WriteMakefile( diff --git a/ext/Encode/lib/Encode/Config.pm b/ext/Encode/lib/Encode/Config.pm index dcbc524..a834967 100644 --- a/ext/Encode/lib/Encode/Config.pm +++ b/ext/Encode/lib/Encode/Config.pm @@ -2,7 +2,7 @@ # Demand-load module list # package Encode::Config; -our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use strict; @@ -139,6 +139,11 @@ unless (ord("A") == 193){ #'big5plus' => 'Encode::HanExtra', #'euc-tw' => 'Encode::HanExtra', #'gb18030' => 'Encode::HanExtra', + + 'MIME-Header' => 'Encode::MIME::Header', + 'MIME-B' => 'Encode::MIME::Header', + 'MIME-Q' => 'Encode::MIME::Header', + ); } diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm new file mode 100644 index 0000000..e027e38 --- /dev/null +++ b/ext/Encode/lib/Encode/Guess.pm @@ -0,0 +1,95 @@ +package Encode::Guess; +use strict; +use Carp; +use Encode qw(:fallbacks find_encoding); +our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +my $Canon = 'Guess'; +$Encode::Encoding{$Canon} = bless { Name => $Canon } => __PACKAGE__; +our $DEBUG = 0; +our %DEF_CANDIDATES = + map { $_ => find_encoding($_) } qw(ascii utf8); +our %CANDIDATES; + + +sub import{ + my $class = shift; + %CANDIDATES = %DEF_CANDIDATES; + for my $c (@_){ + my $e = find_encoding($c) or die "Unknown encoding: $c"; + $CANDIDATES{$e->name} = $e; + $DEBUG and warn "Added: ", $e->name; + } +} + +sub name { shift->{'Name'} } +sub new_sequence { $_[0] } +sub needs_lines { 1 } +sub perlio_ok { 0 } + +sub decode($$;$){ + my ($obj, $octet, $chk) = @_; + my $utf8 = $obj->guess($octet)->decode($octet, $chk); + $_[1] = $octet if $chk; + return $utf8; +} + +sub encode{ + croak "Tsk, tsk, tsk. You can't be too lazy here here!"; +} + +sub guess { + my ($obj, $octet) = @_; + # cheat 1: utf8 flag; + Encode::is_utf8($octet) and return find_encoding('utf8'); + my %try = %CANDIDATES; + my $nline = 1; + for my $line (split /\r|\n|\r\n/, $octet){ + # cheat 2 -- escape + if ($line =~ /\e/o){ + my @keys = keys %try; + delete @try{qw/utf8 ascii/}; + for my $k (@keys){ + ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; + } + } + my %ok = %try; + # warn join(",", keys %try); + for my $k (keys %try){ + my $scratch = $line; + $try{$k}->decode($scratch, FB_QUIET); + if ($scratch eq ''){ + $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); + }else{ + use bytes (); + $DEBUG and + warn sprintf("%4d:%-24s not ok; %d bytes left\n", + $nline, $k, bytes::length($scratch)); + delete $ok{$k}; + + } + } + %ok or croak "No appropriate encodings found!"; + if (scalar(keys(%ok)) == 1){ + my ($retval) = values(%ok); + return $retval; + } + %try = %ok; $nline++; + } + unless ($try{ascii}){ + croak "Encodings too ambiguous: ", + join(" or ", keys %try); + } + return $try{ascii}; +} + + +1; +__END__ + +=head1 NAME + +Encode::Guess -- guesscoding! + +=cut + diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm index c0a0d06..09ec94f 100644 --- a/ext/Encode/lib/Encode/JP/JIS7.pm +++ b/ext/Encode/lib/Encode/JP/JIS7.pm @@ -1,7 +1,7 @@ package Encode::JP::JIS7; use strict; -our $VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.7 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -42,9 +42,13 @@ our $DEBUG = 0; sub decode($$;$) { - my ($obj,$str,$chk) = @_; - my $residue = jis_euc(\$str); - # This is for PerlIO + my ($obj, $str, $chk) = @_; + my $residue = ''; + if ($chk){ + $str =~ s/([^\x00-\x7f].*)$//so; + $1 and $residue = $1; + } + $residue .= jis_euc(\$str); $_[1] = $residue if $chk; return Encode::decode('euc-jp', $str, FB_PERLQQ); } diff --git a/ext/Encode/lib/Encode/MIME/Header.pm b/ext/Encode/lib/Encode/MIME/Header.pm new file mode 100644 index 0000000..51f0923 --- /dev/null +++ b/ext/Encode/lib/Encode/MIME/Header.pm @@ -0,0 +1,212 @@ +package Encode::MIME::Header; +use strict; +# use warnings; +our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +use Encode qw(find_encoding encode_utf8); +use MIME::Base64; +use Carp; + +my %seed = + ( + decode_b => '1', # decodes 'B' encoding ? + decode_q => '1', # decodes 'Q' encoding ? + encode => 'B', # encode with 'B' or 'Q' ? + bpl => 75, # bytes per line + ); + +$Encode::Encoding{'MIME-Header'} = + bless { + %seed, + Name => 'MIME-Header', + } => __PACKAGE__; + +$Encode::Encoding{'MIME-B'} = + bless { + %seed, + decode_q => 0, + Name => 'MIME-B', + } => __PACKAGE__; + +$Encode::Encoding{'MIME-Q'} = + bless { + %seed, + decode_q => 1, + encode => 'Q', + Name => 'MIME-Q', + } => __PACKAGE__; + +sub name { shift->{'Name'} } +sub new_sequence { $_[0] } +sub needs_lines { 1 } +sub perlio_ok{ 0 }; + +sub decode($$;$){ + use utf8; + my ($obj, $str, $chk) = @_; + # zap spaces between encoded words + $str =~ s/\?=\s+=\?/\?==\?/gos; + # multi-line header to single line + $str =~ s/(:?\r|\n|\r\n)[ \t]//gos; + $str =~ + s{ + =\? # begin encoded word + ([0-9A-Za-z\-]+) # charset (encoding) + \?([QqBb])\? # delimiter + (.*?) # Base64-encodede contents + \?= # end encoded word + }{ + if (uc($2) eq 'B'){ + $obj->{decode_b} or croak qq(MIME "B" unsupported); + decode_b($1, $3); + }elsif(uc($2) eq 'Q'){ + $obj->{decode_q} or croak qq(MIME "Q" unsupported); + decode_q($1, $3); + }else{ + croak qq(MIME "$2" encoding is nonexistent!); + } + }egox; + $_[1] = '' if $chk; + return $str; +} + +sub decode_b{ + my $enc = shift; + my $d = find_encoding($enc) or croak(Unknown encoding "$enc"); + my $db64 = decode_base64(shift); + return $d->decode($db64, Encode::FB_PERLQQ); +} + +sub decode_q{ + my ($enc, $q) = @_; + my $d = find_encoding($enc) or croak(Unknown encoding "$enc"); + $q =~ s/_/ /go; + $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; + return $d->decode($q, Encode::FB_PERLQQ); +} + +my $especials = + join('|' => + map {quotemeta(chr($_))} + unpack("C*", qq{()<>@,;:\"\'/[]?.=})); + +my $re_especials = qr/$especials/o; + +sub encode($$;$){ + my ($obj, $str, $chk) = @_; + my @line = (); + for my $line (split /\r|\n|\r\n/o, $str){ + my (@word, @subline); + for my $word (split /($re_especials)/o, $line){ + if ($word =~ /[^\x00-\x7f]/o){ + push @word, $obj->_encode($word); + }else{ + push @word, $word; + } + } + my $subline = ''; + for my $word (@word){ + use bytes (); + if (bytes::length($subline) + bytes::length($word) > $obj->{bpl}){ + push @subline, $subline; + $subline = ''; + } + $subline .= $word; + } + $subline and push @subline, $subline; + push @line, join("\n " => @subline); + } + $_[1] = '' if $chk; + return join("\n", @line); +} + +use constant HEAD => '=?UTF-8?'; +use constant TAIL => '?='; +use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, }; + +sub _encode{ + my ($o, $str) = @_; + my $enc = $o->{encode}; + my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL)); + $llen *= $enc eq 'B' ? 3/4 : 1/3; + my @result = (); + my $chunk = ''; + while(my $chr = substr($str, 0, 1, '')){ + use bytes (); + if (bytes::length($chunk) + bytes::length($chr) > $llen){ + push @result, SINGLE->{$enc}($chunk); + $chunk = ''; + } + $chunk .= $chr; + } + $chunk and push @result, SINGLE->{$enc}($chunk); + return @result; +} + +sub _encode_b{ + HEAD . 'B?' . encode_base64(encode_utf8(shift), '') . TAIL; +} + +sub _encode_q{ + my $chunk = shift; + $chunk =~ s{ + ([^0-9A-Za-z]) + }{ + join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) + }egox; + return HEAD . 'Q?' . $chunk . TAIL; +} + +1; +__END__ + +=head1 NAME + +Encode::MIME::Header -- MIME 'B' and 'Q' header encoding + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $utf8 = decode('MIME-Header', $header); + $header = encode('MIME-Header', $utf8); + +=head1 ABSTRACT + +This module implements RFC 2047 Mime Header Encoding. There are 3 +variant encoding names; C, C and C. The +difference is described below + + decode() encode() + ---------------------------------------------- + MIME-Header Both B and Q =?UTF-8?B?....?= + MIME-B B only; Q croaks =?UTF-8?B?....?= + MIME-Q Q only; B croaks =?UTF-8?Q?....?= + +=head1 DESCRIPTION + +When you decode(=?I?I?I?=), I +is extracted and decoded for I encoding (B for Base64, Q for +Quoted-Printable). Then the decoded chunk is fed to +decode(I). So long as I is supported by Encode, +any source encoding is fine. + +When you encode, it just encodes UTF-8 string with I encoding then +quoted with =?UTF-8?I?....?= . The parts that RFC 2047 forbids to +encode are left as is and long lines are folded within 76 bytes per +line. + +=head1 BUGS + +It would be nice to support non-UTF8 encoding, such as =?ISO-2022-JP? +and =?ISO-8859-1?= but that makes the implementation too complicated. +These days major mail agents all support =?UTF-8? so I think it is +just good enough. + +=head1 SEE ALSO + +L + +RFC 2047, L and many other +locations. + +=cut diff --git a/ext/Encode/t/fallback.t b/ext/Encode/t/fallback.t index cf867be..3b66258 100644 --- a/ext/Encode/t/fallback.t +++ b/ext/Encode/t/fallback.t @@ -13,17 +13,18 @@ BEGIN { use strict; #use Test::More qw(no_plan); -use Test::More tests => 15; +use Test::More tests => 19; use Encode q(:all); my $original = ''; my $nofallback = ''; -my ($fallenback, $quiet, $perlqq); +my ($fallenback, $quiet, $perlqq, $htmlcref, $xmlcref); for my $i (0x20..0x7e){ $original .= chr($i); } -$fallenback = $quiet = $perlqq = $nofallback = $original; +$fallenback = $quiet = +$perlqq = $htmlcref = $xmlcref = $nofallback = $original; my $residue = ''; for my $i (0x80..0xff){ @@ -31,6 +32,8 @@ for my $i (0x80..0xff){ $residue .= chr($i); $fallenback .= '?'; $perlqq .= sprintf("\\x{%04x}", $i); + $htmlcref .= sprintf("&#%d;", $i); + $xmlcref .= sprintf("&#x%x;", $i); } utf8::upgrade($original); my $meth = find_encoding('ascii'); @@ -75,3 +78,13 @@ $src = $original; $dst = $meth->encode($src, FB_PERLQQ); is($dst, $perlqq, "FB_PERLQQ"); is($src, '', "FB_PERLQQ residue"); + +$src = $original; +$dst = $meth->encode($src, FB_HTMLCREF); +is($dst, $htmlcref, "FB_HTMLCREF"); +is($src, '', "FB_HTMLCREF residue"); + +$src = $original; +$dst = $meth->encode($src, FB_XMLCREF); +is($dst, $xmlcref, "FB_XMLCREF"); +is($src, '', "FB_XMLCREF residue"); diff --git a/ext/Encode/t/guess.t b/ext/Encode/t/guess.t new file mode 100644 index 0000000..7b5d3ca --- /dev/null +++ b/ext/Encode/t/guess.t @@ -0,0 +1,64 @@ +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + $| = 1; +} + +use strict; +use File::Basename; +use File::Spec; +use Encode qw(decode encode find_encoding _utf8_off); + +#use Test::More qw(no_plan); +use Test::More tests => 11; +use_ok("Encode::Guess"); +{ + no warnings; + $Encode::Guess::DEBUG = shift || 0; +} + +my $ascii = join('' => map {chr($_)}(0x21..0x7e)); +my $latin1 = join('' => map {chr($_)}(0xa1..0xfe)); +my $utf8on = join('' => map {chr($_)}(0x3000..0x30fe)); +my $utf8off = $utf8on; _utf8_off($utf8off); + +is(Encode::Guess->guess($ascii)->name, 'ascii'); + +eval { Encode::Guess->guess($latin1) } ; +like($@, qr/No appropriate encoding/io); + +Encode::Guess->import(qw(latin1)); + +is(Encode::Guess->guess($latin1)->name, 'iso-8859-1'); +is(Encode::Guess->guess($utf8on)->name, 'utf8'); + +eval { Encode::Guess->guess($utf8off) }; +like($@, qr/ambiguous/io); + +my $jisx0201 = File::Spec->catfile(dirname(__FILE__), 'jisx0201.utf'); +my $jisx0208 = File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf'); +my $jisx0212 = File::Spec->catfile(dirname(__FILE__), 'jisx0212.utf'); + +open my $fh, $jisx0208 or die "$jisx0208: $!"; +$utf8off = join('' => <$fh>); +close $fh; +$utf8on = decode('utf8', $utf8off); +my @jp = qw(7bit-jis shiftjis euc-jp); + +Encode::Guess->import(@jp); + +for my $jp (@jp){ + my $test = encode($jp, $utf8on); + is(Encode::Guess->guess($test)->name, $jp, $jp); +} +is (decode('Guess', encode('euc-jp', $utf8on)), $utf8on, "decode('Guess')"); +eval{ encode('Guess', $utf8on) }; +like($@, qr/lazy/io, "no encode()"); +__END__; diff --git a/ext/Encode/t/mime-header.t b/ext/Encode/t/mime-header.t new file mode 100644 index 0000000..ad487c5 --- /dev/null +++ b/ext/Encode/t/mime-header.t @@ -0,0 +1,20 @@ +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + $| = 1; +} + +use strict; +use Test::More qw(no_plan); +#use Test::More tests => 19; +use_ok("Encode::MIME::Header"); + + +__END__;