From: Steve Peters Date: Wed, 25 Apr 2007 01:06:23 +0000 (+0000) Subject: Upgrade to Encode-2.20 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=44b3b9c710e4ffb3c3fb08c702ea4f981c927631;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode-2.20 p4raw-id: //depot/perl@31061 --- diff --git a/MANIFEST b/MANIFEST index 6c0959e..0dc43b8 100644 --- a/MANIFEST +++ b/MANIFEST @@ -478,6 +478,7 @@ ext/Encode/lib/Encode/CN/HZ.pm Encode extension ext/Encode/lib/Encode/Config.pm Encode configuration module ext/Encode/lib/Encode/Encoder.pm OO Encoder ext/Encode/lib/Encode/Encoding.pm Encode extension +ext/Encode/lib/Encode/GSM0338.pm Encode extension ext/Encode/lib/Encode/Guess.pm Encode Extension ext/Encode/lib/Encode/JP/H2Z.pm Encode extension ext/Encode/lib/Encode/JP/JIS7.pm Encode extension diff --git a/ext/Encode/Byte/Byte.pm b/ext/Encode/Byte/Byte.pm index 0824368..3ea9035 100644 --- a/ext/Encode/Byte/Byte.pm +++ b/ext/Encode/Byte/Byte.pm @@ -2,7 +2,7 @@ package Encode::Byte; use strict; use warnings; use Encode; -our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; XSLoader::load( __PACKAGE__, $VERSION ); @@ -107,7 +107,6 @@ supported are as follows. # More vendor encodings AdobeStandardEncoding nextstep - gsm0338 # used in GSM handsets hp-roman8 =head1 DESCRIPTION diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 5039a73..0d3d9a4 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,8 +1,23 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 2.19 2007/04/06 12:53:41 dankogai Exp dankogai $ +# $Id: Changes,v 2.20 2007/04/22 14:56:12 dankogai Exp dankogai $ # -$Revision: 2.19 $ $Date: 2007/04/06 12:53:41 $ +$Revision: 2.20 $ $Date: 2007/04/22 14:56:12 $ +! Encode.pm + Pod fixes. Now find_encoding() is explained more in details. ++ lib/Encode/GSM0338.pm +- ucm/gsm0338.ucm +! lib/Encode/Supported.pod lib/Encode/Config.pm Bytes/Makefile.PL t/gsm0338.t + ESTI GSM 03.38 support is relocated from Encode::Byte to Encode::GSM0338. + This encoding is so kaputt it is unfit for Encode::XS! + Though it was okay for general cases and escape sequences, + '\0' => '@' IFF '\0\0' => '\0' had gliches. + So kaputt even t/gsm0338 wrongly interpreted that. + ref. http://www.csoft.co.uk/sms/character_sets/gsm.htm +! encoding.pm t/Aliases.t + Imported from bleedperl #31015 + +2.19 2007/04/06 12:53:41 ! lib/Encode/JP/JIS7.pm + t/jis7-fallback.t encode('iso-2022-jp') fallback support added by MIYAGAWA++ diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 8b0f4a6..c52e7c4 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,10 +1,10 @@ # -# $Id: Encode.pm,v 2.19 2007/04/06 12:53:41 dankogai Exp dankogai $ +# $Id: Encode.pm,v 2.20 2007/04/22 14:56:12 dankogai Exp dankogai $ # package Encode; use strict; use warnings; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.19 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.20 $ =~ /(\d+)/g; sub DEBUG () { 0 } use XSLoader (); XSLoader::load( __PACKAGE__, $VERSION ); @@ -405,11 +405,11 @@ iso-8859-1 (also known as Latin1), $octets = encode("iso-8859-1", $string); -B: When you run C<$octets = encode("utf8", $string)>, then $octets -B $string. Though they both contain the same data, the UTF8 flag -for $octets is B off. When you encode anything, UTF8 flag of -the result is always off, even when it contains completely valid utf8 -string. See L below. +B: When you run C<$octets = encode("utf8", $string)>, then +$octets B $string. Though they both contain the +same data, the UTF8 flag for $octets is B off. When you +encode anything, UTF8 flag of the result is always off, even when it +contains completely valid utf8 string. See L below. If the $string is C then C is returned. @@ -433,6 +433,41 @@ below. If the $string is C then C is returned. +=item [$obj =] find_encoding(ENCODING) + +Returns the I corresponding to ENCODING. Returns +undef if no matching ENCODING is find. + +This object is what actually does the actual (en|de)coding. + + $utf8 = decode($name, $bytes); + +is in fact + + $utf8 = do{ + $obj = find_encoding($name); + croak qq(encoding "$name" not found) unless ref $obj; + $obj->decode($bytes) + }; + +with more error checking. + +Therefore you can save time by reusing this object as follows; + + my $enc = find_encoding("iso-8859-1"); + while(<>){ + my $utf8 = $enc->decode($_); + # and do someting with $utf8; + } + +Besides C<< ->decode >> and C<< ->encode >>, other methods are +available as well. For instance, C<< -> name >> returns the canonical +name of the encoding object. + + find_encoding("latin1")->name; # iso-8859-1 + +See L for details. + =item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) Converts B data between two encodings. The data in $octets @@ -532,9 +567,9 @@ See L for details. =head1 Encoding via PerlIO -If your perl supports I (which is the default), you can use a PerlIO layer to decode -and encode directly via a filehandle. The following two examples -are totally identical in their functionality. +If your perl supports I (which is the default), you can use a +PerlIO layer to decode and encode directly via a filehandle. The +following two examples are totally identical in their functionality. # via PerlIO open my $in, "<:encoding(shiftjis)", $infile or die; @@ -659,13 +694,17 @@ constants via C. =back +=over 2 + =item Encode::LEAVE_SRC If the C bit is not set, but I is, then the second argument to C or C may be assigned to by the functions. If you're not interested in this, then bitwise-or the bitmask with it. -=head2 coderef for CHECK +=back + +=Head2 coderef for CHECK As of Encode 2.12 CHECK can also be a code reference which takes the ord value of unmapped caharacter as an argument and returns a string diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 30ede3f..fe645b6 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 2.11 2007/04/06 12:53:41 dankogai Exp dankogai $ + $Id: Encode.xs,v 2.11 2007/04/06 12:53:41 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT diff --git a/ext/Encode/bin/piconv b/ext/Encode/bin/piconv index 37dd153..840bf3e 100644 --- a/ext/Encode/bin/piconv +++ b/ext/Encode/bin/piconv @@ -1,5 +1,5 @@ #!./perl -# $Id: piconv,v 2.3 2007/04/06 12:53:41 dankogai Exp dankogai $ +# $Id: piconv,v 2.3 2007/04/06 12:53:41 dankogai Exp $ # use 5.8.0; use strict; diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index fff7adb..7b8eee4 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,6 +1,6 @@ -# $Id: encoding.pm,v 2.5 2007/04/06 12:53:41 dankogai Exp dankogai $ +# $Id: encoding.pm,v 2.6 2007/04/22 14:56:12 dankogai Exp dankogai $ package encoding; -our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode; use strict; @@ -323,6 +323,14 @@ always the same as the length of C<$/> in the native encoding. This pragma affects utf8::upgrade, but not utf8::downgrade. +=head2 Side effects + +If the C pragma is in scope then the lengths returned are +calculated from the length of C<$/> in Unicode characters, which is not +always the same as the length of C<$/> in the native encoding. + +This pragma affects utf8::upgrade, but not utf8::downgrade. + =head1 FEATURES THAT REQUIRE 5.8.1 Some of the features offered by this pragma requires perl 5.8.1. Most diff --git a/ext/Encode/lib/Encode/Config.pm b/ext/Encode/lib/Encode/Config.pm index e6ca64d..c9f431b 100644 --- a/ext/Encode/lib/Encode/Config.pm +++ b/ext/Encode/lib/Encode/Config.pm @@ -2,7 +2,7 @@ # Demand-load module list # package Encode::Config; -our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use strict; use warnings; @@ -75,7 +75,8 @@ our %ExtModule = ( 'MacUkrainian' => 'Encode::Byte', 'nextstep' => 'Encode::Byte', 'hp-roman8' => 'Encode::Byte', - 'gsm0338' => 'Encode::Byte', + #'gsm0338' => 'Encode::Byte', + 'gsm0338' => 'Encode::GSM0338', # Encode::EBCDIC 'cp37' => 'Encode::EBCDIC', diff --git a/ext/Encode/lib/Encode/GSM0338.pm b/ext/Encode/lib/Encode/GSM0338.pm new file mode 100644 index 0000000..b417809 --- /dev/null +++ b/ext/Encode/lib/Encode/GSM0338.pm @@ -0,0 +1,288 @@ +# +# $Id: GSM0338.pm,v 2.0 2007/04/22 14:54:22 dankogai Exp $ +# +package Encode::GSM0338; + +use strict; +use warnings; +use Carp; + +use vars qw($VERSION); +$VERSION = do { my @r = ( q$Revision: 2.0 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Encode qw(:fallbacks); + +use base qw(Encode::Encoding); +__PACKAGE__->Define('gsm0338'); + +sub needs_lines { 1 } +sub perlio_ok { 0 } + +use utf8; +our %UNI2GSM = ( + "\x{0040}" => "\x00", # COMMERCIAL AT + "\x{000A}" => "\x0A", # LINE FEED + "\x{000C}" => "\x1B\x0A", # FORM FEED + "\x{000D}" => "\x0D", # CARRIAGE RETURN + "\x{0020}" => "\x20", # SPACE + "\x{0021}" => "\x21", # EXCLAMATION MARK + "\x{0022}" => "\x22", # QUOTATION MARK + "\x{0023}" => "\x23", # NUMBER SIGN + "\x{0024}" => "\x02", # DOLLAR SIGN + "\x{0025}" => "\x25", # PERCENT SIGN + "\x{0026}" => "\x26", # AMPERSAND + "\x{0027}" => "\x27", # APOSTROPHE + "\x{0028}" => "\x28", # LEFT PARENTHESIS + "\x{0029}" => "\x29", # RIGHT PARENTHESIS + "\x{002A}" => "\x2A", # ASTERISK + "\x{002B}" => "\x2B", # PLUS SIGN + "\x{002C}" => "\x2C", # COMMA + "\x{002D}" => "\x2D", # HYPHEN-MINUS + "\x{002E}" => "\x2E", # FULL STOP + "\x{002F}" => "\x2F", # SOLIDUS + "\x{0030}" => "\x30", # DIGIT ZERO + "\x{0031}" => "\x31", # DIGIT ONE + "\x{0032}" => "\x32", # DIGIT TWO + "\x{0033}" => "\x33", # DIGIT THREE + "\x{0034}" => "\x34", # DIGIT FOUR + "\x{0035}" => "\x35", # DIGIT FIVE + "\x{0036}" => "\x36", # DIGIT SIX + "\x{0037}" => "\x37", # DIGIT SEVEN + "\x{0038}" => "\x38", # DIGIT EIGHT + "\x{0039}" => "\x39", # DIGIT NINE + "\x{003A}" => "\x3A", # COLON + "\x{003B}" => "\x3B", # SEMICOLON + "\x{003C}" => "\x3C", # LESS-THAN SIGN + "\x{003D}" => "\x3D", # EQUALS SIGN + "\x{003E}" => "\x3E", # GREATER-THAN SIGN + "\x{003F}" => "\x3F", # QUESTION MARK + "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A + "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B + "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C + "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D + "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E + "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F + "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G + "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H + "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I + "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J + "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K + "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L + "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M + "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N + "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O + "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P + "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q + "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R + "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S + "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T + "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U + "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V + "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W + "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X + "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y + "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z + "\x{005F}" => "\x11", # LOW LINE + "\x{0061}" => "\x61", # LATIN SMALL LETTER A + "\x{0062}" => "\x62", # LATIN SMALL LETTER B + "\x{0063}" => "\x63", # LATIN SMALL LETTER C + "\x{0064}" => "\x64", # LATIN SMALL LETTER D + "\x{0065}" => "\x65", # LATIN SMALL LETTER E + "\x{0066}" => "\x66", # LATIN SMALL LETTER F + "\x{0067}" => "\x67", # LATIN SMALL LETTER G + "\x{0068}" => "\x68", # LATIN SMALL LETTER H + "\x{0069}" => "\x69", # LATIN SMALL LETTER I + "\x{006A}" => "\x6A", # LATIN SMALL LETTER J + "\x{006B}" => "\x6B", # LATIN SMALL LETTER K + "\x{006C}" => "\x6C", # LATIN SMALL LETTER L + "\x{006D}" => "\x6D", # LATIN SMALL LETTER M + "\x{006E}" => "\x6E", # LATIN SMALL LETTER N + "\x{006F}" => "\x6F", # LATIN SMALL LETTER O + "\x{0070}" => "\x70", # LATIN SMALL LETTER P + "\x{0071}" => "\x71", # LATIN SMALL LETTER Q + "\x{0072}" => "\x72", # LATIN SMALL LETTER R + "\x{0073}" => "\x73", # LATIN SMALL LETTER S + "\x{0074}" => "\x74", # LATIN SMALL LETTER T + "\x{0075}" => "\x75", # LATIN SMALL LETTER U + "\x{0076}" => "\x76", # LATIN SMALL LETTER V + "\x{0077}" => "\x77", # LATIN SMALL LETTER W + "\x{0078}" => "\x78", # LATIN SMALL LETTER X + "\x{0079}" => "\x79", # LATIN SMALL LETTER Y + "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z + "\x{000C}" => "\x1B\x0A", # FORM FEED + "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET + "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS + "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET + "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT + "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET + "\x{007C}" => "\x1B\x40", # VERTICAL LINE + "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET + "\x{007E}" => "\x1B\x3D", # TILDE + "\x{00A0}" => "\x1B", # NO-BREAK SPACE + "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK + "\x{00A3}" => "\x01", # POUND SIGN + "\x{00A4}" => "\x24", # CURRENCY SIGN + "\x{00A5}" => "\x03", # YEN SIGN + "\x{00A7}" => "\x5F", # SECTION SIGN + "\x{00BF}" => "\x60", # INVERTED QUESTION MARK + "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS + "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE + "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE + "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE + "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE + "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS + "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE + "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS + "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S + "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE + "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS + "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE + "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE + "\x{00E7}" => "\x09", # LATIN SMALL LETTER C WITH CEDILLA + "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE + "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE + "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE + "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE + "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE + "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS + "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE + "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE + "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS + "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA + "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA + "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA + "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA + "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI + "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI + "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA + "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI + "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI + "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA + "\x{20AC}" => "\x1B\x65", # EURO SIGN +); +our %GSM2UNI = reverse %UNI2GSM; +our $ESC = "\x1b"; +our $ATMARK = "\x40"; +our $FBCHAR = "\x3F"; +our $NBSP = "\x{00A0}"; + +#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" + +sub decode ($$;$) { + my ( $obj, $bytes, $chk ) = @_; + my $str; + while ( length $bytes ) { + my $c = substr( $bytes, 0, 1, '' ); + my $u; + if ( $c eq "\x00" ) { + my $c2 = substr( $bytes, 0, 1, '' ); + $u = + !length $c2 ? $ATMARK + : $c2 eq "\x00" ? "\x{0000}" + : exists $GSM2UNI{$c2} ? $ATMARK . $GSM2UNI{$c2} + : $chk + ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", + ord($c), ord($c2) ) + : $ATMARK . $FBCHAR; + + } + elsif ( $c eq $ESC ) { + my $c2 = substr( $bytes, 0, 1, '' ); + $u = + exists $GSM2UNI{ $c . $c2 } ? $GSM2UNI{ $c . $c2 } + : exists $GSM2UNI{$c2} ? $NBSP . $GSM2UNI{$c2} + : $chk + ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", + ord($c), ord($c2) ) + : $NBSP . $FBCHAR; + } + else { + $u = + exists $GSM2UNI{$c} ? $GSM2UNI{$c} + : $chk + ? croak sprintf( "\\x%02X does not map to Unicode", ord($c) ) + : $FBCHAR; + } + $str .= $u; + } + $_[1] = $bytes if $chk; + return $str; +} + +#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" + +sub encode($$;$) { + my ( $obj, $str, $chk ) = @_; + my $bytes; + while ( length $str ) { + my $u = substr( $str, 0, 1, '' ); + my $c; + $bytes .= + exists $UNI2GSM{$u} ? $UNI2GSM{$u} + : $chk + ? croak sprintf( "\\x{%04x} does not map to %s", + ord($u), $obj->name ) + : $FBCHAR; + } + $_[1] = $str if $chk; + return $bytes; +} + +1; +__END__ + +=head1 NAME + +Encode::GSM0338 -- ESTI GSM 03.38 Encoding + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $gsm0338 = encode("gsm0338", $utf8); # loads Encode::GSM0338 implicitly + $utf8 = decode("gsm0338", $gsm0338); # ditto + +=head1 DESCRIPTION + +GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII, +control character ranges and other parts are mapped very differently, +mainly to store Greek characters. There are also escape sequences +(starting with 0x1B) to cover e.g. the Euro sign. + +This was once handled by L but because of all those +unusual specifications, Encode 2.20 has relocated the support to +this module. + +=head1 NOTES + +Unlike most other encodings, the following aways croaks on error +for any $chk that evaluates to true. + + $gsm0338 = encode("gsm0338", $utf8 $chk); + $utf8 = decode("gsm0338", $gsm0338, $chk); + +So if you want to check the validity of the encoding, surround the +expression with C block as follows; + + eval { + $utf8 = decode("gsm0338", $gsm0338, $chk); + }; + if ($@){ + # handle exception here + } + +=head1 BUGS + +ESTI GSM 03.38 Encoding itself. + +Mapping \x00 to '@' causes too much pain everywhere. + +Its use of \x1b (escape) is also very questionable. + +Because of those two, the code paging approach used use in ucm-based +Encoding SOMETIMES fails so this module was written. + +=head1 SEE ALSO + +L + +=cut diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index 651f7e6..431bb77 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -12,7 +12,7 @@ Each encoding has one "canonical" name. The "canonical" name is chosen from the names of the encoding by picking the first in the following sequence (with a few exceptions). -=over 4 +=over 2 =item * @@ -103,7 +103,7 @@ Symbols and EBCDIC. The following encodings are based on single-byte encodings implemented as extended ASCII. Most of them map \x80-\xff (upper half) to non-ASCII characters. -=over 4 +=over 2 =item ISO-8859 and corresponding vendor mappings @@ -172,13 +172,24 @@ For gory details, see L koi8-u [RFC2319] ---------------------------------------------------------------- -=item gsm0338 - Hentai Latin 1 +=back + +=head2 gsm0338 - Hentai Latin 1 GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII, control character ranges and other parts are mapped very differently, mainly to store Greek characters. There are also escape -sequences (starting with 0x1B) to cover e.g. the Euro sign. Some -special cases like a trailing 0x00 byte or a lone 0x1B byte are not +sequences (starting with 0x1B) to cover e.g. the Euro sign. + +This was once handled by L but because of all those +unusual specifications, Encode 2.20 has relocated the support to +L. See L for details. + +=over 2 + +=item gsm0338 support before 2.19 + +Some special cases like a trailing 0x00 byte or a lone 0x1B byte are not well-defined and decode() will return an empty string for them. One possible workaround is @@ -204,7 +215,7 @@ countries, due to the size concerns (simplified Chinese is mapped to 'CN', continental China, while traditional Chinese is mapped to 'TW', Taiwan). Please refer to their respective documentation pages. -=over 4 +=over 2 =item Encode::CN -- Continental China @@ -289,7 +300,7 @@ distributed separately on CPAN, under the name Encode::JIS2K. =head2 Miscellaneous encodings -=over 4 +=over 2 =item Encode::EBCDIC @@ -342,7 +353,7 @@ The following encodings are not supported as yet; some because they are rarely used, some because of technical difficulties. They may be supported by external modules via CPAN in the future, however. -=over 4 +=over 2 =item ISO-2022-JP-2 [RFC1554] @@ -435,7 +446,7 @@ needed, we need to differentiate I and I. To understand that, here is a description of how we make computers grok our characters. -=over 4 +=over 2 =item * @@ -474,7 +485,7 @@ Technically, or mathematically, speaking, a character set encoded in such a CES that maps character by character may form a CCS. EUC is such an example. The CES of EUC is as follows: -=over 4 +=over 2 =item * @@ -511,7 +522,7 @@ applicability for information exchange over the Internet and to choose the most suitable aliases to name them in the context of such communication. -=over 4 +=over 2 =item * @@ -559,7 +570,7 @@ are IANA-registered Cs. See [RFC 2781] for details. Jungshik Shin reports that UTF-16 with a BOM is well accepted by MS IE 5/6 and NS 4/6. Beware however that -=over 4 +=over 2 =item * @@ -608,7 +619,7 @@ is a proprietary name. Microsoft products misuse the following names: -=over 4 +=over 2 =item KS_C_5601-1987 @@ -673,7 +684,7 @@ Encode separately supports C and C. =head1 Glossary -=over 4 +=over 2 =item character repertoire @@ -764,14 +775,14 @@ L, L =head1 References -=over 4 +=over 2 =item ECMA European Computer Manufacturers Association L -=over 4 +=over 2 =item ECMA-035 (eq C) @@ -786,7 +797,7 @@ The specification of ISO-2022 is available from the link above. Internet Assigned Numbers Authority L -=over 4 +=over 2 =item Assigned Charset Names by IANA @@ -814,7 +825,7 @@ L Unicode Consortium L -=over 4 +=over 2 =item Unicode Glossary @@ -828,7 +839,7 @@ The glossary of this document is based upon this site. =head2 Other Notable Sites -=over 4 +=over 2 =item czyborra.com @@ -867,7 +878,7 @@ L =head2 Offline sources -=over 4 +=over 2 =item C by Ken Lunde diff --git a/ext/Encode/t/gsm0338.t b/ext/Encode/t/gsm0338.t index 6066d7a..822ff4f 100644 --- a/ext/Encode/t/gsm0338.t +++ b/ext/Encode/t/gsm0338.t @@ -12,8 +12,10 @@ BEGIN { } use strict; -use Test::More tests => 21; +use utf8; +use Test::More tests => 778; use Encode; +use Encode::GSM0338; # The specification of GSM 03.38 is not awfully clear. # (http://www.unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT) @@ -21,7 +23,82 @@ use Encode; # are unclear, as is the semantics of those bytes as standalone # or as final single bytes. -sub t { is(decode("gsm0338", my $t = $_[0]), $_[1]) } + +my $chk = Encode::LEAVE_SRC(); + +# escapes +# see http://www.csoft.co.uk/sms/character_sets/gsm.htm +my %esc_seq = ( + "\x{20ac}" => "\x1b\x65", + "\x0c" => "\x1b\x0A", + "[" => "\x1b\x3C", + "\\" => "\x1b\x2F", + "]" => "\x1b\x3E", + "^" => "\x1b\x14", + "{" => "\x1b\x28", + "|" => "\x1b\x40", + "}" => "\x1b\x29", + "~" => "\x1b\x3D", +); + +my %unesc_seq = reverse %esc_seq; + + +sub eu{ + $_[0] =~ /[\x00-\x1f]/ ? + sprintf("\\x{%04X}", ord($_[0])) : encode_utf8($_[0]); + +} + +for my $c ( map { chr } 0 .. 127 ) { + my $u = $Encode::GSM0338::GSM2UNI{$c}; + + # default character set + is decode( "gsm0338", $c, $chk ), $u, + sprintf( "decode \\x%02X", ord($c) ); + eval { decode( "gsm0338", $c . "\xff", $chk ) }; + ok( $@, $@ ); + is encode( "gsm0338", $u, $chk ), $c, sprintf( "encode %s", eu($u) ); + eval { encode( "gsm0338", $u . "\x{3000}", $chk ) }; + ok( $@, $@ ); + + # nasty atmark + if ( $c eq "\x00" ) { + is decode( "gsm0338", "\x00" . $c, $chk ), "\x00", + sprintf( '@@ =>: \x00+\x%02X', ord($c) ); + } + else { + is decode( "gsm0338", "\x00" . $c ), '@' . decode( "gsm0338", $c ), + sprintf( '@: decode \x00+\x%02X', ord($c) ); + } + + # escape seq. + my $ecs = "\x1b" . $c; + if ( $unesc_seq{$ecs} ) { + is decode( "gsm0338", $ecs, $chk ), $unesc_seq{$ecs}, + sprintf( "ESC: decode ESC+\\x%02X", ord($c) ); + is encode( "gsm0338", $unesc_seq{$ecs}, $chk ), $ecs, + sprintf( "ESC: encode %s ", eu( $unesc_seq{$ecs} ) ); + } + else { + is decode( "gsm0338", $ecs, $chk ), + "\xA0" . decode( "gsm0338", $c ), + sprintf( "decode ESC+\\x%02X", ord($c) ); + } +} + +__END__ +for my $c (map { chr } 0..127){ + my $b = "\x1b$c"; + my $u = $Encode::GSM0338::GSM2UNI{$b}; + next unless $u; + $u ||= "\xA0" . $Encode::GSM0338::GSM2UNI{$c}; + is decode("gsm0338", $b), $u, sprintf("decode ESC+\\x%02X", ord($c) ); +} + +__END__ +# old test follows +ub t { is(decode("gsm0338", my $t = $_[0]), $_[1]) } # t("\x00", "\x00"); # ??? @@ -56,7 +133,3 @@ t("\x1B\x3E", "\x5D"); t("\x1B\x40", "\x7C"); t("\x1B\x40", "\x7C"); t("\x1B\x65", "\x{20AC}"); - - - - diff --git a/ext/Encode/t/mime-header.t b/ext/Encode/t/mime-header.t index 9c63630..e36e0ba 100644 --- a/ext/Encode/t/mime-header.t +++ b/ext/Encode/t/mime-header.t @@ -1,5 +1,5 @@ # -# $Id: mime-header.t,v 2.3 2007/04/06 12:53:41 dankogai Exp dankogai $ +# $Id: mime-header.t,v 2.3 2007/04/06 12:53:41 dankogai Exp $ # This script is written in utf8 # BEGIN {