Upgrade to Encode 2.11, plus a patch to PerlIO::encoding
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
1485817e 3no warnings 'redefine';
5129552c 4use Encode;
56ff7374 5our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8f139f4c 6sub DEBUG () { 0 }
5d030b67 7
10c5ecbb 8use base qw(Exporter);
5d030b67 9
10# Public, encouraged API is exported by default
5129552c 11
fcb875d4 12our @EXPORT =
5129552c 13 qw (
14 define_alias
15 find_alias
16 );
5d030b67 17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
8f1ed24a 21sub find_alias{
5d030b67 22 my $class = shift;
151b5d36 23 my $find = shift;
8f1ed24a 24 unless (exists $Alias{$find}) {
151b5d36 25 $Alias{$find} = undef; # Recursion guard
8f1ed24a 26 for (my $i=0; $i < @Alias; $i += 2){
5d030b67 27 my $alias = $Alias[$i];
28 my $val = $Alias[$i+1];
29 my $new;
8f1ed24a 30 if (ref($alias) eq 'Regexp' && $find =~ $alias){
8f139f4c 31 DEBUG and warn "eval $val";
5d030b67 32 $new = eval $val;
8f139f4c 33 DEBUG and $@ and warn "$val, $@";
8f1ed24a 34 }elsif (ref($alias) eq 'CODE'){
8f139f4c 35 DEBUG and warn "$alias", "->", "($find)";
151b5d36 36 $new = $alias->($find);
8f1ed24a 37 }elsif (lc($find) eq lc($alias)){
5d030b67 38 $new = $val;
39 }
8f1ed24a 40 if (defined($new)){
151b5d36 41 next if $new eq $find; # avoid (direct) recursion on bugs
8f139f4c 42 DEBUG and warn "$alias, $new";
5d030b67 43 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
8f1ed24a 44 if ($enc){
151b5d36 45 $Alias{$find} = $enc;
5d030b67 46 last;
47 }
48 }
49 }
8f1ed24a 50 # case insensitive search when canonical is not in all lowercase
51 # RT ticket #7835
52 unless ($Alias{$find}){
53 my $lcfind = lc($find);
54 for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){
55 $lcfind eq lc($name) or next;
56 $Alias{$find} = Encode::find_encoding($name);
57 DEBUG and warn "$find => $name";
58 }
59 }
5d030b67 60 }
8f139f4c 61 if (DEBUG){
a63c962f 62 my $name;
151b5d36 63 if (my $e = $Alias{$find}){
a63c962f 64 $name = $e->name;
65 }else{
66 $name = "";
67 }
151b5d36 68 warn "find_alias($class, $find)->name = $name";
a63c962f 69 }
151b5d36 70 return $Alias{$find};
5d030b67 71}
72
8f1ed24a 73sub define_alias{
74 while (@_){
5d030b67 75 my ($alias,$name) = splice(@_,0,2);
76 unshift(@Alias, $alias => $name); # newer one has precedence
5d030b67 77 if (ref($alias)){
8f1ed24a 78 # clear %Alias cache to allow overrides
5129552c 79 my @a = keys %Alias;
80 for my $k (@a){
8f1ed24a 81 if (ref($alias) eq 'Regexp' && $k =~ $alias){
8f139f4c 82 DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 83 delete $Alias{$k};
84 }
8f1ed24a 85 elsif (ref($alias) eq 'CODE'){
8f139f4c 86 DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 87 delete $Alias{$alias->($name)};
88 }
89 }
90 }else{
8f139f4c 91 DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67 92 delete $Alias{$alias};
93 }
94 }
95}
96
5d030b67 97# Allow latin-1 style names as well
8f1ed24a 98# 0 1 2 3 4 5 6 7 8 9 10
5d030b67 99our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 100# Allow winlatin1 style names as well
101our %Winlatin2cp = (
102 'latin1' => 1252,
103 'latin2' => 1250,
104 'cyrillic' => 1251,
105 'greek' => 1253,
106 'turkish' => 1254,
107 'hebrew' => 1255,
108 'arabic' => 1256,
109 'baltic' => 1257,
110 'vietnamese' => 1258,
111 );
112
5129552c 113init_aliases();
114
115sub undef_aliases{
116 @Alias = ();
117 %Alias = ();
118}
119
120sub init_aliases
121{
122 undef_aliases();
f2a2953c 123 # Try all-lower-case version should all else fails
a999c27c 124 define_alias( qr/^(.*)$/ => '"\L$1"' );
125
f2a2953c 126 # UTF/UCS stuff
1485817e 127 define_alias( qr/^UTF-?7$/i => '"UTF-7"');
11067275 128 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
129 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
126bf8bf 130 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
11067275 131 qr/^iso-10646-1$/i => '"UCS-2BE"' );
8f1ed24a 132 define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
133 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
134 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
f2a2953c 135 );
136 # ASCII
a999c27c 137 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
a999c27c 138 define_alias('C' => 'ascii');
2d06ad02 139 define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
67d7b5ef 140 # Allow variants of iso-8859-1 etc.
141 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 142
67d7b5ef 143 # At least HP-UX has these.
144 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 145
67d7b5ef 146 # More HP stuff.
147 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 148
67d7b5ef 149 # The Official name of ASCII.
150 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 151
67d7b5ef 152 # This is a font issue, not an encoding issue.
153 # (The currency symbol of the Latin 1 upper half
154 # has been redefined as the euro symbol.)
5129552c 155 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
156
fcb875d4 157 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
bb7af5ca 158 => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
5129552c 159
67d7b5ef 160 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
fcb875d4 161 hebrew|arabic|baltic|vietnamese)$/ix =>
5129552c 162 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 163
3c4b39be 164 # Common names for non-latin preferred MIME names
5129552c 165 define_alias( 'ascii' => 'US-ascii',
166 'cyrillic' => 'iso-8859-5',
167 'arabic' => 'iso-8859-6',
168 'greek' => 'iso-8859-7',
169 'hebrew' => 'iso-8859-8',
170 'thai' => 'iso-8859-11',
171 'tis620' => 'iso-8859-11',
172 );
5d030b67 173
67d7b5ef 174 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
175 # And Microsoft has their own naming (again, surprisingly).
fcb875d4 176 # And windows-* is registered in IANA!
2d06ad02 177 define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
5d030b67 178
67d7b5ef 179 # Sometimes seen with a leading zero.
c731e18e 180 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 181
3ef515df 182 # Mac Mappings
a999c27c 183 # predefined in *.ucm; unneeded
184 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
3ef515df 185 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
a999c27c 186 # Ououououou. gone. They are differente!
187 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
fcb875d4 188
f2a2953c 189 # Standardize on the dashed versions.
cf9f87ce 190 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
5129552c 191
a63c962f 192 unless ($Encode::ON_EBCDIC){
193 # for Encode::CN
67d7b5ef 194 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
195 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
196 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
197 # CP936 doesn't have vendor-addon for GBK, so they're identical.
198 define_alias( qr/^gbk$/i => '"cp936"');
199 # This fixes gb2312 vs. euc-cn confusion, practically
b9531c19 200 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
67d7b5ef 201 # for Encode::JP
202 define_alias( qr/\bjis$/i => '"7bit-jis"' );
203 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
204 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
205 define_alias( qr/\bujis$/i => '"euc-jp"' );
206 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
207 define_alias( qr/\bsjis$/i => '"shiftjis"' );
8f1ed24a 208 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
a63c962f 209 # for Encode::KR
67d7b5ef 210 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
211 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
212 # This fixes ksc5601 vs. euc-kr confusion, practically
213 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
214 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
215 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 216 # for Encode::TW
b0b300a3 217 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
4b291ae6 218 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
2d06ad02 219 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
b0b300a3 220 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
2d06ad02 221 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 222 }
f2a2953c 223 # utf8 is blessed :)
7f0d54d7 224 define_alias( qr/^UTF-8$/i => '"utf-8-strict"');
67d7b5ef 225 # At last, Map white space and _ to '-'
5129552c 226 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
227}
228
2291;
230__END__
5d030b67 231
232# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
233# TODO: HP-UX '15' encodings japanese15 korean15 roi15
234# TODO: Cyrillic encoding ISO-IR-111 (useful?)
235# TODO: Armenian encoding ARMSCII-8
236# TODO: Hebrew encoding ISO-8859-8-1
237# TODO: Thai encoding TCVN
5d030b67 238# TODO: Vietnamese encodings VPS
239# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
240# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
241# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
242# Kannada Khmer Korean Laotian Malayalam Mongolian
243# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
244
5d030b67 245=head1 NAME
246
ce912cd4 247Encode::Alias - alias definitions to encodings
5d030b67 248
249=head1 SYNOPSIS
250
5129552c 251 use Encode;
252 use Encode::Alias;
5d030b67 253 define_alias( newName => ENCODING);
254
255=head1 DESCRIPTION
256
3ef515df 257Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 258either the name of an encoding or an encoding object (as described
3ef515df 259in L<Encode>).
5d030b67 260
261Currently I<newName> can be specified in the following ways:
262
263=over 4
264
265=item As a simple string.
266
267=item As a qr// compiled regular expression, e.g.:
268
269 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
270
0ab8f81e 271In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
272in order to allow C<$1> etc. to be substituted. The example is one
273way to alias names as used in X11 fonts to the MIME names for the
274iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 275
151b5d36 276(or, you don't have to do this yourself because this example is predefined)
277
3ef515df 278If you are using a regex here, you have to use the quotes as shown or
279it won't work. Also note that regex handling is tricky even for the
151b5d36 280experienced. Use this feature with caution.
5d030b67 281
282=item As a code reference, e.g.:
283
151b5d36 284 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 285
151b5d36 286The same effect as the example above in a different way. The coderef
287takes the alias name as an argument and returns a canonical name on
288success or undef if not. Note the second argument is not required.
289Use this with even more caution than the regex version.
5d030b67 290
5129552c 291=back
292
151b5d36 293=head3 Changes in code reference aliasing
294
295As of Encode 1.87, the older form
296
297 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
298
299no longer works.
300
301Encode up to 1.86 internally used "local $_" to implement ths older
302form. But consider the code below;
303
304 use Encode;
305 $_ = "eeeee" ;
306 while (/(e)/g) {
307 my $utf = decode('aliased-encoding-name', $1);
308 print "position:",pos,"\n";
309 }
310
311Prior to Encode 1.86 this fails because of "local $_".
312
0ab8f81e 313=head2 Alias overloading
5d030b67 314
3ef515df 315You can override predefined aliases by simply applying define_alias().
3c4b39be 316The new alias is always evaluated first, and when necessary,
0ab8f81e 317define_alias() flushes the internal cache to make the new definition
318available.
5d030b67 319
0ab8f81e 320 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 321 # superset of SHIFT_JIS
322
5129552c 323 define_alias( qr/shift.*jis$/i => '"cp932"' );
324 define_alias( qr/sjis$/i => '"cp932"' );
325
0ab8f81e 326If you want to zap all predefined aliases, you can use
5129552c 327
328 Encode::Alias->undef_aliases;
329
330to do so. And
331
332 Encode::Alias->init_aliases;
333
0ab8f81e 334gets the factory settings back.
5d030b67 335
336=head1 SEE ALSO
337
338L<Encode>, L<Encode::Supported>
339
5129552c 340=cut
5d030b67 341