more accumulated cleanups
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
1485817e 3no warnings 'redefine';
5129552c 4use Encode;
d1256cb1 5our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
8f139f4c 6sub DEBUG () { 0 }
5d030b67 7
10c5ecbb 8use base qw(Exporter);
5d030b67 9
10# Public, encouraged API is exported by default
5129552c 11
d1256cb1 12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16);
5d030b67 17
d1256cb1 18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
5d030b67 20
d1256cb1 21sub find_alias {
5d030b67 22 my $class = shift;
d1256cb1 23 my $find = shift;
24 unless ( exists $Alias{$find} ) {
25 $Alias{$find} = undef; # Recursion guard
26 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
27 my $alias = $Alias[$i];
28 my $val = $Alias[ $i + 1 ];
29 my $new;
30 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
31 DEBUG and warn "eval $val";
32 $new = eval $val;
33 DEBUG and $@ and warn "$val, $@";
34 }
35 elsif ( ref($alias) eq 'CODE' ) {
36 DEBUG and warn "$alias", "->", "($find)";
37 $new = $alias->($find);
38 }
39 elsif ( lc($find) eq lc($alias) ) {
40 $new = $val;
41 }
42 if ( defined($new) ) {
43 next if $new eq $find; # avoid (direct) recursion on bugs
44 DEBUG and warn "$alias, $new";
45 my $enc =
46 ( ref($new) ) ? $new : Encode::find_encoding($new);
47 if ($enc) {
48 $Alias{$find} = $enc;
49 last;
50 }
51 }
52 }
53
54 # case insensitive search when canonical is not in all lowercase
55 # RT ticket #7835
56 unless ( $Alias{$find} ) {
57 my $lcfind = lc($find);
58 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
59 {
60 $lcfind eq lc($name) or next;
61 $Alias{$find} = Encode::find_encoding($name);
62 DEBUG and warn "$find => $name";
63 }
64 }
5d030b67 65 }
d1256cb1 66 if (DEBUG) {
67 my $name;
68 if ( my $e = $Alias{$find} ) {
69 $name = $e->name;
70 }
71 else {
72 $name = "";
73 }
74 warn "find_alias($class, $find)->name = $name";
a63c962f 75 }
151b5d36 76 return $Alias{$find};
5d030b67 77}
78
d1256cb1 79sub define_alias {
80 while (@_) {
81 my ( $alias, $name ) = splice( @_, 0, 2 );
82 unshift( @Alias, $alias => $name ); # newer one has precedence
83 if ( ref($alias) ) {
84
85 # clear %Alias cache to allow overrides
86 my @a = keys %Alias;
87 for my $k (@a) {
88 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
89 DEBUG and warn "delete \$Alias\{$k\}";
90 delete $Alias{$k};
91 }
92 elsif ( ref($alias) eq 'CODE' ) {
93 DEBUG and warn "delete \$Alias\{$k\}";
94 delete $Alias{ $alias->($name) };
95 }
96 }
97 }
98 else {
99 DEBUG and warn "delete \$Alias\{$alias\}";
100 delete $Alias{$alias};
101 }
5d030b67 102 }
103}
104
5d030b67 105# Allow latin-1 style names as well
8f1ed24a 106# 0 1 2 3 4 5 6 7 8 9 10
5d030b67 107our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
d1256cb1 108
5d030b67 109# Allow winlatin1 style names as well
d1256cb1 110our %Winlatin2cp = (
111 'latin1' => 1252,
112 'latin2' => 1250,
113 'cyrillic' => 1251,
114 'greek' => 1253,
115 'turkish' => 1254,
116 'hebrew' => 1255,
117 'arabic' => 1256,
118 'baltic' => 1257,
119 'vietnamese' => 1258,
120);
5d030b67 121
5129552c 122init_aliases();
123
d1256cb1 124sub undef_aliases {
5129552c 125 @Alias = ();
126 %Alias = ();
127}
128
d1256cb1 129sub init_aliases {
5129552c 130 undef_aliases();
d1256cb1 131
f2a2953c 132 # Try all-lower-case version should all else fails
a999c27c 133 define_alias( qr/^(.*)$/ => '"\L$1"' );
134
f2a2953c 135 # UTF/UCS stuff
d1256cb1 136 define_alias( qr/^UTF-?7$/i => '"UTF-7"' );
137 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
138 define_alias(
139 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
140 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
141 qr/^iso-10646-1$/i => '"UCS-2BE"'
142 );
143 define_alias(
144 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
145 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
146 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
147 );
148
f2a2953c 149 # ASCII
d1256cb1 150 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
151 define_alias( 'C' => 'ascii' );
152 define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' );
153
67d7b5ef 154 # Allow variants of iso-8859-1 etc.
155 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 156
67d7b5ef 157 # At least HP-UX has these.
158 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 159
67d7b5ef 160 # More HP stuff.
d1256cb1 161 define_alias(
162 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
163 '"${1}8"' );
5129552c 164
67d7b5ef 165 # The Official name of ASCII.
166 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 167
67d7b5ef 168 # This is a font issue, not an encoding issue.
169 # (The currency symbol of the Latin 1 upper half
170 # has been redefined as the euro symbol.)
5129552c 171 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
172
d1256cb1 173 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
174'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
175 );
5129552c 176
d1256cb1 177 define_alias(
178 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
179 hebrew|arabic|baltic|vietnamese)$/ix =>
180 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
181 );
5d030b67 182
3c4b39be 183 # Common names for non-latin preferred MIME names
d1256cb1 184 define_alias(
185 'ascii' => 'US-ascii',
186 'cyrillic' => 'iso-8859-5',
187 'arabic' => 'iso-8859-6',
188 'greek' => 'iso-8859-7',
189 'hebrew' => 'iso-8859-8',
190 'thai' => 'iso-8859-11',
191 'tis620' => 'iso-8859-11',
192 );
5d030b67 193
67d7b5ef 194 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
195 # And Microsoft has their own naming (again, surprisingly).
d1256cb1 196 # And windows-* is registered in IANA!
197 define_alias(
198 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
5d030b67 199
67d7b5ef 200 # Sometimes seen with a leading zero.
c731e18e 201 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 202
3ef515df 203 # Mac Mappings
a999c27c 204 # predefined in *.ucm; unneeded
205 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
d1256cb1 206 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
207
a999c27c 208 # Ououououou. gone. They are differente!
209 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
d1256cb1 210
f2a2953c 211 # Standardize on the dashed versions.
cf9f87ce 212 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
5129552c 213
d1256cb1 214 unless ($Encode::ON_EBCDIC) {
215
a63c962f 216 # for Encode::CN
d1256cb1 217 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
218 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
219
220 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
221 # CP936 doesn't have vendor-addon for GBK, so they're identical.
222 define_alias( qr/^gbk$/i => '"cp936"' );
223
224 # This fixes gb2312 vs. euc-cn confusion, practically
225 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
226
227 # for Encode::JP
228 define_alias( qr/\bjis$/i => '"7bit-jis"' );
229 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
230 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
231 define_alias( qr/\bujis$/i => '"euc-jp"' );
232 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
233 define_alias( qr/\bsjis$/i => '"shiftjis"' );
234 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
235
a63c962f 236 # for Encode::KR
d1256cb1 237 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
238 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
239
240 # This fixes ksc5601 vs. euc-kr confusion, practically
241 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
242 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
243 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
244
a63c962f 245 # for Encode::TW
d1256cb1 246 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
247 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
248 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
249 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
250 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 251 }
d1256cb1 252
f2a2953c 253 # utf8 is blessed :)
d1256cb1 254 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
255
67d7b5ef 256 # At last, Map white space and _ to '-'
5129552c 257 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
258}
259
2601;
261__END__
5d030b67 262
263# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
264# TODO: HP-UX '15' encodings japanese15 korean15 roi15
265# TODO: Cyrillic encoding ISO-IR-111 (useful?)
266# TODO: Armenian encoding ARMSCII-8
267# TODO: Hebrew encoding ISO-8859-8-1
268# TODO: Thai encoding TCVN
5d030b67 269# TODO: Vietnamese encodings VPS
270# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
271# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
272# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
273# Kannada Khmer Korean Laotian Malayalam Mongolian
274# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
275
5d030b67 276=head1 NAME
277
ce912cd4 278Encode::Alias - alias definitions to encodings
5d030b67 279
280=head1 SYNOPSIS
281
5129552c 282 use Encode;
283 use Encode::Alias;
5d030b67 284 define_alias( newName => ENCODING);
285
286=head1 DESCRIPTION
287
3ef515df 288Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 289either the name of an encoding or an encoding object (as described
3ef515df 290in L<Encode>).
5d030b67 291
292Currently I<newName> can be specified in the following ways:
293
294=over 4
295
296=item As a simple string.
297
298=item As a qr// compiled regular expression, e.g.:
299
300 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
301
0ab8f81e 302In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
303in order to allow C<$1> etc. to be substituted. The example is one
304way to alias names as used in X11 fonts to the MIME names for the
305iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 306
151b5d36 307(or, you don't have to do this yourself because this example is predefined)
308
3ef515df 309If you are using a regex here, you have to use the quotes as shown or
310it won't work. Also note that regex handling is tricky even for the
151b5d36 311experienced. Use this feature with caution.
5d030b67 312
313=item As a code reference, e.g.:
314
151b5d36 315 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 316
151b5d36 317The same effect as the example above in a different way. The coderef
318takes the alias name as an argument and returns a canonical name on
319success or undef if not. Note the second argument is not required.
320Use this with even more caution than the regex version.
5d030b67 321
5129552c 322=back
323
151b5d36 324=head3 Changes in code reference aliasing
325
326As of Encode 1.87, the older form
327
328 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
329
330no longer works.
331
332Encode up to 1.86 internally used "local $_" to implement ths older
333form. But consider the code below;
334
335 use Encode;
336 $_ = "eeeee" ;
337 while (/(e)/g) {
338 my $utf = decode('aliased-encoding-name', $1);
339 print "position:",pos,"\n";
340 }
341
342Prior to Encode 1.86 this fails because of "local $_".
343
0ab8f81e 344=head2 Alias overloading
5d030b67 345
3ef515df 346You can override predefined aliases by simply applying define_alias().
3c4b39be 347The new alias is always evaluated first, and when necessary,
0ab8f81e 348define_alias() flushes the internal cache to make the new definition
349available.
5d030b67 350
0ab8f81e 351 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 352 # superset of SHIFT_JIS
353
5129552c 354 define_alias( qr/shift.*jis$/i => '"cp932"' );
355 define_alias( qr/sjis$/i => '"cp932"' );
356
0ab8f81e 357If you want to zap all predefined aliases, you can use
5129552c 358
359 Encode::Alias->undef_aliases;
360
361to do so. And
362
363 Encode::Alias->init_aliases;
364
0ab8f81e 365gets the factory settings back.
5d030b67 366
367=head1 SEE ALSO
368
369L<Encode>, L<Encode::Supported>
370
5129552c 371=cut
5d030b67 372