For tied file handle calls, use PUSH* when we know that the stack has space.
[p5sagit/p5-mst-13.2.git] / cpan / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
656ebd29 3use warnings;
1485817e 4no warnings 'redefine';
4e71788c 5our $VERSION = do { my @r = ( q$Revision: 2.12 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
8f139f4c 6sub DEBUG () { 0 }
5d030b67 7
10c5ecbb 8use base qw(Exporter);
5d030b67 9
10# Public, encouraged API is exported by default
5129552c 11
d1256cb1 12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16);
5d030b67 17
d1256cb1 18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
5d030b67 20
d1256cb1 21sub find_alias {
c0abe5aa 22 require Encode;
5d030b67 23 my $class = shift;
d1256cb1 24 my $find = shift;
25 unless ( exists $Alias{$find} ) {
26 $Alias{$find} = undef; # Recursion guard
27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
28 my $alias = $Alias[$i];
29 my $val = $Alias[ $i + 1 ];
30 my $new;
31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
32 DEBUG and warn "eval $val";
33 $new = eval $val;
34 DEBUG and $@ and warn "$val, $@";
35 }
36 elsif ( ref($alias) eq 'CODE' ) {
37 DEBUG and warn "$alias", "->", "($find)";
38 $new = $alias->($find);
39 }
40 elsif ( lc($find) eq lc($alias) ) {
41 $new = $val;
42 }
43 if ( defined($new) ) {
44 next if $new eq $find; # avoid (direct) recursion on bugs
45 DEBUG and warn "$alias, $new";
46 my $enc =
47 ( ref($new) ) ? $new : Encode::find_encoding($new);
48 if ($enc) {
49 $Alias{$find} = $enc;
50 last;
51 }
52 }
53 }
54
55 # case insensitive search when canonical is not in all lowercase
56 # RT ticket #7835
57 unless ( $Alias{$find} ) {
58 my $lcfind = lc($find);
59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
60 {
61 $lcfind eq lc($name) or next;
62 $Alias{$find} = Encode::find_encoding($name);
63 DEBUG and warn "$find => $name";
64 }
65 }
5d030b67 66 }
d1256cb1 67 if (DEBUG) {
68 my $name;
69 if ( my $e = $Alias{$find} ) {
70 $name = $e->name;
71 }
72 else {
73 $name = "";
74 }
75 warn "find_alias($class, $find)->name = $name";
a63c962f 76 }
151b5d36 77 return $Alias{$find};
5d030b67 78}
79
d1256cb1 80sub define_alias {
81 while (@_) {
82 my ( $alias, $name ) = splice( @_, 0, 2 );
83 unshift( @Alias, $alias => $name ); # newer one has precedence
84 if ( ref($alias) ) {
85
86 # clear %Alias cache to allow overrides
87 my @a = keys %Alias;
88 for my $k (@a) {
89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90 DEBUG and warn "delete \$Alias\{$k\}";
91 delete $Alias{$k};
92 }
93 elsif ( ref($alias) eq 'CODE' ) {
94 DEBUG and warn "delete \$Alias\{$k\}";
95 delete $Alias{ $alias->($name) };
96 }
97 }
98 }
99 else {
100 DEBUG and warn "delete \$Alias\{$alias\}";
101 delete $Alias{$alias};
102 }
5d030b67 103 }
104}
105
5d030b67 106# Allow latin-1 style names as well
8f1ed24a 107# 0 1 2 3 4 5 6 7 8 9 10
5d030b67 108our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
d1256cb1 109
5d030b67 110# Allow winlatin1 style names as well
d1256cb1 111our %Winlatin2cp = (
112 'latin1' => 1252,
113 'latin2' => 1250,
114 'cyrillic' => 1251,
115 'greek' => 1253,
116 'turkish' => 1254,
117 'hebrew' => 1255,
118 'arabic' => 1256,
119 'baltic' => 1257,
120 'vietnamese' => 1258,
121);
5d030b67 122
5129552c 123init_aliases();
124
d1256cb1 125sub undef_aliases {
5129552c 126 @Alias = ();
127 %Alias = ();
128}
129
d1256cb1 130sub init_aliases {
c0abe5aa 131 require Encode;
5129552c 132 undef_aliases();
d1256cb1 133
f2a2953c 134 # Try all-lower-case version should all else fails
a999c27c 135 define_alias( qr/^(.*)$/ => '"\L$1"' );
136
f2a2953c 137 # UTF/UCS stuff
64bc6d54 138 define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' );
d1256cb1 139 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
140 define_alias(
141 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
142 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
143 qr/^iso-10646-1$/i => '"UCS-2BE"'
144 );
145 define_alias(
146 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
147 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
148 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
149 );
150
f2a2953c 151 # ASCII
d1256cb1 152 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
153 define_alias( 'C' => 'ascii' );
ba40575b 154 define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
d1256cb1 155
67d7b5ef 156 # Allow variants of iso-8859-1 etc.
157 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 158
67d7b5ef 159 # At least HP-UX has these.
160 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 161
67d7b5ef 162 # More HP stuff.
d1256cb1 163 define_alias(
164 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
165 '"${1}8"' );
5129552c 166
67d7b5ef 167 # The Official name of ASCII.
168 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 169
67d7b5ef 170 # This is a font issue, not an encoding issue.
171 # (The currency symbol of the Latin 1 upper half
172 # has been redefined as the euro symbol.)
5129552c 173 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
174
d1256cb1 175 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
176'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
177 );
5129552c 178
d1256cb1 179 define_alias(
180 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
181 hebrew|arabic|baltic|vietnamese)$/ix =>
182 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
183 );
5d030b67 184
3c4b39be 185 # Common names for non-latin preferred MIME names
d1256cb1 186 define_alias(
187 'ascii' => 'US-ascii',
188 'cyrillic' => 'iso-8859-5',
189 'arabic' => 'iso-8859-6',
190 'greek' => 'iso-8859-7',
191 'hebrew' => 'iso-8859-8',
192 'thai' => 'iso-8859-11',
d1256cb1 193 );
51e4e64d 194 # RT #20781
195 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
5d030b67 196
67d7b5ef 197 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
198 # And Microsoft has their own naming (again, surprisingly).
d1256cb1 199 # And windows-* is registered in IANA!
200 define_alias(
201 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
5d030b67 202
67d7b5ef 203 # Sometimes seen with a leading zero.
c731e18e 204 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 205
3ef515df 206 # Mac Mappings
a999c27c 207 # predefined in *.ucm; unneeded
208 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
d1256cb1 209 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
2fd0906e 210 # http://rt.cpan.org/Ticket/Display.html?id=36326
211 define_alias( qr/^macintosh$/i => '"MacRoman"' );
d1256cb1 212
a999c27c 213 # Ououououou. gone. They are differente!
214 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
d1256cb1 215
f2a2953c 216 # Standardize on the dashed versions.
cf9f87ce 217 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
5129552c 218
d1256cb1 219 unless ($Encode::ON_EBCDIC) {
220
a63c962f 221 # for Encode::CN
d1256cb1 222 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
223 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
224
225 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
226 # CP936 doesn't have vendor-addon for GBK, so they're identical.
227 define_alias( qr/^gbk$/i => '"cp936"' );
228
229 # This fixes gb2312 vs. euc-cn confusion, practically
230 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
231
232 # for Encode::JP
233 define_alias( qr/\bjis$/i => '"7bit-jis"' );
234 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
235 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
236 define_alias( qr/\bujis$/i => '"euc-jp"' );
237 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
238 define_alias( qr/\bsjis$/i => '"shiftjis"' );
239 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
240
a63c962f 241 # for Encode::KR
d1256cb1 242 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
243 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
244
245 # This fixes ksc5601 vs. euc-kr confusion, practically
246 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
247 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
248 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
249
a63c962f 250 # for Encode::TW
d1256cb1 251 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
252 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
253 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
254 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
255 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 256 }
d1256cb1 257
f2a2953c 258 # utf8 is blessed :)
4e71788c 259 define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' );
d1256cb1 260
67d7b5ef 261 # At last, Map white space and _ to '-'
5129552c 262 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
263}
264
2651;
266__END__
5d030b67 267
268# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
269# TODO: HP-UX '15' encodings japanese15 korean15 roi15
270# TODO: Cyrillic encoding ISO-IR-111 (useful?)
271# TODO: Armenian encoding ARMSCII-8
272# TODO: Hebrew encoding ISO-8859-8-1
273# TODO: Thai encoding TCVN
5d030b67 274# TODO: Vietnamese encodings VPS
275# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
276# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
277# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
278# Kannada Khmer Korean Laotian Malayalam Mongolian
279# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
280
5d030b67 281=head1 NAME
282
ce912cd4 283Encode::Alias - alias definitions to encodings
5d030b67 284
285=head1 SYNOPSIS
286
5129552c 287 use Encode;
288 use Encode::Alias;
5d030b67 289 define_alias( newName => ENCODING);
290
291=head1 DESCRIPTION
292
3ef515df 293Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 294either the name of an encoding or an encoding object (as described
3ef515df 295in L<Encode>).
5d030b67 296
297Currently I<newName> can be specified in the following ways:
298
299=over 4
300
301=item As a simple string.
302
303=item As a qr// compiled regular expression, e.g.:
304
305 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
306
0ab8f81e 307In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
308in order to allow C<$1> etc. to be substituted. The example is one
309way to alias names as used in X11 fonts to the MIME names for the
310iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 311
151b5d36 312(or, you don't have to do this yourself because this example is predefined)
313
3ef515df 314If you are using a regex here, you have to use the quotes as shown or
315it won't work. Also note that regex handling is tricky even for the
151b5d36 316experienced. Use this feature with caution.
5d030b67 317
318=item As a code reference, e.g.:
319
151b5d36 320 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 321
151b5d36 322The same effect as the example above in a different way. The coderef
323takes the alias name as an argument and returns a canonical name on
324success or undef if not. Note the second argument is not required.
325Use this with even more caution than the regex version.
5d030b67 326
5129552c 327=back
328
151b5d36 329=head3 Changes in code reference aliasing
330
331As of Encode 1.87, the older form
332
333 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
334
335no longer works.
336
337Encode up to 1.86 internally used "local $_" to implement ths older
338form. But consider the code below;
339
340 use Encode;
341 $_ = "eeeee" ;
342 while (/(e)/g) {
343 my $utf = decode('aliased-encoding-name', $1);
344 print "position:",pos,"\n";
345 }
346
347Prior to Encode 1.86 this fails because of "local $_".
348
0ab8f81e 349=head2 Alias overloading
5d030b67 350
3ef515df 351You can override predefined aliases by simply applying define_alias().
3c4b39be 352The new alias is always evaluated first, and when necessary,
0ab8f81e 353define_alias() flushes the internal cache to make the new definition
354available.
5d030b67 355
0ab8f81e 356 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 357 # superset of SHIFT_JIS
358
5129552c 359 define_alias( qr/shift.*jis$/i => '"cp932"' );
360 define_alias( qr/sjis$/i => '"cp932"' );
361
0ab8f81e 362If you want to zap all predefined aliases, you can use
5129552c 363
364 Encode::Alias->undef_aliases;
365
366to do so. And
367
368 Encode::Alias->init_aliases;
369
0ab8f81e 370gets the factory settings back.
5d030b67 371
372=head1 SEE ALSO
373
374L<Encode>, L<Encode::Supported>
375
5129552c 376=cut
5d030b67 377