Upgrade to Encode 2.18
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
656ebd29 3use warnings;
1485817e 4no warnings 'redefine';
5129552c 5use Encode;
656ebd29 6our $VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
8f139f4c 7sub DEBUG () { 0 }
5d030b67 8
10c5ecbb 9use base qw(Exporter);
5d030b67 10
11# Public, encouraged API is exported by default
5129552c 12
d1256cb1 13our @EXPORT =
14 qw (
15 define_alias
16 find_alias
17);
5d030b67 18
d1256cb1 19our @Alias; # ordered matching list
20our %Alias; # cached known aliases
5d030b67 21
d1256cb1 22sub find_alias {
5d030b67 23 my $class = shift;
d1256cb1 24 my $find = shift;
25 unless ( exists $Alias{$find} ) {
26 $Alias{$find} = undef; # Recursion guard
27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
28 my $alias = $Alias[$i];
29 my $val = $Alias[ $i + 1 ];
30 my $new;
31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
32 DEBUG and warn "eval $val";
33 $new = eval $val;
34 DEBUG and $@ and warn "$val, $@";
35 }
36 elsif ( ref($alias) eq 'CODE' ) {
37 DEBUG and warn "$alias", "->", "($find)";
38 $new = $alias->($find);
39 }
40 elsif ( lc($find) eq lc($alias) ) {
41 $new = $val;
42 }
43 if ( defined($new) ) {
44 next if $new eq $find; # avoid (direct) recursion on bugs
45 DEBUG and warn "$alias, $new";
46 my $enc =
47 ( ref($new) ) ? $new : Encode::find_encoding($new);
48 if ($enc) {
49 $Alias{$find} = $enc;
50 last;
51 }
52 }
53 }
54
55 # case insensitive search when canonical is not in all lowercase
56 # RT ticket #7835
57 unless ( $Alias{$find} ) {
58 my $lcfind = lc($find);
59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
60 {
61 $lcfind eq lc($name) or next;
62 $Alias{$find} = Encode::find_encoding($name);
63 DEBUG and warn "$find => $name";
64 }
65 }
5d030b67 66 }
d1256cb1 67 if (DEBUG) {
68 my $name;
69 if ( my $e = $Alias{$find} ) {
70 $name = $e->name;
71 }
72 else {
73 $name = "";
74 }
75 warn "find_alias($class, $find)->name = $name";
a63c962f 76 }
151b5d36 77 return $Alias{$find};
5d030b67 78}
79
d1256cb1 80sub define_alias {
81 while (@_) {
82 my ( $alias, $name ) = splice( @_, 0, 2 );
83 unshift( @Alias, $alias => $name ); # newer one has precedence
84 if ( ref($alias) ) {
85
86 # clear %Alias cache to allow overrides
87 my @a = keys %Alias;
88 for my $k (@a) {
89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90 DEBUG and warn "delete \$Alias\{$k\}";
91 delete $Alias{$k};
92 }
93 elsif ( ref($alias) eq 'CODE' ) {
94 DEBUG and warn "delete \$Alias\{$k\}";
95 delete $Alias{ $alias->($name) };
96 }
97 }
98 }
99 else {
100 DEBUG and warn "delete \$Alias\{$alias\}";
101 delete $Alias{$alias};
102 }
5d030b67 103 }
104}
105
5d030b67 106# Allow latin-1 style names as well
8f1ed24a 107# 0 1 2 3 4 5 6 7 8 9 10
5d030b67 108our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
d1256cb1 109
5d030b67 110# Allow winlatin1 style names as well
d1256cb1 111our %Winlatin2cp = (
112 'latin1' => 1252,
113 'latin2' => 1250,
114 'cyrillic' => 1251,
115 'greek' => 1253,
116 'turkish' => 1254,
117 'hebrew' => 1255,
118 'arabic' => 1256,
119 'baltic' => 1257,
120 'vietnamese' => 1258,
121);
5d030b67 122
5129552c 123init_aliases();
124
d1256cb1 125sub undef_aliases {
5129552c 126 @Alias = ();
127 %Alias = ();
128}
129
d1256cb1 130sub init_aliases {
5129552c 131 undef_aliases();
d1256cb1 132
f2a2953c 133 # Try all-lower-case version should all else fails
a999c27c 134 define_alias( qr/^(.*)$/ => '"\L$1"' );
135
f2a2953c 136 # UTF/UCS stuff
d1256cb1 137 define_alias( qr/^UTF-?7$/i => '"UTF-7"' );
138 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
139 define_alias(
140 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
141 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
142 qr/^iso-10646-1$/i => '"UCS-2BE"'
143 );
144 define_alias(
145 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
146 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
147 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
148 );
149
f2a2953c 150 # ASCII
d1256cb1 151 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
152 define_alias( 'C' => 'ascii' );
153 define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' );
154
67d7b5ef 155 # Allow variants of iso-8859-1 etc.
156 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 157
67d7b5ef 158 # At least HP-UX has these.
159 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 160
67d7b5ef 161 # More HP stuff.
d1256cb1 162 define_alias(
163 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
164 '"${1}8"' );
5129552c 165
67d7b5ef 166 # The Official name of ASCII.
167 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 168
67d7b5ef 169 # This is a font issue, not an encoding issue.
170 # (The currency symbol of the Latin 1 upper half
171 # has been redefined as the euro symbol.)
5129552c 172 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
173
d1256cb1 174 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
175'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
176 );
5129552c 177
d1256cb1 178 define_alias(
179 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
180 hebrew|arabic|baltic|vietnamese)$/ix =>
181 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
182 );
5d030b67 183
3c4b39be 184 # Common names for non-latin preferred MIME names
d1256cb1 185 define_alias(
186 'ascii' => 'US-ascii',
187 'cyrillic' => 'iso-8859-5',
188 'arabic' => 'iso-8859-6',
189 'greek' => 'iso-8859-7',
190 'hebrew' => 'iso-8859-8',
191 'thai' => 'iso-8859-11',
192 'tis620' => 'iso-8859-11',
193 );
5d030b67 194
67d7b5ef 195 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
196 # And Microsoft has their own naming (again, surprisingly).
d1256cb1 197 # And windows-* is registered in IANA!
198 define_alias(
199 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
5d030b67 200
67d7b5ef 201 # Sometimes seen with a leading zero.
c731e18e 202 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 203
3ef515df 204 # Mac Mappings
a999c27c 205 # predefined in *.ucm; unneeded
206 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
d1256cb1 207 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
208
a999c27c 209 # Ououououou. gone. They are differente!
210 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
d1256cb1 211
f2a2953c 212 # Standardize on the dashed versions.
cf9f87ce 213 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
5129552c 214
d1256cb1 215 unless ($Encode::ON_EBCDIC) {
216
a63c962f 217 # for Encode::CN
d1256cb1 218 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
219 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
220
221 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
222 # CP936 doesn't have vendor-addon for GBK, so they're identical.
223 define_alias( qr/^gbk$/i => '"cp936"' );
224
225 # This fixes gb2312 vs. euc-cn confusion, practically
226 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
227
228 # for Encode::JP
229 define_alias( qr/\bjis$/i => '"7bit-jis"' );
230 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
231 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
232 define_alias( qr/\bujis$/i => '"euc-jp"' );
233 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
234 define_alias( qr/\bsjis$/i => '"shiftjis"' );
235 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
236
a63c962f 237 # for Encode::KR
d1256cb1 238 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
239 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
240
241 # This fixes ksc5601 vs. euc-kr confusion, practically
242 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
243 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
244 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
245
a63c962f 246 # for Encode::TW
d1256cb1 247 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
248 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
249 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
250 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
251 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 252 }
d1256cb1 253
f2a2953c 254 # utf8 is blessed :)
d1256cb1 255 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
256
67d7b5ef 257 # At last, Map white space and _ to '-'
5129552c 258 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
259}
260
2611;
262__END__
5d030b67 263
264# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
265# TODO: HP-UX '15' encodings japanese15 korean15 roi15
266# TODO: Cyrillic encoding ISO-IR-111 (useful?)
267# TODO: Armenian encoding ARMSCII-8
268# TODO: Hebrew encoding ISO-8859-8-1
269# TODO: Thai encoding TCVN
5d030b67 270# TODO: Vietnamese encodings VPS
271# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
272# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
273# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
274# Kannada Khmer Korean Laotian Malayalam Mongolian
275# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
276
5d030b67 277=head1 NAME
278
ce912cd4 279Encode::Alias - alias definitions to encodings
5d030b67 280
281=head1 SYNOPSIS
282
5129552c 283 use Encode;
284 use Encode::Alias;
5d030b67 285 define_alias( newName => ENCODING);
286
287=head1 DESCRIPTION
288
3ef515df 289Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 290either the name of an encoding or an encoding object (as described
3ef515df 291in L<Encode>).
5d030b67 292
293Currently I<newName> can be specified in the following ways:
294
295=over 4
296
297=item As a simple string.
298
299=item As a qr// compiled regular expression, e.g.:
300
301 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
302
0ab8f81e 303In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
304in order to allow C<$1> etc. to be substituted. The example is one
305way to alias names as used in X11 fonts to the MIME names for the
306iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 307
151b5d36 308(or, you don't have to do this yourself because this example is predefined)
309
3ef515df 310If you are using a regex here, you have to use the quotes as shown or
311it won't work. Also note that regex handling is tricky even for the
151b5d36 312experienced. Use this feature with caution.
5d030b67 313
314=item As a code reference, e.g.:
315
151b5d36 316 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 317
151b5d36 318The same effect as the example above in a different way. The coderef
319takes the alias name as an argument and returns a canonical name on
320success or undef if not. Note the second argument is not required.
321Use this with even more caution than the regex version.
5d030b67 322
5129552c 323=back
324
151b5d36 325=head3 Changes in code reference aliasing
326
327As of Encode 1.87, the older form
328
329 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
330
331no longer works.
332
333Encode up to 1.86 internally used "local $_" to implement ths older
334form. But consider the code below;
335
336 use Encode;
337 $_ = "eeeee" ;
338 while (/(e)/g) {
339 my $utf = decode('aliased-encoding-name', $1);
340 print "position:",pos,"\n";
341 }
342
343Prior to Encode 1.86 this fails because of "local $_".
344
0ab8f81e 345=head2 Alias overloading
5d030b67 346
3ef515df 347You can override predefined aliases by simply applying define_alias().
3c4b39be 348The new alias is always evaluated first, and when necessary,
0ab8f81e 349define_alias() flushes the internal cache to make the new definition
350available.
5d030b67 351
0ab8f81e 352 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 353 # superset of SHIFT_JIS
354
5129552c 355 define_alias( qr/shift.*jis$/i => '"cp932"' );
356 define_alias( qr/sjis$/i => '"cp932"' );
357
0ab8f81e 358If you want to zap all predefined aliases, you can use
5129552c 359
360 Encode::Alias->undef_aliases;
361
362to do so. And
363
364 Encode::Alias->init_aliases;
365
0ab8f81e 366gets the factory settings back.
5d030b67 367
368=head1 SEE ALSO
369
370L<Encode>, L<Encode::Supported>
371
5129552c 372=cut
5d030b67 373