Upgrade to Encode 2.00.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
1485817e 3no warnings 'redefine';
5129552c 4use Encode;
7237418a 5our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8f139f4c 6sub DEBUG () { 0 }
5d030b67 7
10c5ecbb 8use base qw(Exporter);
5d030b67 9
10# Public, encouraged API is exported by default
5129552c 11
fcb875d4 12our @EXPORT =
5129552c 13 qw (
14 define_alias
15 find_alias
16 );
5d030b67 17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
5129552c 21sub find_alias
5d030b67 22{
23 my $class = shift;
151b5d36 24 my $find = shift;
25 unless (exists $Alias{$find})
5d030b67 26 {
151b5d36 27 $Alias{$find} = undef; # Recursion guard
5d030b67 28 for (my $i=0; $i < @Alias; $i += 2)
29 {
30 my $alias = $Alias[$i];
31 my $val = $Alias[$i+1];
32 my $new;
151b5d36 33 if (ref($alias) eq 'Regexp' && $find =~ $alias)
5d030b67 34 {
8f139f4c 35 DEBUG and warn "eval $val";
5d030b67 36 $new = eval $val;
8f139f4c 37 DEBUG and $@ and warn "$val, $@";
5d030b67 38 }
39 elsif (ref($alias) eq 'CODE')
40 {
8f139f4c 41 DEBUG and warn "$alias", "->", "($find)";
151b5d36 42 $new = $alias->($find);
5d030b67 43 }
151b5d36 44 elsif (lc($find) eq lc($alias))
5d030b67 45 {
46 $new = $val;
47 }
48 if (defined($new))
49 {
151b5d36 50 next if $new eq $find; # avoid (direct) recursion on bugs
8f139f4c 51 DEBUG and warn "$alias, $new";
5d030b67 52 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
53 if ($enc)
54 {
151b5d36 55 $Alias{$find} = $enc;
5d030b67 56 last;
57 }
58 }
59 }
60 }
8f139f4c 61 if (DEBUG){
a63c962f 62 my $name;
151b5d36 63 if (my $e = $Alias{$find}){
a63c962f 64 $name = $e->name;
65 }else{
66 $name = "";
67 }
151b5d36 68 warn "find_alias($class, $find)->name = $name";
a63c962f 69 }
151b5d36 70 return $Alias{$find};
5d030b67 71}
72
73sub define_alias
74{
75 while (@_)
76 {
77 my ($alias,$name) = splice(@_,0,2);
78 unshift(@Alias, $alias => $name); # newer one has precedence
79 # clear %Alias cache to allow overrides
80 if (ref($alias)){
5129552c 81 my @a = keys %Alias;
82 for my $k (@a){
5d030b67 83 if (ref($alias) eq 'Regexp' && $k =~ $alias)
84 {
8f139f4c 85 DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 86 delete $Alias{$k};
87 }
88 elsif (ref($alias) eq 'CODE')
89 {
8f139f4c 90 DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 91 delete $Alias{$alias->($name)};
92 }
93 }
94 }else{
8f139f4c 95 DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67 96 delete $Alias{$alias};
97 }
98 }
99}
100
5d030b67 101# Allow latin-1 style names as well
102 # 0 1 2 3 4 5 6 7 8 9 10
103our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 104# Allow winlatin1 style names as well
105our %Winlatin2cp = (
106 'latin1' => 1252,
107 'latin2' => 1250,
108 'cyrillic' => 1251,
109 'greek' => 1253,
110 'turkish' => 1254,
111 'hebrew' => 1255,
112 'arabic' => 1256,
113 'baltic' => 1257,
114 'vietnamese' => 1258,
115 );
116
5129552c 117init_aliases();
118
119sub undef_aliases{
120 @Alias = ();
121 %Alias = ();
122}
123
124sub init_aliases
125{
126 undef_aliases();
a999c27c 127
f2a2953c 128 # Try all-lower-case version should all else fails
a999c27c 129 define_alias( qr/^(.*)$/ => '"\L$1"' );
130
f2a2953c 131 # UTF/UCS stuff
1485817e 132 define_alias( qr/^UTF-?7$/i => '"UTF-7"');
11067275 133 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
134 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
126bf8bf 135 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
11067275 136 qr/^iso-10646-1$/i => '"UCS-2BE"' );
137 define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
138 qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
139 qr/^UTF(16|32)$/i => '"UTF-$1"',
f2a2953c 140 );
141 # ASCII
a999c27c 142 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
a999c27c 143 define_alias('C' => 'ascii');
2d06ad02 144 define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
67d7b5ef 145 # Allow variants of iso-8859-1 etc.
146 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 147
67d7b5ef 148 # At least HP-UX has these.
149 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 150
67d7b5ef 151 # More HP stuff.
152 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 153
67d7b5ef 154 # The Official name of ASCII.
155 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 156
67d7b5ef 157 # This is a font issue, not an encoding issue.
158 # (The currency symbol of the Latin 1 upper half
159 # has been redefined as the euro symbol.)
5129552c 160 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
161
fcb875d4 162 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
bb7af5ca 163 => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
5129552c 164
67d7b5ef 165 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
fcb875d4 166 hebrew|arabic|baltic|vietnamese)$/ix =>
5129552c 167 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 168
67d7b5ef 169 # Common names for non-latin prefered MIME names
5129552c 170 define_alias( 'ascii' => 'US-ascii',
171 'cyrillic' => 'iso-8859-5',
172 'arabic' => 'iso-8859-6',
173 'greek' => 'iso-8859-7',
174 'hebrew' => 'iso-8859-8',
175 'thai' => 'iso-8859-11',
176 'tis620' => 'iso-8859-11',
177 );
5d030b67 178
67d7b5ef 179 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
180 # And Microsoft has their own naming (again, surprisingly).
fcb875d4 181 # And windows-* is registered in IANA!
2d06ad02 182 define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
5d030b67 183
67d7b5ef 184 # Sometimes seen with a leading zero.
c731e18e 185 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 186
3ef515df 187 # Mac Mappings
a999c27c 188 # predefined in *.ucm; unneeded
189 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
3ef515df 190 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
a999c27c 191 # Ououououou. gone. They are differente!
192 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
fcb875d4 193
f2a2953c 194 # Standardize on the dashed versions.
78c56a8c 195 # define_alias( qr/\butf8$/i => '"utf-8"' );
f9674d83 196 define_alias( qr/\bkoi8[\s-_]*([ru])$/i => '"koi8-$1"' );
5129552c 197
a63c962f 198 unless ($Encode::ON_EBCDIC){
199 # for Encode::CN
67d7b5ef 200 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
201 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
202 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
203 # CP936 doesn't have vendor-addon for GBK, so they're identical.
204 define_alias( qr/^gbk$/i => '"cp936"');
205 # This fixes gb2312 vs. euc-cn confusion, practically
b9531c19 206 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
67d7b5ef 207 # for Encode::JP
208 define_alias( qr/\bjis$/i => '"7bit-jis"' );
209 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
210 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
211 define_alias( qr/\bujis$/i => '"euc-jp"' );
212 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
213 define_alias( qr/\bsjis$/i => '"shiftjis"' );
a63c962f 214 # for Encode::KR
67d7b5ef 215 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
216 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
217 # This fixes ksc5601 vs. euc-kr confusion, practically
218 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
219 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
220 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 221 # for Encode::TW
b0b300a3 222 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
4b291ae6 223 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
2d06ad02 224 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
b0b300a3 225 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
2d06ad02 226 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 227 }
f2a2953c 228 # utf8 is blessed :)
229 define_alias( qr/^UTF-8$/i => '"utf8"',);
67d7b5ef 230 # At last, Map white space and _ to '-'
5129552c 231 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
232}
233
2341;
235__END__
5d030b67 236
237# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
238# TODO: HP-UX '15' encodings japanese15 korean15 roi15
239# TODO: Cyrillic encoding ISO-IR-111 (useful?)
240# TODO: Armenian encoding ARMSCII-8
241# TODO: Hebrew encoding ISO-8859-8-1
242# TODO: Thai encoding TCVN
5d030b67 243# TODO: Vietnamese encodings VPS
244# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
245# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
246# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
247# Kannada Khmer Korean Laotian Malayalam Mongolian
248# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
249
5d030b67 250=head1 NAME
251
ce912cd4 252Encode::Alias - alias definitions to encodings
5d030b67 253
254=head1 SYNOPSIS
255
5129552c 256 use Encode;
257 use Encode::Alias;
5d030b67 258 define_alias( newName => ENCODING);
259
260=head1 DESCRIPTION
261
3ef515df 262Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 263either the name of an encoding or an encoding object (as described
3ef515df 264in L<Encode>).
5d030b67 265
266Currently I<newName> can be specified in the following ways:
267
268=over 4
269
270=item As a simple string.
271
272=item As a qr// compiled regular expression, e.g.:
273
274 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
275
0ab8f81e 276In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
277in order to allow C<$1> etc. to be substituted. The example is one
278way to alias names as used in X11 fonts to the MIME names for the
279iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 280
151b5d36 281(or, you don't have to do this yourself because this example is predefined)
282
3ef515df 283If you are using a regex here, you have to use the quotes as shown or
284it won't work. Also note that regex handling is tricky even for the
151b5d36 285experienced. Use this feature with caution.
5d030b67 286
287=item As a code reference, e.g.:
288
151b5d36 289 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 290
151b5d36 291The same effect as the example above in a different way. The coderef
292takes the alias name as an argument and returns a canonical name on
293success or undef if not. Note the second argument is not required.
294Use this with even more caution than the regex version.
5d030b67 295
5129552c 296=back
297
151b5d36 298=head3 Changes in code reference aliasing
299
300As of Encode 1.87, the older form
301
302 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
303
304no longer works.
305
306Encode up to 1.86 internally used "local $_" to implement ths older
307form. But consider the code below;
308
309 use Encode;
310 $_ = "eeeee" ;
311 while (/(e)/g) {
312 my $utf = decode('aliased-encoding-name', $1);
313 print "position:",pos,"\n";
314 }
315
316Prior to Encode 1.86 this fails because of "local $_".
317
0ab8f81e 318=head2 Alias overloading
5d030b67 319
3ef515df 320You can override predefined aliases by simply applying define_alias().
0ab8f81e 321The new alias is always evaluated first, and when neccessary,
322define_alias() flushes the internal cache to make the new definition
323available.
5d030b67 324
0ab8f81e 325 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 326 # superset of SHIFT_JIS
327
5129552c 328 define_alias( qr/shift.*jis$/i => '"cp932"' );
329 define_alias( qr/sjis$/i => '"cp932"' );
330
0ab8f81e 331If you want to zap all predefined aliases, you can use
5129552c 332
333 Encode::Alias->undef_aliases;
334
335to do so. And
336
337 Encode::Alias->init_aliases;
338
0ab8f81e 339gets the factory settings back.
5d030b67 340
341=head1 SEE ALSO
342
343L<Encode>, L<Encode::Supported>
344
5129552c 345=cut
5d030b67 346