Upgrade to Encode 1.97.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
1485817e 3no warnings 'redefine';
5129552c 4use Encode;
8f139f4c 5our $VERSION = do { my @r = (q$Revision: 1.37 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
6sub DEBUG () { 0 }
5d030b67 7
10c5ecbb 8use base qw(Exporter);
5d030b67 9
10# Public, encouraged API is exported by default
5129552c 11
fcb875d4 12our @EXPORT =
5129552c 13 qw (
14 define_alias
15 find_alias
16 );
5d030b67 17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
5129552c 21sub find_alias
5d030b67 22{
23 my $class = shift;
151b5d36 24 my $find = shift;
25 unless (exists $Alias{$find})
5d030b67 26 {
151b5d36 27 $Alias{$find} = undef; # Recursion guard
5d030b67 28 for (my $i=0; $i < @Alias; $i += 2)
29 {
30 my $alias = $Alias[$i];
31 my $val = $Alias[$i+1];
32 my $new;
151b5d36 33 if (ref($alias) eq 'Regexp' && $find =~ $alias)
5d030b67 34 {
8f139f4c 35 DEBUG and warn "eval $val";
5d030b67 36 $new = eval $val;
8f139f4c 37 DEBUG and $@ and warn "$val, $@";
5d030b67 38 }
39 elsif (ref($alias) eq 'CODE')
40 {
8f139f4c 41 DEBUG and warn "$alias", "->", "($find)";
151b5d36 42 $new = $alias->($find);
5d030b67 43 }
151b5d36 44 elsif (lc($find) eq lc($alias))
5d030b67 45 {
46 $new = $val;
47 }
48 if (defined($new))
49 {
151b5d36 50 next if $new eq $find; # avoid (direct) recursion on bugs
8f139f4c 51 DEBUG and warn "$alias, $new";
5d030b67 52 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
53 if ($enc)
54 {
151b5d36 55 $Alias{$find} = $enc;
5d030b67 56 last;
57 }
58 }
59 }
60 }
8f139f4c 61 if (DEBUG){
a63c962f 62 my $name;
151b5d36 63 if (my $e = $Alias{$find}){
a63c962f 64 $name = $e->name;
65 }else{
66 $name = "";
67 }
151b5d36 68 warn "find_alias($class, $find)->name = $name";
a63c962f 69 }
151b5d36 70 return $Alias{$find};
5d030b67 71}
72
73sub define_alias
74{
75 while (@_)
76 {
77 my ($alias,$name) = splice(@_,0,2);
78 unshift(@Alias, $alias => $name); # newer one has precedence
79 # clear %Alias cache to allow overrides
80 if (ref($alias)){
5129552c 81 my @a = keys %Alias;
82 for my $k (@a){
5d030b67 83 if (ref($alias) eq 'Regexp' && $k =~ $alias)
84 {
8f139f4c 85 DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 86 delete $Alias{$k};
87 }
88 elsif (ref($alias) eq 'CODE')
89 {
8f139f4c 90 DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 91 delete $Alias{$alias->($name)};
92 }
93 }
94 }else{
8f139f4c 95 DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67 96 delete $Alias{$alias};
97 }
98 }
99}
100
5d030b67 101# Allow latin-1 style names as well
102 # 0 1 2 3 4 5 6 7 8 9 10
103our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 104# Allow winlatin1 style names as well
105our %Winlatin2cp = (
106 'latin1' => 1252,
107 'latin2' => 1250,
108 'cyrillic' => 1251,
109 'greek' => 1253,
110 'turkish' => 1254,
111 'hebrew' => 1255,
112 'arabic' => 1256,
113 'baltic' => 1257,
114 'vietnamese' => 1258,
115 );
116
5129552c 117init_aliases();
118
119sub undef_aliases{
120 @Alias = ();
121 %Alias = ();
122}
123
124sub init_aliases
125{
126 undef_aliases();
a999c27c 127
f2a2953c 128 # Try all-lower-case version should all else fails
a999c27c 129 define_alias( qr/^(.*)$/ => '"\L$1"' );
130
f2a2953c 131 # UTF/UCS stuff
1485817e 132 define_alias( qr/^UTF-?7$/i => '"UTF-7"');
11067275 133 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
134 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
126bf8bf 135 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
11067275 136 qr/^iso-10646-1$/i => '"UCS-2BE"' );
137 define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
138 qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
139 qr/^UTF(16|32)$/i => '"UTF-$1"',
f2a2953c 140 );
141 # ASCII
a999c27c 142 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
a999c27c 143 define_alias('C' => 'ascii');
2d06ad02 144 define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
67d7b5ef 145 # Allow variants of iso-8859-1 etc.
146 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 147
67d7b5ef 148 # At least HP-UX has these.
149 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 150
67d7b5ef 151 # More HP stuff.
152 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 153
67d7b5ef 154 # The Official name of ASCII.
155 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 156
67d7b5ef 157 # This is a font issue, not an encoding issue.
158 # (The currency symbol of the Latin 1 upper half
159 # has been redefined as the euro symbol.)
5129552c 160 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
161
fcb875d4 162 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
bb7af5ca 163 => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
5129552c 164
67d7b5ef 165 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
fcb875d4 166 hebrew|arabic|baltic|vietnamese)$/ix =>
5129552c 167 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 168
67d7b5ef 169 # Common names for non-latin prefered MIME names
5129552c 170 define_alias( 'ascii' => 'US-ascii',
171 'cyrillic' => 'iso-8859-5',
172 'arabic' => 'iso-8859-6',
173 'greek' => 'iso-8859-7',
174 'hebrew' => 'iso-8859-8',
175 'thai' => 'iso-8859-11',
176 'tis620' => 'iso-8859-11',
177 );
5d030b67 178
67d7b5ef 179 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
180 # And Microsoft has their own naming (again, surprisingly).
fcb875d4 181 # And windows-* is registered in IANA!
2d06ad02 182 define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
5d030b67 183
67d7b5ef 184 # Sometimes seen with a leading zero.
c731e18e 185 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 186
3ef515df 187 # Mac Mappings
a999c27c 188 # predefined in *.ucm; unneeded
189 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
3ef515df 190 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
a999c27c 191 # Ououououou. gone. They are differente!
192 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
fcb875d4 193
f2a2953c 194 # Standardize on the dashed versions.
67d7b5ef 195 # define_alias( qr/\butf8$/i => 'utf-8' );
196 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
197 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
5129552c 198
a63c962f 199 unless ($Encode::ON_EBCDIC){
200 # for Encode::CN
67d7b5ef 201 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
202 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
203 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
204 # CP936 doesn't have vendor-addon for GBK, so they're identical.
205 define_alias( qr/^gbk$/i => '"cp936"');
206 # This fixes gb2312 vs. euc-cn confusion, practically
b9531c19 207 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
67d7b5ef 208 # for Encode::JP
209 define_alias( qr/\bjis$/i => '"7bit-jis"' );
210 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
211 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
212 define_alias( qr/\bujis$/i => '"euc-jp"' );
213 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
214 define_alias( qr/\bsjis$/i => '"shiftjis"' );
a63c962f 215 # for Encode::KR
67d7b5ef 216 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
217 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
218 # This fixes ksc5601 vs. euc-kr confusion, practically
219 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
220 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
221 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 222 # for Encode::TW
b0b300a3 223 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
4b291ae6 224 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
2d06ad02 225 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
b0b300a3 226 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
2d06ad02 227 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 228 }
f2a2953c 229 # utf8 is blessed :)
230 define_alias( qr/^UTF-8$/i => '"utf8"',);
67d7b5ef 231 # At last, Map white space and _ to '-'
5129552c 232 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
233}
234
2351;
236__END__
5d030b67 237
238# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
239# TODO: HP-UX '15' encodings japanese15 korean15 roi15
240# TODO: Cyrillic encoding ISO-IR-111 (useful?)
241# TODO: Armenian encoding ARMSCII-8
242# TODO: Hebrew encoding ISO-8859-8-1
243# TODO: Thai encoding TCVN
5d030b67 244# TODO: Vietnamese encodings VPS
245# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
246# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
247# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
248# Kannada Khmer Korean Laotian Malayalam Mongolian
249# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
250
5d030b67 251=head1 NAME
252
ce912cd4 253Encode::Alias - alias definitions to encodings
5d030b67 254
255=head1 SYNOPSIS
256
5129552c 257 use Encode;
258 use Encode::Alias;
5d030b67 259 define_alias( newName => ENCODING);
260
261=head1 DESCRIPTION
262
3ef515df 263Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 264either the name of an encoding or an encoding object (as described
3ef515df 265in L<Encode>).
5d030b67 266
267Currently I<newName> can be specified in the following ways:
268
269=over 4
270
271=item As a simple string.
272
273=item As a qr// compiled regular expression, e.g.:
274
275 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
276
0ab8f81e 277In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
278in order to allow C<$1> etc. to be substituted. The example is one
279way to alias names as used in X11 fonts to the MIME names for the
280iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 281
151b5d36 282(or, you don't have to do this yourself because this example is predefined)
283
3ef515df 284If you are using a regex here, you have to use the quotes as shown or
285it won't work. Also note that regex handling is tricky even for the
151b5d36 286experienced. Use this feature with caution.
5d030b67 287
288=item As a code reference, e.g.:
289
151b5d36 290 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 291
151b5d36 292The same effect as the example above in a different way. The coderef
293takes the alias name as an argument and returns a canonical name on
294success or undef if not. Note the second argument is not required.
295Use this with even more caution than the regex version.
5d030b67 296
5129552c 297=back
298
151b5d36 299=head3 Changes in code reference aliasing
300
301As of Encode 1.87, the older form
302
303 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
304
305no longer works.
306
307Encode up to 1.86 internally used "local $_" to implement ths older
308form. But consider the code below;
309
310 use Encode;
311 $_ = "eeeee" ;
312 while (/(e)/g) {
313 my $utf = decode('aliased-encoding-name', $1);
314 print "position:",pos,"\n";
315 }
316
317Prior to Encode 1.86 this fails because of "local $_".
318
0ab8f81e 319=head2 Alias overloading
5d030b67 320
3ef515df 321You can override predefined aliases by simply applying define_alias().
0ab8f81e 322The new alias is always evaluated first, and when neccessary,
323define_alias() flushes the internal cache to make the new definition
324available.
5d030b67 325
0ab8f81e 326 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 327 # superset of SHIFT_JIS
328
5129552c 329 define_alias( qr/shift.*jis$/i => '"cp932"' );
330 define_alias( qr/sjis$/i => '"cp932"' );
331
0ab8f81e 332If you want to zap all predefined aliases, you can use
5129552c 333
334 Encode::Alias->undef_aliases;
335
336to do so. And
337
338 Encode::Alias->init_aliases;
339
0ab8f81e 340gets the factory settings back.
5d030b67 341
342=head1 SEE ALSO
343
344L<Encode>, L<Encode::Supported>
345
5129552c 346=cut
5d030b67 347