Integrate mainline
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
5129552c 3use Encode;
48e3bbdd 4our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5d030b67 5our $DEBUG = 0;
6require Exporter;
7
8our @ISA = qw(Exporter);
9
10# Public, encouraged API is exported by default
5129552c 11
12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16 );
5d030b67 17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
5129552c 21sub find_alias
5d030b67 22{
23 my $class = shift;
24 local $_ = shift;
25 unless (exists $Alias{$_})
26 {
27 for (my $i=0; $i < @Alias; $i += 2)
28 {
29 my $alias = $Alias[$i];
30 my $val = $Alias[$i+1];
31 my $new;
32 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
33 {
a63c962f 34 $DEBUG and warn "eval $val";
5d030b67 35 $new = eval $val;
36 # $@ and warn "$val, $@";
37 }
38 elsif (ref($alias) eq 'CODE')
39 {
a63c962f 40 $DEBUG and warn "$alias", "->", "($val)";
5d030b67 41 $new = $alias->($val);
42 }
43 elsif (lc($_) eq lc($alias))
44 {
45 $new = $val;
46 }
47 if (defined($new))
48 {
49 next if $new eq $_; # avoid (direct) recursion on bugs
a63c962f 50 $DEBUG and warn "$alias, $new";
5d030b67 51 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
52 if ($enc)
53 {
54 $Alias{$_} = $enc;
55 last;
56 }
57 }
58 }
59 }
a63c962f 60 if ($DEBUG){
61 my $name;
62 if (my $e = $Alias{$_}){
63 $name = $e->name;
64 }else{
65 $name = "";
66 }
67 warn "find_alias($class, $_)->name = $name";
68 }
5d030b67 69 return $Alias{$_};
70}
71
72sub define_alias
73{
74 while (@_)
75 {
76 my ($alias,$name) = splice(@_,0,2);
77 unshift(@Alias, $alias => $name); # newer one has precedence
78 # clear %Alias cache to allow overrides
79 if (ref($alias)){
5129552c 80 my @a = keys %Alias;
81 for my $k (@a){
5d030b67 82 if (ref($alias) eq 'Regexp' && $k =~ $alias)
83 {
a63c962f 84 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 85 delete $Alias{$k};
86 }
87 elsif (ref($alias) eq 'CODE')
88 {
a63c962f 89 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 90 delete $Alias{$alias->($name)};
91 }
92 }
93 }else{
a63c962f 94 $DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67 95 delete $Alias{$alias};
96 }
97 }
98}
99
5d030b67 100# Allow latin-1 style names as well
101 # 0 1 2 3 4 5 6 7 8 9 10
102our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 103# Allow winlatin1 style names as well
104our %Winlatin2cp = (
105 'latin1' => 1252,
106 'latin2' => 1250,
107 'cyrillic' => 1251,
108 'greek' => 1253,
109 'turkish' => 1254,
110 'hebrew' => 1255,
111 'arabic' => 1256,
112 'baltic' => 1257,
113 'vietnamese' => 1258,
114 );
115
5129552c 116init_aliases();
117
118sub undef_aliases{
119 @Alias = ();
120 %Alias = ();
121}
122
123sub init_aliases
124{
125 undef_aliases();
48e3bbdd 126 # 'C' => 'US-ascii' so you can feed default locale directly.
127 define_alias('C' => 'US-ascii');
128 # Allow variants of iso-8859-1 etc.
129 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 130
48e3bbdd 131 # At least HP-UX has these.
132 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 133
48e3bbdd 134 # More HP stuff.
135 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 136
48e3bbdd 137 # The Official name of ASCII.
138 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 139
48e3bbdd 140 # This is a font issue, not an encoding issue.
141 # (The currency symbol of the Latin 1 upper half
142 # has been redefined as the euro symbol.)
5129552c 143 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
144
48e3bbdd 145 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
5129552c 146 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
147
48e3bbdd 148 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
5129552c 149 hebrew|arabic|baltic|vietnamese)$/ix =>
150 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 151
48e3bbdd 152 # Common names for non-latin prefered MIME names
5129552c 153 define_alias( 'ascii' => 'US-ascii',
154 'cyrillic' => 'iso-8859-5',
155 'arabic' => 'iso-8859-6',
156 'greek' => 'iso-8859-7',
157 'hebrew' => 'iso-8859-8',
158 'thai' => 'iso-8859-11',
159 'tis620' => 'iso-8859-11',
160 );
5d030b67 161
48e3bbdd 162 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
163 # And Microsoft has their own naming (again, surprisingly).
164 # And windows-* is registered in IANA!
165 define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
5d030b67 166
48e3bbdd 167 # Sometimes seen with a leading zero.
168 define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 169
48e3bbdd 170 # Ououououou.
171 define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
5d030b67 172
173# Standardize on the dashed versions.
48e3bbdd 174 # define_alias( qr/\butf8$/i => 'utf-8' );
175 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
176 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
5129552c 177
a63c962f 178 unless ($Encode::ON_EBCDIC){
179 # for Encode::CN
48e3bbdd 180 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
181 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
182 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
183 # CP936 doesn't have vendor-addon for GBK, so they're identical.
184 define_alias( qr/^gbk$/i => '"cp936"');
185 # This fixes gb2312 vs. euc-cn confusion, practically
186 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
187 # for Encode::JP
188 define_alias( qr/\bjis$/i => '"7bit-jis"' );
189 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
190 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
191 define_alias( qr/\bujis$/i => '"euc-jp"' );
192 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
193 define_alias( qr/\bsjis$/i => '"shiftjis"' );
a63c962f 194 # for Encode::KR
48e3bbdd 195 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
196 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
197 # This fixes ksc5601 vs. euc-kr confusion, practically
198 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
199 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
200 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 201 # for Encode::TW
48e3bbdd 202 define_alias( qr/\bbig-?5$/i => '"big5"' );
203 define_alias( qr/\bbig5-hk(?:scs)?$/i => '"big5-hkscs"' );
a63c962f 204 }
5129552c 205
48e3bbdd 206 # At last, Map white space and _ to '-'
5129552c 207 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
208}
209
2101;
211__END__
5d030b67 212
213# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
214# TODO: HP-UX '15' encodings japanese15 korean15 roi15
215# TODO: Cyrillic encoding ISO-IR-111 (useful?)
216# TODO: Armenian encoding ARMSCII-8
217# TODO: Hebrew encoding ISO-8859-8-1
218# TODO: Thai encoding TCVN
5d030b67 219# TODO: Vietnamese encodings VPS
220# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
221# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
222# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
223# Kannada Khmer Korean Laotian Malayalam Mongolian
224# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
225
5d030b67 226=head1 NAME
227
228Encode::Alias - alias defintions to encodings
229
230=head1 SYNOPSIS
231
5129552c 232 use Encode;
233 use Encode::Alias;
5d030b67 234 define_alias( newName => ENCODING);
235
236=head1 DESCRIPTION
237
238Allows newName to be used as am alias for ENCODING. ENCODING may be
239either the name of an encoding or and encoding object (as described in L<Encode>).
240
241Currently I<newName> can be specified in the following ways:
242
243=over 4
244
245=item As a simple string.
246
247=item As a qr// compiled regular expression, e.g.:
248
249 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
250
251In this case if I<ENCODING> is not a reference it is C<eval>-ed to
252allow C<$1> etc. to be subsituted. The example is one way to names as
253used in X11 font names to alias the MIME names for the iso-8859-*
254family. Note the double quote inside the single quote.
255
256If you are using regex here, you have to do so or it won't work in
257this case. Also not regex is tricky even for the experienced. Use it
258with caution.
259
260=item As a code reference, e.g.:
261
262 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
263
264In this case C<$_> will be set to the name that is being looked up and
265I<ENCODING> is passed to the sub as its first argument. The example
266is another way to names as used in X11 font names to alias the MIME
267names for the iso-8859-* family.
268
5129552c 269=back
270
271=head2 Alias overloading
5d030b67 272
273You can override predefined aliases by simply applying define_alias().
274New alias is always evaluated first and when neccessary define_alias()
275flushes internal cache to make new definition available.
276
277 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
278 # superset of SHIFT_JIS
279
5129552c 280 define_alias( qr/shift.*jis$/i => '"cp932"' );
281 define_alias( qr/sjis$/i => '"cp932"' );
282
283If you want to zap all predefined aliases, you can
284
285 Encode::Alias->undef_aliases;
286
287to do so. And
288
289 Encode::Alias->init_aliases;
290
291gets factory setting back.
292
5d030b67 293
294=head1 SEE ALSO
295
296L<Encode>, L<Encode::Supported>
297
5129552c 298=cut
5d030b67 299