Upgrade to Encode 0.96, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
5129552c 3use Encode;
4our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5d030b67 5our $DEBUG = 0;
6require Exporter;
7
8our @ISA = qw(Exporter);
9
10# Public, encouraged API is exported by default
5129552c 11
12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16 );
5d030b67 17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
5129552c 21sub find_alias
5d030b67 22{
23 my $class = shift;
24 local $_ = shift;
25 unless (exists $Alias{$_})
26 {
27 for (my $i=0; $i < @Alias; $i += 2)
28 {
29 my $alias = $Alias[$i];
30 my $val = $Alias[$i+1];
31 my $new;
32 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
33 {
34 $new = eval $val;
35 # $@ and warn "$val, $@";
36 }
37 elsif (ref($alias) eq 'CODE')
38 {
39 $new = $alias->($val);
40 }
41 elsif (lc($_) eq lc($alias))
42 {
43 $new = $val;
44 }
45 if (defined($new))
46 {
47 next if $new eq $_; # avoid (direct) recursion on bugs
48 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
49 if ($enc)
50 {
51 $Alias{$_} = $enc;
52 last;
53 }
54 }
55 }
56 }
57 return $Alias{$_};
58}
59
60sub define_alias
61{
62 while (@_)
63 {
64 my ($alias,$name) = splice(@_,0,2);
65 unshift(@Alias, $alias => $name); # newer one has precedence
66 # clear %Alias cache to allow overrides
67 if (ref($alias)){
5129552c 68 my @a = keys %Alias;
69 for my $k (@a){
5d030b67 70 if (ref($alias) eq 'Regexp' && $k =~ $alias)
71 {
72 $DEBUG and warn $k;
73 delete $Alias{$k};
74 }
75 elsif (ref($alias) eq 'CODE')
76 {
77 delete $Alias{$alias->($name)};
78 }
79 }
80 }else{
81 delete $Alias{$alias};
82 }
83 }
84}
85
5d030b67 86# Allow latin-1 style names as well
87 # 0 1 2 3 4 5 6 7 8 9 10
88our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 89# Allow winlatin1 style names as well
90our %Winlatin2cp = (
91 'latin1' => 1252,
92 'latin2' => 1250,
93 'cyrillic' => 1251,
94 'greek' => 1253,
95 'turkish' => 1254,
96 'hebrew' => 1255,
97 'arabic' => 1256,
98 'baltic' => 1257,
99 'vietnamese' => 1258,
100 );
101
5129552c 102init_aliases();
103
104sub undef_aliases{
105 @Alias = ();
106 %Alias = ();
107}
108
109sub init_aliases
110{
111 undef_aliases();
112# Allow variants of iso-8859-1 etc.
113 define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
114
115# At least HP-UX has these.
116 define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
117
118# More HP stuff.
119 define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
120
121# The Official name of ASCII.
122 define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
123
124# This is a font issue, not an encoding issue.
125# (The currency symbol of the Latin 1 upper half
126# has been redefined as the euro symbol.)
127 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
128
129 define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i
130 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
131
132 define_alias( qr/win(latin[12]|cyrillic|baltic|greek|turkish|
133 hebrew|arabic|baltic|vietnamese)$/ix =>
134 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 135
136# Common names for non-latin prefered MIME names
5129552c 137 define_alias( 'ascii' => 'US-ascii',
138 'cyrillic' => 'iso-8859-5',
139 'arabic' => 'iso-8859-6',
140 'greek' => 'iso-8859-7',
141 'hebrew' => 'iso-8859-8',
142 'thai' => 'iso-8859-11',
143 'tis620' => 'iso-8859-11',
144 );
5d030b67 145
146# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
147# And Microsoft has their own naming (again, surprisingly).
5129552c 148 define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
5d030b67 149
150# Sometimes seen with a leading zero.
5129552c 151 define_alias( qr/^cp037$/i => '"cp37"');
5d030b67 152
153# Ououououou.
5129552c 154 define_alias( qr/^macRomanian$/i => '"macRumanian"');
5d030b67 155
156# Standardize on the dashed versions.
5129552c 157 define_alias( qr/^utf8$/i => 'utf-8' );
158 define_alias( qr/^koi8r$/i => 'koi8-r' );
159 define_alias( qr/^koi8u$/i => 'koi8-u' );
160
161# for Encode::CN
162 define_alias( qr/euc.*cn$/i => '"euc-cn"' );
163 define_alias( qr/cn.*euc/i => '"euc-cn"' );
164
165# for Encode::JP
166 define_alias( qr/euc.*jp$/i => '"euc-jp"' );
167 define_alias( qr/jp.*euc/i => '"euc-jp"' );
168 define_alias( qr/ujis$/i => '"euc-jp"' );
169 define_alias( qr/shift.*jis$/i => '"shiftjis"' );
170 define_alias( qr/sjis$/i => '"shiftjis"' );
171 define_alias( qr/^jis$/i => '"7bit-jis"' );
172
173# for Encode::KR
174 define_alias( qr/euc.*kr$/i => '"euc-kr"' );
175 define_alias( qr/kr.*euc/i => '"euc-kr"' );
176
177# for Encode::TW
178 define_alias( qr/big-?5$/i => '"big5"' );
179 define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' );
180
181# At last, Map white space and _ to '-'
182 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
183}
184
1851;
186__END__
5d030b67 187
188# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
189# TODO: HP-UX '15' encodings japanese15 korean15 roi15
190# TODO: Cyrillic encoding ISO-IR-111 (useful?)
191# TODO: Armenian encoding ARMSCII-8
192# TODO: Hebrew encoding ISO-8859-8-1
193# TODO: Thai encoding TCVN
194# TODO: Korean encoding Johab
195# TODO: Vietnamese encodings VPS
196# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
197# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
198# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
199# Kannada Khmer Korean Laotian Malayalam Mongolian
200# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
201
5d030b67 202=head1 NAME
203
204Encode::Alias - alias defintions to encodings
205
206=head1 SYNOPSIS
207
5129552c 208 use Encode;
209 use Encode::Alias;
5d030b67 210 define_alias( newName => ENCODING);
211
212=head1 DESCRIPTION
213
214Allows newName to be used as am alias for ENCODING. ENCODING may be
215either the name of an encoding or and encoding object (as described in L<Encode>).
216
217Currently I<newName> can be specified in the following ways:
218
219=over 4
220
221=item As a simple string.
222
223=item As a qr// compiled regular expression, e.g.:
224
225 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
226
227In this case if I<ENCODING> is not a reference it is C<eval>-ed to
228allow C<$1> etc. to be subsituted. The example is one way to names as
229used in X11 font names to alias the MIME names for the iso-8859-*
230family. Note the double quote inside the single quote.
231
232If you are using regex here, you have to do so or it won't work in
233this case. Also not regex is tricky even for the experienced. Use it
234with caution.
235
236=item As a code reference, e.g.:
237
238 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
239
240In this case C<$_> will be set to the name that is being looked up and
241I<ENCODING> is passed to the sub as its first argument. The example
242is another way to names as used in X11 font names to alias the MIME
243names for the iso-8859-* family.
244
5129552c 245=back
246
247=head2 Alias overloading
5d030b67 248
249You can override predefined aliases by simply applying define_alias().
250New alias is always evaluated first and when neccessary define_alias()
251flushes internal cache to make new definition available.
252
253 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
254 # superset of SHIFT_JIS
255
5129552c 256 define_alias( qr/shift.*jis$/i => '"cp932"' );
257 define_alias( qr/sjis$/i => '"cp932"' );
258
259If you want to zap all predefined aliases, you can
260
261 Encode::Alias->undef_aliases;
262
263to do so. And
264
265 Encode::Alias->init_aliases;
266
267gets factory setting back.
268
5d030b67 269
270=head1 SEE ALSO
271
272L<Encode>, L<Encode::Supported>
273
5129552c 274=cut
5d030b67 275