Upgrade to Encode 1.11, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
5129552c 3use Encode;
3ef515df 4our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5d030b67 5our $DEBUG = 0;
6require Exporter;
7
8our @ISA = qw(Exporter);
9
10# Public, encouraged API is exported by default
5129552c 11
12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16 );
5d030b67 17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
5129552c 21sub find_alias
5d030b67 22{
23 my $class = shift;
24 local $_ = shift;
25 unless (exists $Alias{$_})
26 {
27 for (my $i=0; $i < @Alias; $i += 2)
28 {
29 my $alias = $Alias[$i];
30 my $val = $Alias[$i+1];
31 my $new;
32 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
33 {
a63c962f 34 $DEBUG and warn "eval $val";
5d030b67 35 $new = eval $val;
36 # $@ and warn "$val, $@";
37 }
38 elsif (ref($alias) eq 'CODE')
39 {
a63c962f 40 $DEBUG and warn "$alias", "->", "($val)";
5d030b67 41 $new = $alias->($val);
42 }
43 elsif (lc($_) eq lc($alias))
44 {
45 $new = $val;
46 }
47 if (defined($new))
48 {
49 next if $new eq $_; # avoid (direct) recursion on bugs
a63c962f 50 $DEBUG and warn "$alias, $new";
5d030b67 51 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
52 if ($enc)
53 {
54 $Alias{$_} = $enc;
55 last;
56 }
57 }
58 }
59 }
a63c962f 60 if ($DEBUG){
61 my $name;
62 if (my $e = $Alias{$_}){
63 $name = $e->name;
64 }else{
65 $name = "";
66 }
67 warn "find_alias($class, $_)->name = $name";
68 }
5d030b67 69 return $Alias{$_};
70}
71
72sub define_alias
73{
74 while (@_)
75 {
76 my ($alias,$name) = splice(@_,0,2);
77 unshift(@Alias, $alias => $name); # newer one has precedence
78 # clear %Alias cache to allow overrides
79 if (ref($alias)){
5129552c 80 my @a = keys %Alias;
81 for my $k (@a){
5d030b67 82 if (ref($alias) eq 'Regexp' && $k =~ $alias)
83 {
a63c962f 84 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 85 delete $Alias{$k};
86 }
87 elsif (ref($alias) eq 'CODE')
88 {
a63c962f 89 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 90 delete $Alias{$alias->($name)};
91 }
92 }
93 }else{
a63c962f 94 $DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67 95 delete $Alias{$alias};
96 }
97 }
98}
99
5d030b67 100# Allow latin-1 style names as well
101 # 0 1 2 3 4 5 6 7 8 9 10
102our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 103# Allow winlatin1 style names as well
104our %Winlatin2cp = (
105 'latin1' => 1252,
106 'latin2' => 1250,
107 'cyrillic' => 1251,
108 'greek' => 1253,
109 'turkish' => 1254,
110 'hebrew' => 1255,
111 'arabic' => 1256,
112 'baltic' => 1257,
113 'vietnamese' => 1258,
114 );
115
5129552c 116init_aliases();
117
118sub undef_aliases{
119 @Alias = ();
120 %Alias = ();
121}
122
123sub init_aliases
124{
125 undef_aliases();
67d7b5ef 126 # 'C' => 'US-ascii' so you can feed default locale directly.
127 define_alias('C' => 'US-ascii');
128 # Allow variants of iso-8859-1 etc.
129 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 130
67d7b5ef 131 # At least HP-UX has these.
132 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 133
67d7b5ef 134 # More HP stuff.
135 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 136
67d7b5ef 137 # The Official name of ASCII.
138 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 139
67d7b5ef 140 # This is a font issue, not an encoding issue.
141 # (The currency symbol of the Latin 1 upper half
142 # has been redefined as the euro symbol.)
5129552c 143 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
144
67d7b5ef 145 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
5129552c 146 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
147
67d7b5ef 148 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
5129552c 149 hebrew|arabic|baltic|vietnamese)$/ix =>
150 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 151
67d7b5ef 152 # Common names for non-latin prefered MIME names
5129552c 153 define_alias( 'ascii' => 'US-ascii',
154 'cyrillic' => 'iso-8859-5',
155 'arabic' => 'iso-8859-6',
156 'greek' => 'iso-8859-7',
157 'hebrew' => 'iso-8859-8',
158 'thai' => 'iso-8859-11',
159 'tis620' => 'iso-8859-11',
160 );
5d030b67 161
67d7b5ef 162 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
163 # And Microsoft has their own naming (again, surprisingly).
164 # And windows-* is registered in IANA!
165 define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
5d030b67 166
67d7b5ef 167 # Sometimes seen with a leading zero.
168 define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 169
3ef515df 170 # Mac Mappings
171 define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
172 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
67d7b5ef 173 # Ououououou.
174 define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
5d030b67 175
176# Standardize on the dashed versions.
67d7b5ef 177 # define_alias( qr/\butf8$/i => 'utf-8' );
178 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
179 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
5129552c 180
a63c962f 181 unless ($Encode::ON_EBCDIC){
182 # for Encode::CN
67d7b5ef 183 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
184 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
185 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
186 # CP936 doesn't have vendor-addon for GBK, so they're identical.
187 define_alias( qr/^gbk$/i => '"cp936"');
188 # This fixes gb2312 vs. euc-cn confusion, practically
189 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
190 # for Encode::JP
191 define_alias( qr/\bjis$/i => '"7bit-jis"' );
192 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
193 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
194 define_alias( qr/\bujis$/i => '"euc-jp"' );
195 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
196 define_alias( qr/\bsjis$/i => '"shiftjis"' );
a63c962f 197 # for Encode::KR
67d7b5ef 198 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
199 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
200 # This fixes ksc5601 vs. euc-kr confusion, practically
201 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
202 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
203 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 204 # for Encode::TW
67d7b5ef 205 define_alias( qr/\bbig-?5$/i => '"big5"' );
206 define_alias( qr/\bbig5-hk(?:scs)?$/i => '"big5-hkscs"' );
a63c962f 207 }
5129552c 208
67d7b5ef 209 # At last, Map white space and _ to '-'
5129552c 210 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
211}
212
2131;
214__END__
5d030b67 215
216# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
217# TODO: HP-UX '15' encodings japanese15 korean15 roi15
218# TODO: Cyrillic encoding ISO-IR-111 (useful?)
219# TODO: Armenian encoding ARMSCII-8
220# TODO: Hebrew encoding ISO-8859-8-1
221# TODO: Thai encoding TCVN
5d030b67 222# TODO: Vietnamese encodings VPS
223# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
224# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
225# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
226# Kannada Khmer Korean Laotian Malayalam Mongolian
227# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
228
5d030b67 229=head1 NAME
230
231Encode::Alias - alias defintions to encodings
232
233=head1 SYNOPSIS
234
5129552c 235 use Encode;
236 use Encode::Alias;
5d030b67 237 define_alias( newName => ENCODING);
238
239=head1 DESCRIPTION
240
3ef515df 241Allows newName to be used as an alias for ENCODING. ENCODING may be
242either the name of an encoding or an encoding object (as described
243in L<Encode>).
5d030b67 244
245Currently I<newName> can be specified in the following ways:
246
247=over 4
248
249=item As a simple string.
250
251=item As a qr// compiled regular expression, e.g.:
252
253 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
254
255In this case if I<ENCODING> is not a reference it is C<eval>-ed to
3ef515df 256allow C<$1> etc. to be substituted. The example is one way to alias
257names as used in X11 fonts to the MIME names for the iso-8859-*
258family. Note the double quote inside the single quote.
5d030b67 259
3ef515df 260If you are using a regex here, you have to use the quotes as shown or
261it won't work. Also note that regex handling is tricky even for the
262experienced. Use it with caution.
5d030b67 263
264=item As a code reference, e.g.:
265
266 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
267
3ef515df 268
5d030b67 269In this case C<$_> will be set to the name that is being looked up and
270I<ENCODING> is passed to the sub as its first argument. The example
3ef515df 271is another way to alias names as used in X11 fonts to the MIME names
272for the iso-8859-* family.
5d030b67 273
5129552c 274=back
275
276=head2 Alias overloading
5d030b67 277
3ef515df 278You can override predefined aliases by simply applying define_alias().
5d030b67 279New alias is always evaluated first and when neccessary define_alias()
280flushes internal cache to make new definition available.
281
282 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
283 # superset of SHIFT_JIS
284
5129552c 285 define_alias( qr/shift.*jis$/i => '"cp932"' );
286 define_alias( qr/sjis$/i => '"cp932"' );
287
288If you want to zap all predefined aliases, you can
289
290 Encode::Alias->undef_aliases;
291
292to do so. And
293
294 Encode::Alias->init_aliases;
295
296gets factory setting back.
297
5d030b67 298
299=head1 SEE ALSO
300
301L<Encode>, L<Encode::Supported>
302
5129552c 303=cut
5d030b67 304