Upgrade to Encode 1.32, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
5129552c 3use Encode;
fcb875d4 4our $VERSION = do { my @r = (q$Revision: 1.27 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5d030b67 5our $DEBUG = 0;
6require Exporter;
7
8our @ISA = qw(Exporter);
9
10# Public, encouraged API is exported by default
5129552c 11
fcb875d4 12our @EXPORT =
5129552c 13 qw (
14 define_alias
15 find_alias
16 );
5d030b67 17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
5129552c 21sub find_alias
5d030b67 22{
23 my $class = shift;
24 local $_ = shift;
25 unless (exists $Alias{$_})
26 {
eaac0a15 27 $Alias{$_} = undef; # Recursion guard
5d030b67 28 for (my $i=0; $i < @Alias; $i += 2)
29 {
30 my $alias = $Alias[$i];
31 my $val = $Alias[$i+1];
32 my $new;
33 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
34 {
a63c962f 35 $DEBUG and warn "eval $val";
5d030b67 36 $new = eval $val;
37 # $@ and warn "$val, $@";
38 }
39 elsif (ref($alias) eq 'CODE')
40 {
a63c962f 41 $DEBUG and warn "$alias", "->", "($val)";
5d030b67 42 $new = $alias->($val);
43 }
44 elsif (lc($_) eq lc($alias))
45 {
46 $new = $val;
47 }
48 if (defined($new))
49 {
50 next if $new eq $_; # avoid (direct) recursion on bugs
a63c962f 51 $DEBUG and warn "$alias, $new";
5d030b67 52 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
53 if ($enc)
54 {
55 $Alias{$_} = $enc;
56 last;
57 }
58 }
59 }
60 }
a63c962f 61 if ($DEBUG){
62 my $name;
63 if (my $e = $Alias{$_}){
64 $name = $e->name;
65 }else{
66 $name = "";
67 }
68 warn "find_alias($class, $_)->name = $name";
69 }
5d030b67 70 return $Alias{$_};
71}
72
73sub define_alias
74{
75 while (@_)
76 {
77 my ($alias,$name) = splice(@_,0,2);
78 unshift(@Alias, $alias => $name); # newer one has precedence
79 # clear %Alias cache to allow overrides
80 if (ref($alias)){
5129552c 81 my @a = keys %Alias;
82 for my $k (@a){
5d030b67 83 if (ref($alias) eq 'Regexp' && $k =~ $alias)
84 {
a63c962f 85 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 86 delete $Alias{$k};
87 }
88 elsif (ref($alias) eq 'CODE')
89 {
a63c962f 90 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 91 delete $Alias{$alias->($name)};
92 }
93 }
94 }else{
a63c962f 95 $DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67 96 delete $Alias{$alias};
97 }
98 }
99}
100
5d030b67 101# Allow latin-1 style names as well
102 # 0 1 2 3 4 5 6 7 8 9 10
103our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 104# Allow winlatin1 style names as well
105our %Winlatin2cp = (
106 'latin1' => 1252,
107 'latin2' => 1250,
108 'cyrillic' => 1251,
109 'greek' => 1253,
110 'turkish' => 1254,
111 'hebrew' => 1255,
112 'arabic' => 1256,
113 'baltic' => 1257,
114 'vietnamese' => 1258,
115 );
116
5129552c 117init_aliases();
118
119sub undef_aliases{
120 @Alias = ();
121 %Alias = ();
122}
123
124sub init_aliases
125{
126 undef_aliases();
a999c27c 127
f2a2953c 128 # Try all-lower-case version should all else fails
a999c27c 129 define_alias( qr/^(.*)$/ => '"\L$1"' );
130
f2a2953c 131 # UTF/UCS stuff
132 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
133 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
134 qr/^iso-10646-1$/i => '"UCS-2BE"' );
135 define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
136 qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
137 qr/^UTF(16|32)$/i => '"UTF-$1"',
138 );
139 # ASCII
a999c27c 140 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
a999c27c 141 define_alias('C' => 'ascii');
67d7b5ef 142 # Allow variants of iso-8859-1 etc.
143 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 144
67d7b5ef 145 # At least HP-UX has these.
146 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 147
67d7b5ef 148 # More HP stuff.
149 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 150
67d7b5ef 151 # The Official name of ASCII.
152 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 153
67d7b5ef 154 # This is a font issue, not an encoding issue.
155 # (The currency symbol of the Latin 1 upper half
156 # has been redefined as the euro symbol.)
5129552c 157 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
158
fcb875d4 159 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
5129552c 160 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
161
67d7b5ef 162 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
fcb875d4 163 hebrew|arabic|baltic|vietnamese)$/ix =>
5129552c 164 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 165
67d7b5ef 166 # Common names for non-latin prefered MIME names
5129552c 167 define_alias( 'ascii' => 'US-ascii',
168 'cyrillic' => 'iso-8859-5',
169 'arabic' => 'iso-8859-6',
170 'greek' => 'iso-8859-7',
171 'hebrew' => 'iso-8859-8',
172 'thai' => 'iso-8859-11',
173 'tis620' => 'iso-8859-11',
174 );
5d030b67 175
67d7b5ef 176 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
177 # And Microsoft has their own naming (again, surprisingly).
fcb875d4 178 # And windows-* is registered in IANA!
67d7b5ef 179 define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
5d030b67 180
67d7b5ef 181 # Sometimes seen with a leading zero.
c731e18e 182 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 183
3ef515df 184 # Mac Mappings
a999c27c 185 # predefined in *.ucm; unneeded
186 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
3ef515df 187 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
a999c27c 188 # Ououououou. gone. They are differente!
189 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
fcb875d4 190
f2a2953c 191 # Standardize on the dashed versions.
67d7b5ef 192 # define_alias( qr/\butf8$/i => 'utf-8' );
193 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
194 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
5129552c 195
a63c962f 196 unless ($Encode::ON_EBCDIC){
197 # for Encode::CN
67d7b5ef 198 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
199 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
200 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
201 # CP936 doesn't have vendor-addon for GBK, so they're identical.
202 define_alias( qr/^gbk$/i => '"cp936"');
203 # This fixes gb2312 vs. euc-cn confusion, practically
204 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
205 # for Encode::JP
206 define_alias( qr/\bjis$/i => '"7bit-jis"' );
207 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
208 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
209 define_alias( qr/\bujis$/i => '"euc-jp"' );
210 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
211 define_alias( qr/\bsjis$/i => '"shiftjis"' );
a63c962f 212 # for Encode::KR
67d7b5ef 213 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
214 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
215 # This fixes ksc5601 vs. euc-kr confusion, practically
216 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
217 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
218 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 219 # for Encode::TW
67d7b5ef 220 define_alias( qr/\bbig-?5$/i => '"big5"' );
221 define_alias( qr/\bbig5-hk(?:scs)?$/i => '"big5-hkscs"' );
a63c962f 222 }
f2a2953c 223 # utf8 is blessed :)
224 define_alias( qr/^UTF-8$/i => '"utf8"',);
67d7b5ef 225 # At last, Map white space and _ to '-'
5129552c 226 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
227}
228
2291;
230__END__
5d030b67 231
232# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
233# TODO: HP-UX '15' encodings japanese15 korean15 roi15
234# TODO: Cyrillic encoding ISO-IR-111 (useful?)
235# TODO: Armenian encoding ARMSCII-8
236# TODO: Hebrew encoding ISO-8859-8-1
237# TODO: Thai encoding TCVN
5d030b67 238# TODO: Vietnamese encodings VPS
239# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
240# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
241# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
242# Kannada Khmer Korean Laotian Malayalam Mongolian
243# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
244
5d030b67 245=head1 NAME
246
ce912cd4 247Encode::Alias - alias definitions to encodings
5d030b67 248
249=head1 SYNOPSIS
250
5129552c 251 use Encode;
252 use Encode::Alias;
5d030b67 253 define_alias( newName => ENCODING);
254
255=head1 DESCRIPTION
256
3ef515df 257Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 258either the name of an encoding or an encoding object (as described
3ef515df 259in L<Encode>).
5d030b67 260
261Currently I<newName> can be specified in the following ways:
262
263=over 4
264
265=item As a simple string.
266
267=item As a qr// compiled regular expression, e.g.:
268
269 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
270
271In this case if I<ENCODING> is not a reference it is C<eval>-ed to
3ef515df 272allow C<$1> etc. to be substituted. The example is one way to alias
273names as used in X11 fonts to the MIME names for the iso-8859-*
274family. Note the double quote inside the single quote.
5d030b67 275
3ef515df 276If you are using a regex here, you have to use the quotes as shown or
277it won't work. Also note that regex handling is tricky even for the
278experienced. Use it with caution.
5d030b67 279
280=item As a code reference, e.g.:
281
282 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
283
3ef515df 284
5d030b67 285In this case C<$_> will be set to the name that is being looked up and
286I<ENCODING> is passed to the sub as its first argument. The example
3ef515df 287is another way to alias names as used in X11 fonts to the MIME names
288for the iso-8859-* family.
5d030b67 289
5129552c 290=back
291
292=head2 Alias overloading
5d030b67 293
3ef515df 294You can override predefined aliases by simply applying define_alias().
5d030b67 295New alias is always evaluated first and when neccessary define_alias()
296flushes internal cache to make new definition available.
297
298 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
299 # superset of SHIFT_JIS
300
5129552c 301 define_alias( qr/shift.*jis$/i => '"cp932"' );
302 define_alias( qr/sjis$/i => '"cp932"' );
303
304If you want to zap all predefined aliases, you can
305
306 Encode::Alias->undef_aliases;
307
308to do so. And
309
310 Encode::Alias->init_aliases;
311
312gets factory setting back.
313
5d030b67 314
315=head1 SEE ALSO
316
317L<Encode>, L<Encode::Supported>
318
5129552c 319=cut
5d030b67 320