Integrate perlio:
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
5129552c 3use Encode;
10c5ecbb 4our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5d030b67 5our $DEBUG = 0;
5d030b67 6
10c5ecbb 7use base qw(Exporter);
5d030b67 8
9# Public, encouraged API is exported by default
5129552c 10
fcb875d4 11our @EXPORT =
5129552c 12 qw (
13 define_alias
14 find_alias
15 );
5d030b67 16
17our @Alias; # ordered matching list
18our %Alias; # cached known aliases
19
5129552c 20sub find_alias
5d030b67 21{
22 my $class = shift;
23 local $_ = shift;
24 unless (exists $Alias{$_})
25 {
eaac0a15 26 $Alias{$_} = undef; # Recursion guard
5d030b67 27 for (my $i=0; $i < @Alias; $i += 2)
28 {
29 my $alias = $Alias[$i];
30 my $val = $Alias[$i+1];
31 my $new;
32 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
33 {
a63c962f 34 $DEBUG and warn "eval $val";
5d030b67 35 $new = eval $val;
36 # $@ and warn "$val, $@";
37 }
38 elsif (ref($alias) eq 'CODE')
39 {
a63c962f 40 $DEBUG and warn "$alias", "->", "($val)";
5d030b67 41 $new = $alias->($val);
42 }
43 elsif (lc($_) eq lc($alias))
44 {
45 $new = $val;
46 }
47 if (defined($new))
48 {
49 next if $new eq $_; # avoid (direct) recursion on bugs
a63c962f 50 $DEBUG and warn "$alias, $new";
5d030b67 51 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
52 if ($enc)
53 {
54 $Alias{$_} = $enc;
55 last;
56 }
57 }
58 }
59 }
a63c962f 60 if ($DEBUG){
61 my $name;
62 if (my $e = $Alias{$_}){
63 $name = $e->name;
64 }else{
65 $name = "";
66 }
67 warn "find_alias($class, $_)->name = $name";
68 }
5d030b67 69 return $Alias{$_};
70}
71
72sub define_alias
73{
74 while (@_)
75 {
76 my ($alias,$name) = splice(@_,0,2);
77 unshift(@Alias, $alias => $name); # newer one has precedence
78 # clear %Alias cache to allow overrides
79 if (ref($alias)){
5129552c 80 my @a = keys %Alias;
81 for my $k (@a){
5d030b67 82 if (ref($alias) eq 'Regexp' && $k =~ $alias)
83 {
a63c962f 84 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 85 delete $Alias{$k};
86 }
87 elsif (ref($alias) eq 'CODE')
88 {
a63c962f 89 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 90 delete $Alias{$alias->($name)};
91 }
92 }
93 }else{
a63c962f 94 $DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67 95 delete $Alias{$alias};
96 }
97 }
98}
99
5d030b67 100# Allow latin-1 style names as well
101 # 0 1 2 3 4 5 6 7 8 9 10
102our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 103# Allow winlatin1 style names as well
104our %Winlatin2cp = (
105 'latin1' => 1252,
106 'latin2' => 1250,
107 'cyrillic' => 1251,
108 'greek' => 1253,
109 'turkish' => 1254,
110 'hebrew' => 1255,
111 'arabic' => 1256,
112 'baltic' => 1257,
113 'vietnamese' => 1258,
114 );
115
5129552c 116init_aliases();
117
118sub undef_aliases{
119 @Alias = ();
120 %Alias = ();
121}
122
123sub init_aliases
124{
125 undef_aliases();
a999c27c 126
f2a2953c 127 # Try all-lower-case version should all else fails
a999c27c 128 define_alias( qr/^(.*)$/ => '"\L$1"' );
129
f2a2953c 130 # UTF/UCS stuff
131 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
132 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
133 qr/^iso-10646-1$/i => '"UCS-2BE"' );
134 define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
135 qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
136 qr/^UTF(16|32)$/i => '"UTF-$1"',
137 );
138 # ASCII
a999c27c 139 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
a999c27c 140 define_alias('C' => 'ascii');
67d7b5ef 141 # Allow variants of iso-8859-1 etc.
142 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 143
67d7b5ef 144 # At least HP-UX has these.
145 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 146
67d7b5ef 147 # More HP stuff.
148 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 149
67d7b5ef 150 # The Official name of ASCII.
151 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 152
67d7b5ef 153 # This is a font issue, not an encoding issue.
154 # (The currency symbol of the Latin 1 upper half
155 # has been redefined as the euro symbol.)
5129552c 156 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
157
fcb875d4 158 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
5129552c 159 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
160
67d7b5ef 161 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
fcb875d4 162 hebrew|arabic|baltic|vietnamese)$/ix =>
5129552c 163 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 164
67d7b5ef 165 # Common names for non-latin prefered MIME names
5129552c 166 define_alias( 'ascii' => 'US-ascii',
167 'cyrillic' => 'iso-8859-5',
168 'arabic' => 'iso-8859-6',
169 'greek' => 'iso-8859-7',
170 'hebrew' => 'iso-8859-8',
171 'thai' => 'iso-8859-11',
172 'tis620' => 'iso-8859-11',
173 );
5d030b67 174
67d7b5ef 175 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
176 # And Microsoft has their own naming (again, surprisingly).
fcb875d4 177 # And windows-* is registered in IANA!
67d7b5ef 178 define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
5d030b67 179
67d7b5ef 180 # Sometimes seen with a leading zero.
c731e18e 181 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 182
3ef515df 183 # Mac Mappings
a999c27c 184 # predefined in *.ucm; unneeded
185 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
3ef515df 186 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
a999c27c 187 # Ououououou. gone. They are differente!
188 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
fcb875d4 189
f2a2953c 190 # Standardize on the dashed versions.
67d7b5ef 191 # define_alias( qr/\butf8$/i => 'utf-8' );
192 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
193 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
5129552c 194
a63c962f 195 unless ($Encode::ON_EBCDIC){
196 # for Encode::CN
67d7b5ef 197 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
198 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
199 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
200 # CP936 doesn't have vendor-addon for GBK, so they're identical.
201 define_alias( qr/^gbk$/i => '"cp936"');
202 # This fixes gb2312 vs. euc-cn confusion, practically
203 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
204 # for Encode::JP
205 define_alias( qr/\bjis$/i => '"7bit-jis"' );
206 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
207 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
208 define_alias( qr/\bujis$/i => '"euc-jp"' );
209 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
210 define_alias( qr/\bsjis$/i => '"shiftjis"' );
a63c962f 211 # for Encode::KR
67d7b5ef 212 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
213 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
214 # This fixes ksc5601 vs. euc-kr confusion, practically
215 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
216 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
217 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 218 # for Encode::TW
b0b300a3 219 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
220 define_alias( qr/\bbig5-?et(?:en)$/i => '"big5-eten"' );
221 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
a63c962f 222 }
f2a2953c 223 # utf8 is blessed :)
224 define_alias( qr/^UTF-8$/i => '"utf8"',);
67d7b5ef 225 # At last, Map white space and _ to '-'
5129552c 226 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
227}
228
2291;
230__END__
5d030b67 231
232# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
233# TODO: HP-UX '15' encodings japanese15 korean15 roi15
234# TODO: Cyrillic encoding ISO-IR-111 (useful?)
235# TODO: Armenian encoding ARMSCII-8
236# TODO: Hebrew encoding ISO-8859-8-1
237# TODO: Thai encoding TCVN
5d030b67 238# TODO: Vietnamese encodings VPS
239# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
240# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
241# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
242# Kannada Khmer Korean Laotian Malayalam Mongolian
243# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
244
5d030b67 245=head1 NAME
246
ce912cd4 247Encode::Alias - alias definitions to encodings
5d030b67 248
249=head1 SYNOPSIS
250
5129552c 251 use Encode;
252 use Encode::Alias;
5d030b67 253 define_alias( newName => ENCODING);
254
255=head1 DESCRIPTION
256
3ef515df 257Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 258either the name of an encoding or an encoding object (as described
3ef515df 259in L<Encode>).
5d030b67 260
261Currently I<newName> can be specified in the following ways:
262
263=over 4
264
265=item As a simple string.
266
267=item As a qr// compiled regular expression, e.g.:
268
269 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
270
0ab8f81e 271In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
272in order to allow C<$1> etc. to be substituted. The example is one
273way to alias names as used in X11 fonts to the MIME names for the
274iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 275
3ef515df 276If you are using a regex here, you have to use the quotes as shown or
277it won't work. Also note that regex handling is tricky even for the
278experienced. Use it with caution.
5d030b67 279
280=item As a code reference, e.g.:
281
282 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
283
0ab8f81e 284In this case, C<$_> will be set to the name that is being looked up and
5d030b67 285I<ENCODING> is passed to the sub as its first argument. The example
3ef515df 286is another way to alias names as used in X11 fonts to the MIME names
287for the iso-8859-* family.
5d030b67 288
5129552c 289=back
290
0ab8f81e 291=head2 Alias overloading
5d030b67 292
3ef515df 293You can override predefined aliases by simply applying define_alias().
0ab8f81e 294The new alias is always evaluated first, and when neccessary,
295define_alias() flushes the internal cache to make the new definition
296available.
5d030b67 297
0ab8f81e 298 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 299 # superset of SHIFT_JIS
300
5129552c 301 define_alias( qr/shift.*jis$/i => '"cp932"' );
302 define_alias( qr/sjis$/i => '"cp932"' );
303
0ab8f81e 304If you want to zap all predefined aliases, you can use
5129552c 305
306 Encode::Alias->undef_aliases;
307
308to do so. And
309
310 Encode::Alias->init_aliases;
311
0ab8f81e 312gets the factory settings back.
5d030b67 313
314=head1 SEE ALSO
315
316L<Encode>, L<Encode::Supported>
317
5129552c 318=cut
5d030b67 319