Show gnulibc_version in myconfig.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
5129552c 3use Encode;
2d06ad02 4our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5d030b67 5our $DEBUG = 0;
5d030b67 6
10c5ecbb 7use base qw(Exporter);
5d030b67 8
9# Public, encouraged API is exported by default
5129552c 10
fcb875d4 11our @EXPORT =
5129552c 12 qw (
13 define_alias
14 find_alias
15 );
5d030b67 16
17our @Alias; # ordered matching list
18our %Alias; # cached known aliases
19
5129552c 20sub find_alias
5d030b67 21{
22 my $class = shift;
23 local $_ = shift;
24 unless (exists $Alias{$_})
25 {
eaac0a15 26 $Alias{$_} = undef; # Recursion guard
5d030b67 27 for (my $i=0; $i < @Alias; $i += 2)
28 {
29 my $alias = $Alias[$i];
30 my $val = $Alias[$i+1];
31 my $new;
32 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
33 {
a63c962f 34 $DEBUG and warn "eval $val";
5d030b67 35 $new = eval $val;
36 # $@ and warn "$val, $@";
37 }
38 elsif (ref($alias) eq 'CODE')
39 {
a63c962f 40 $DEBUG and warn "$alias", "->", "($val)";
5d030b67 41 $new = $alias->($val);
42 }
43 elsif (lc($_) eq lc($alias))
44 {
45 $new = $val;
46 }
47 if (defined($new))
48 {
49 next if $new eq $_; # avoid (direct) recursion on bugs
a63c962f 50 $DEBUG and warn "$alias, $new";
5d030b67 51 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
52 if ($enc)
53 {
54 $Alias{$_} = $enc;
55 last;
56 }
57 }
58 }
59 }
a63c962f 60 if ($DEBUG){
61 my $name;
62 if (my $e = $Alias{$_}){
63 $name = $e->name;
64 }else{
65 $name = "";
66 }
67 warn "find_alias($class, $_)->name = $name";
68 }
5d030b67 69 return $Alias{$_};
70}
71
72sub define_alias
73{
74 while (@_)
75 {
76 my ($alias,$name) = splice(@_,0,2);
77 unshift(@Alias, $alias => $name); # newer one has precedence
78 # clear %Alias cache to allow overrides
79 if (ref($alias)){
5129552c 80 my @a = keys %Alias;
81 for my $k (@a){
5d030b67 82 if (ref($alias) eq 'Regexp' && $k =~ $alias)
83 {
a63c962f 84 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 85 delete $Alias{$k};
86 }
87 elsif (ref($alias) eq 'CODE')
88 {
a63c962f 89 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67 90 delete $Alias{$alias->($name)};
91 }
92 }
93 }else{
a63c962f 94 $DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67 95 delete $Alias{$alias};
96 }
97 }
98}
99
5d030b67 100# Allow latin-1 style names as well
101 # 0 1 2 3 4 5 6 7 8 9 10
102our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67 103# Allow winlatin1 style names as well
104our %Winlatin2cp = (
105 'latin1' => 1252,
106 'latin2' => 1250,
107 'cyrillic' => 1251,
108 'greek' => 1253,
109 'turkish' => 1254,
110 'hebrew' => 1255,
111 'arabic' => 1256,
112 'baltic' => 1257,
113 'vietnamese' => 1258,
114 );
115
5129552c 116init_aliases();
117
118sub undef_aliases{
119 @Alias = ();
120 %Alias = ();
121}
122
123sub init_aliases
124{
125 undef_aliases();
a999c27c 126
f2a2953c 127 # Try all-lower-case version should all else fails
a999c27c 128 define_alias( qr/^(.*)$/ => '"\L$1"' );
129
f2a2953c 130 # UTF/UCS stuff
11067275 131 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
132 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
126bf8bf 133 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
11067275 134 qr/^iso-10646-1$/i => '"UCS-2BE"' );
135 define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
136 qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
137 qr/^UTF(16|32)$/i => '"UTF-$1"',
f2a2953c 138 );
139 # ASCII
a999c27c 140 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
a999c27c 141 define_alias('C' => 'ascii');
2d06ad02 142 define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
67d7b5ef 143 # Allow variants of iso-8859-1 etc.
144 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 145
67d7b5ef 146 # At least HP-UX has these.
147 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 148
67d7b5ef 149 # More HP stuff.
150 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 151
67d7b5ef 152 # The Official name of ASCII.
153 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 154
67d7b5ef 155 # This is a font issue, not an encoding issue.
156 # (The currency symbol of the Latin 1 upper half
157 # has been redefined as the euro symbol.)
5129552c 158 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
159
fcb875d4 160 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
5129552c 161 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
162
67d7b5ef 163 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
fcb875d4 164 hebrew|arabic|baltic|vietnamese)$/ix =>
5129552c 165 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 166
67d7b5ef 167 # Common names for non-latin prefered MIME names
5129552c 168 define_alias( 'ascii' => 'US-ascii',
169 'cyrillic' => 'iso-8859-5',
170 'arabic' => 'iso-8859-6',
171 'greek' => 'iso-8859-7',
172 'hebrew' => 'iso-8859-8',
173 'thai' => 'iso-8859-11',
174 'tis620' => 'iso-8859-11',
175 );
5d030b67 176
67d7b5ef 177 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
178 # And Microsoft has their own naming (again, surprisingly).
fcb875d4 179 # And windows-* is registered in IANA!
2d06ad02 180 define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
5d030b67 181
67d7b5ef 182 # Sometimes seen with a leading zero.
c731e18e 183 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 184
3ef515df 185 # Mac Mappings
a999c27c 186 # predefined in *.ucm; unneeded
187 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
3ef515df 188 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
a999c27c 189 # Ououououou. gone. They are differente!
190 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
fcb875d4 191
f2a2953c 192 # Standardize on the dashed versions.
67d7b5ef 193 # define_alias( qr/\butf8$/i => 'utf-8' );
194 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
195 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
5129552c 196
a63c962f 197 unless ($Encode::ON_EBCDIC){
198 # for Encode::CN
67d7b5ef 199 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
200 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
201 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
202 # CP936 doesn't have vendor-addon for GBK, so they're identical.
203 define_alias( qr/^gbk$/i => '"cp936"');
204 # This fixes gb2312 vs. euc-cn confusion, practically
205 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
206 # for Encode::JP
207 define_alias( qr/\bjis$/i => '"7bit-jis"' );
208 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
209 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
210 define_alias( qr/\bujis$/i => '"euc-jp"' );
211 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
212 define_alias( qr/\bsjis$/i => '"shiftjis"' );
a63c962f 213 # for Encode::KR
67d7b5ef 214 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
215 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
216 # This fixes ksc5601 vs. euc-kr confusion, practically
217 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
218 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
219 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 220 # for Encode::TW
b0b300a3 221 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
222 define_alias( qr/\bbig5-?et(?:en)$/i => '"big5-eten"' );
2d06ad02 223 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
b0b300a3 224 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
2d06ad02 225 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 226 }
f2a2953c 227 # utf8 is blessed :)
228 define_alias( qr/^UTF-8$/i => '"utf8"',);
67d7b5ef 229 # At last, Map white space and _ to '-'
5129552c 230 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
231}
232
2331;
234__END__
5d030b67 235
236# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
237# TODO: HP-UX '15' encodings japanese15 korean15 roi15
238# TODO: Cyrillic encoding ISO-IR-111 (useful?)
239# TODO: Armenian encoding ARMSCII-8
240# TODO: Hebrew encoding ISO-8859-8-1
241# TODO: Thai encoding TCVN
5d030b67 242# TODO: Vietnamese encodings VPS
243# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
244# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
245# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
246# Kannada Khmer Korean Laotian Malayalam Mongolian
247# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
248
5d030b67 249=head1 NAME
250
ce912cd4 251Encode::Alias - alias definitions to encodings
5d030b67 252
253=head1 SYNOPSIS
254
5129552c 255 use Encode;
256 use Encode::Alias;
5d030b67 257 define_alias( newName => ENCODING);
258
259=head1 DESCRIPTION
260
3ef515df 261Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 262either the name of an encoding or an encoding object (as described
3ef515df 263in L<Encode>).
5d030b67 264
265Currently I<newName> can be specified in the following ways:
266
267=over 4
268
269=item As a simple string.
270
271=item As a qr// compiled regular expression, e.g.:
272
273 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
274
0ab8f81e 275In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
276in order to allow C<$1> etc. to be substituted. The example is one
277way to alias names as used in X11 fonts to the MIME names for the
278iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 279
3ef515df 280If you are using a regex here, you have to use the quotes as shown or
281it won't work. Also note that regex handling is tricky even for the
282experienced. Use it with caution.
5d030b67 283
284=item As a code reference, e.g.:
285
286 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
287
0ab8f81e 288In this case, C<$_> will be set to the name that is being looked up and
5d030b67 289I<ENCODING> is passed to the sub as its first argument. The example
3ef515df 290is another way to alias names as used in X11 fonts to the MIME names
291for the iso-8859-* family.
5d030b67 292
5129552c 293=back
294
0ab8f81e 295=head2 Alias overloading
5d030b67 296
3ef515df 297You can override predefined aliases by simply applying define_alias().
0ab8f81e 298The new alias is always evaluated first, and when neccessary,
299define_alias() flushes the internal cache to make the new definition
300available.
5d030b67 301
0ab8f81e 302 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 303 # superset of SHIFT_JIS
304
5129552c 305 define_alias( qr/shift.*jis$/i => '"cp932"' );
306 define_alias( qr/sjis$/i => '"cp932"' );
307
0ab8f81e 308If you want to zap all predefined aliases, you can use
5129552c 309
310 Encode::Alias->undef_aliases;
311
312to do so. And
313
314 Encode::Alias->init_aliases;
315
0ab8f81e 316gets the factory settings back.
5d030b67 317
318=head1 SEE ALSO
319
320L<Encode>, L<Encode::Supported>
321
5129552c 322=cut
5d030b67 323