Add characters that VOS forbids in filenames, mentioned by Paul Green
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67 1package Encode::Alias;
2use strict;
656ebd29 3use warnings;
1485817e 4no warnings 'redefine';
5129552c 5use Encode;
51e4e64d 6our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
8f139f4c 7sub DEBUG () { 0 }
5d030b67 8
10c5ecbb 9use base qw(Exporter);
5d030b67 10
11# Public, encouraged API is exported by default
5129552c 12
d1256cb1 13our @EXPORT =
14 qw (
15 define_alias
16 find_alias
17);
5d030b67 18
d1256cb1 19our @Alias; # ordered matching list
20our %Alias; # cached known aliases
5d030b67 21
d1256cb1 22sub find_alias {
5d030b67 23 my $class = shift;
d1256cb1 24 my $find = shift;
25 unless ( exists $Alias{$find} ) {
26 $Alias{$find} = undef; # Recursion guard
27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
28 my $alias = $Alias[$i];
29 my $val = $Alias[ $i + 1 ];
30 my $new;
31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
32 DEBUG and warn "eval $val";
33 $new = eval $val;
34 DEBUG and $@ and warn "$val, $@";
35 }
36 elsif ( ref($alias) eq 'CODE' ) {
37 DEBUG and warn "$alias", "->", "($find)";
38 $new = $alias->($find);
39 }
40 elsif ( lc($find) eq lc($alias) ) {
41 $new = $val;
42 }
43 if ( defined($new) ) {
44 next if $new eq $find; # avoid (direct) recursion on bugs
45 DEBUG and warn "$alias, $new";
46 my $enc =
47 ( ref($new) ) ? $new : Encode::find_encoding($new);
48 if ($enc) {
49 $Alias{$find} = $enc;
50 last;
51 }
52 }
53 }
54
55 # case insensitive search when canonical is not in all lowercase
56 # RT ticket #7835
57 unless ( $Alias{$find} ) {
58 my $lcfind = lc($find);
59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
60 {
61 $lcfind eq lc($name) or next;
62 $Alias{$find} = Encode::find_encoding($name);
63 DEBUG and warn "$find => $name";
64 }
65 }
5d030b67 66 }
d1256cb1 67 if (DEBUG) {
68 my $name;
69 if ( my $e = $Alias{$find} ) {
70 $name = $e->name;
71 }
72 else {
73 $name = "";
74 }
75 warn "find_alias($class, $find)->name = $name";
a63c962f 76 }
151b5d36 77 return $Alias{$find};
5d030b67 78}
79
d1256cb1 80sub define_alias {
81 while (@_) {
82 my ( $alias, $name ) = splice( @_, 0, 2 );
83 unshift( @Alias, $alias => $name ); # newer one has precedence
84 if ( ref($alias) ) {
85
86 # clear %Alias cache to allow overrides
87 my @a = keys %Alias;
88 for my $k (@a) {
89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90 DEBUG and warn "delete \$Alias\{$k\}";
91 delete $Alias{$k};
92 }
93 elsif ( ref($alias) eq 'CODE' ) {
94 DEBUG and warn "delete \$Alias\{$k\}";
95 delete $Alias{ $alias->($name) };
96 }
97 }
98 }
99 else {
100 DEBUG and warn "delete \$Alias\{$alias\}";
101 delete $Alias{$alias};
102 }
5d030b67 103 }
104}
105
5d030b67 106# Allow latin-1 style names as well
8f1ed24a 107# 0 1 2 3 4 5 6 7 8 9 10
5d030b67 108our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
d1256cb1 109
5d030b67 110# Allow winlatin1 style names as well
d1256cb1 111our %Winlatin2cp = (
112 'latin1' => 1252,
113 'latin2' => 1250,
114 'cyrillic' => 1251,
115 'greek' => 1253,
116 'turkish' => 1254,
117 'hebrew' => 1255,
118 'arabic' => 1256,
119 'baltic' => 1257,
120 'vietnamese' => 1258,
121);
5d030b67 122
5129552c 123init_aliases();
124
d1256cb1 125sub undef_aliases {
5129552c 126 @Alias = ();
127 %Alias = ();
128}
129
d1256cb1 130sub init_aliases {
5129552c 131 undef_aliases();
d1256cb1 132
f2a2953c 133 # Try all-lower-case version should all else fails
a999c27c 134 define_alias( qr/^(.*)$/ => '"\L$1"' );
135
f2a2953c 136 # UTF/UCS stuff
d1256cb1 137 define_alias( qr/^UTF-?7$/i => '"UTF-7"' );
138 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
139 define_alias(
140 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
141 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
142 qr/^iso-10646-1$/i => '"UCS-2BE"'
143 );
144 define_alias(
145 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
146 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
147 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
148 );
149
f2a2953c 150 # ASCII
d1256cb1 151 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
152 define_alias( 'C' => 'ascii' );
153 define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' );
154
67d7b5ef 155 # Allow variants of iso-8859-1 etc.
156 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 157
67d7b5ef 158 # At least HP-UX has these.
159 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 160
67d7b5ef 161 # More HP stuff.
d1256cb1 162 define_alias(
163 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
164 '"${1}8"' );
5129552c 165
67d7b5ef 166 # The Official name of ASCII.
167 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 168
67d7b5ef 169 # This is a font issue, not an encoding issue.
170 # (The currency symbol of the Latin 1 upper half
171 # has been redefined as the euro symbol.)
5129552c 172 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
173
d1256cb1 174 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
175'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
176 );
5129552c 177
d1256cb1 178 define_alias(
179 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
180 hebrew|arabic|baltic|vietnamese)$/ix =>
181 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
182 );
5d030b67 183
3c4b39be 184 # Common names for non-latin preferred MIME names
d1256cb1 185 define_alias(
186 'ascii' => 'US-ascii',
187 'cyrillic' => 'iso-8859-5',
188 'arabic' => 'iso-8859-6',
189 'greek' => 'iso-8859-7',
190 'hebrew' => 'iso-8859-8',
191 'thai' => 'iso-8859-11',
d1256cb1 192 );
51e4e64d 193 # RT #20781
194 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
5d030b67 195
67d7b5ef 196 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
197 # And Microsoft has their own naming (again, surprisingly).
d1256cb1 198 # And windows-* is registered in IANA!
199 define_alias(
200 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
5d030b67 201
67d7b5ef 202 # Sometimes seen with a leading zero.
c731e18e 203 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 204
3ef515df 205 # Mac Mappings
a999c27c 206 # predefined in *.ucm; unneeded
207 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
d1256cb1 208 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
209
a999c27c 210 # Ououououou. gone. They are differente!
211 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
d1256cb1 212
f2a2953c 213 # Standardize on the dashed versions.
cf9f87ce 214 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
5129552c 215
d1256cb1 216 unless ($Encode::ON_EBCDIC) {
217
a63c962f 218 # for Encode::CN
d1256cb1 219 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
220 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
221
222 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
223 # CP936 doesn't have vendor-addon for GBK, so they're identical.
224 define_alias( qr/^gbk$/i => '"cp936"' );
225
226 # This fixes gb2312 vs. euc-cn confusion, practically
227 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
228
229 # for Encode::JP
230 define_alias( qr/\bjis$/i => '"7bit-jis"' );
231 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
232 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
233 define_alias( qr/\bujis$/i => '"euc-jp"' );
234 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
235 define_alias( qr/\bsjis$/i => '"shiftjis"' );
236 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
237
a63c962f 238 # for Encode::KR
d1256cb1 239 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
240 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
241
242 # This fixes ksc5601 vs. euc-kr confusion, practically
243 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
244 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
245 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
246
a63c962f 247 # for Encode::TW
d1256cb1 248 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
249 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
250 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
251 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
252 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 253 }
d1256cb1 254
f2a2953c 255 # utf8 is blessed :)
d1256cb1 256 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
257
67d7b5ef 258 # At last, Map white space and _ to '-'
5129552c 259 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
260}
261
2621;
263__END__
5d030b67 264
265# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
266# TODO: HP-UX '15' encodings japanese15 korean15 roi15
267# TODO: Cyrillic encoding ISO-IR-111 (useful?)
268# TODO: Armenian encoding ARMSCII-8
269# TODO: Hebrew encoding ISO-8859-8-1
270# TODO: Thai encoding TCVN
5d030b67 271# TODO: Vietnamese encodings VPS
272# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
273# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
274# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
275# Kannada Khmer Korean Laotian Malayalam Mongolian
276# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
277
5d030b67 278=head1 NAME
279
ce912cd4 280Encode::Alias - alias definitions to encodings
5d030b67 281
282=head1 SYNOPSIS
283
5129552c 284 use Encode;
285 use Encode::Alias;
5d030b67 286 define_alias( newName => ENCODING);
287
288=head1 DESCRIPTION
289
3ef515df 290Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 291either the name of an encoding or an encoding object (as described
3ef515df 292in L<Encode>).
5d030b67 293
294Currently I<newName> can be specified in the following ways:
295
296=over 4
297
298=item As a simple string.
299
300=item As a qr// compiled regular expression, e.g.:
301
302 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
303
0ab8f81e 304In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
305in order to allow C<$1> etc. to be substituted. The example is one
306way to alias names as used in X11 fonts to the MIME names for the
307iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 308
151b5d36 309(or, you don't have to do this yourself because this example is predefined)
310
3ef515df 311If you are using a regex here, you have to use the quotes as shown or
312it won't work. Also note that regex handling is tricky even for the
151b5d36 313experienced. Use this feature with caution.
5d030b67 314
315=item As a code reference, e.g.:
316
151b5d36 317 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 318
151b5d36 319The same effect as the example above in a different way. The coderef
320takes the alias name as an argument and returns a canonical name on
321success or undef if not. Note the second argument is not required.
322Use this with even more caution than the regex version.
5d030b67 323
5129552c 324=back
325
151b5d36 326=head3 Changes in code reference aliasing
327
328As of Encode 1.87, the older form
329
330 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
331
332no longer works.
333
334Encode up to 1.86 internally used "local $_" to implement ths older
335form. But consider the code below;
336
337 use Encode;
338 $_ = "eeeee" ;
339 while (/(e)/g) {
340 my $utf = decode('aliased-encoding-name', $1);
341 print "position:",pos,"\n";
342 }
343
344Prior to Encode 1.86 this fails because of "local $_".
345
0ab8f81e 346=head2 Alias overloading
5d030b67 347
3ef515df 348You can override predefined aliases by simply applying define_alias().
3c4b39be 349The new alias is always evaluated first, and when necessary,
0ab8f81e 350define_alias() flushes the internal cache to make the new definition
351available.
5d030b67 352
0ab8f81e 353 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67 354 # superset of SHIFT_JIS
355
5129552c 356 define_alias( qr/shift.*jis$/i => '"cp932"' );
357 define_alias( qr/sjis$/i => '"cp932"' );
358
0ab8f81e 359If you want to zap all predefined aliases, you can use
5129552c 360
361 Encode::Alias->undef_aliases;
362
363to do so. And
364
365 Encode::Alias->init_aliases;
366
0ab8f81e 367gets the factory settings back.
5d030b67 368
369=head1 SEE ALSO
370
371L<Encode>, L<Encode::Supported>
372
5129552c 373=cut
5d030b67 374