4 no warnings 'redefine';
5 our $VERSION = do { my @r = ( q$Revision: 2.9 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
10 # Public, encouraged API is exported by default
18 our @Alias; # ordered matching list
19 our %Alias; # cached known aliases
25 unless ( exists $Alias{$find} ) {
26 $Alias{$find} = undef; # Recursion guard
27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
28 my $alias = $Alias[$i];
29 my $val = $Alias[ $i + 1 ];
31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
32 DEBUG and warn "eval $val";
34 DEBUG and $@ and warn "$val, $@";
36 elsif ( ref($alias) eq 'CODE' ) {
37 DEBUG and warn "$alias", "->", "($find)";
38 $new = $alias->($find);
40 elsif ( lc($find) eq lc($alias) ) {
43 if ( defined($new) ) {
44 next if $new eq $find; # avoid (direct) recursion on bugs
45 DEBUG and warn "$alias, $new";
47 ( ref($new) ) ? $new : Encode::find_encoding($new);
55 # case insensitive search when canonical is not in all lowercase
57 unless ( $Alias{$find} ) {
58 my $lcfind = lc($find);
59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
61 $lcfind eq lc($name) or next;
62 $Alias{$find} = Encode::find_encoding($name);
63 DEBUG and warn "$find => $name";
69 if ( my $e = $Alias{$find} ) {
75 warn "find_alias($class, $find)->name = $name";
82 my ( $alias, $name ) = splice( @_, 0, 2 );
83 unshift( @Alias, $alias => $name ); # newer one has precedence
86 # clear %Alias cache to allow overrides
89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90 DEBUG and warn "delete \$Alias\{$k\}";
93 elsif ( ref($alias) eq 'CODE' ) {
94 DEBUG and warn "delete \$Alias\{$k\}";
95 delete $Alias{ $alias->($name) };
100 DEBUG and warn "delete \$Alias\{$alias\}";
101 delete $Alias{$alias};
106 # Allow latin-1 style names as well
107 # 0 1 2 3 4 5 6 7 8 9 10
108 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
110 # Allow winlatin1 style names as well
120 'vietnamese' => 1258,
134 # Try all-lower-case version should all else fails
135 define_alias( qr/^(.*)$/ => '"\L$1"' );
138 define_alias( qr/^UTF-?7$/i => '"UTF-7"' );
139 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
141 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
142 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
143 qr/^iso-10646-1$/i => '"UCS-2BE"'
146 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
147 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
148 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
152 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
153 define_alias( 'C' => 'ascii' );
154 define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
156 # Allow variants of iso-8859-1 etc.
157 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
159 # At least HP-UX has these.
160 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
164 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
167 # The Official name of ASCII.
168 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
170 # This is a font issue, not an encoding issue.
171 # (The currency symbol of the Latin 1 upper half
172 # has been redefined as the euro symbol.)
173 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
175 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
176 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
180 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
181 hebrew|arabic|baltic|vietnamese)$/ix =>
182 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
185 # Common names for non-latin preferred MIME names
187 'ascii' => 'US-ascii',
188 'cyrillic' => 'iso-8859-5',
189 'arabic' => 'iso-8859-6',
190 'greek' => 'iso-8859-7',
191 'hebrew' => 'iso-8859-8',
192 'thai' => 'iso-8859-11',
195 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
197 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
198 # And Microsoft has their own naming (again, surprisingly).
199 # And windows-* is registered in IANA!
201 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
203 # Sometimes seen with a leading zero.
204 # define_alias( qr/\bcp037\b/i => '"cp37"');
207 # predefined in *.ucm; unneeded
208 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
209 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
211 # Ououououou. gone. They are differente!
212 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
214 # Standardize on the dashed versions.
215 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
217 unless ($Encode::ON_EBCDIC) {
220 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
221 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
223 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
224 # CP936 doesn't have vendor-addon for GBK, so they're identical.
225 define_alias( qr/^gbk$/i => '"cp936"' );
227 # This fixes gb2312 vs. euc-cn confusion, practically
228 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
231 define_alias( qr/\bjis$/i => '"7bit-jis"' );
232 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
233 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
234 define_alias( qr/\bujis$/i => '"euc-jp"' );
235 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
236 define_alias( qr/\bsjis$/i => '"shiftjis"' );
237 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
240 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
241 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
243 # This fixes ksc5601 vs. euc-kr confusion, practically
244 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
245 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
246 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
249 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
250 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
251 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
252 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
253 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
257 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
259 # At last, Map white space and _ to '-'
260 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
266 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
267 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
268 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
269 # TODO: Armenian encoding ARMSCII-8
270 # TODO: Hebrew encoding ISO-8859-8-1
271 # TODO: Thai encoding TCVN
272 # TODO: Vietnamese encodings VPS
273 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
274 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
275 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
276 # Kannada Khmer Korean Laotian Malayalam Mongolian
277 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
281 Encode::Alias - alias definitions to encodings
287 define_alias( newName => ENCODING);
291 Allows newName to be used as an alias for ENCODING. ENCODING may be
292 either the name of an encoding or an encoding object (as described
295 Currently I<newName> can be specified in the following ways:
299 =item As a simple string.
301 =item As a qr// compiled regular expression, e.g.:
303 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
305 In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
306 in order to allow C<$1> etc. to be substituted. The example is one
307 way to alias names as used in X11 fonts to the MIME names for the
308 iso-8859-* family. Note the double quotes inside the single quotes.
310 (or, you don't have to do this yourself because this example is predefined)
312 If you are using a regex here, you have to use the quotes as shown or
313 it won't work. Also note that regex handling is tricky even for the
314 experienced. Use this feature with caution.
316 =item As a code reference, e.g.:
318 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
320 The same effect as the example above in a different way. The coderef
321 takes the alias name as an argument and returns a canonical name on
322 success or undef if not. Note the second argument is not required.
323 Use this with even more caution than the regex version.
327 =head3 Changes in code reference aliasing
329 As of Encode 1.87, the older form
331 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
335 Encode up to 1.86 internally used "local $_" to implement ths older
336 form. But consider the code below;
341 my $utf = decode('aliased-encoding-name', $1);
342 print "position:",pos,"\n";
345 Prior to Encode 1.86 this fails because of "local $_".
347 =head2 Alias overloading
349 You can override predefined aliases by simply applying define_alias().
350 The new alias is always evaluated first, and when necessary,
351 define_alias() flushes the internal cache to make the new definition
354 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
355 # superset of SHIFT_JIS
357 define_alias( qr/shift.*jis$/i => '"cp932"' );
358 define_alias( qr/sjis$/i => '"cp932"' );
360 If you want to zap all predefined aliases, you can use
362 Encode::Alias->undef_aliases;
366 Encode::Alias->init_aliases;
368 gets the factory settings back.
372 L<Encode>, L<Encode::Supported>