4 no warnings 'redefine';
6 our $VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
11 # Public, encouraged API is exported by default
19 our @Alias; # ordered matching list
20 our %Alias; # cached known aliases
25 unless ( exists $Alias{$find} ) {
26 $Alias{$find} = undef; # Recursion guard
27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
28 my $alias = $Alias[$i];
29 my $val = $Alias[ $i + 1 ];
31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
32 DEBUG and warn "eval $val";
34 DEBUG and $@ and warn "$val, $@";
36 elsif ( ref($alias) eq 'CODE' ) {
37 DEBUG and warn "$alias", "->", "($find)";
38 $new = $alias->($find);
40 elsif ( lc($find) eq lc($alias) ) {
43 if ( defined($new) ) {
44 next if $new eq $find; # avoid (direct) recursion on bugs
45 DEBUG and warn "$alias, $new";
47 ( ref($new) ) ? $new : Encode::find_encoding($new);
55 # case insensitive search when canonical is not in all lowercase
57 unless ( $Alias{$find} ) {
58 my $lcfind = lc($find);
59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
61 $lcfind eq lc($name) or next;
62 $Alias{$find} = Encode::find_encoding($name);
63 DEBUG and warn "$find => $name";
69 if ( my $e = $Alias{$find} ) {
75 warn "find_alias($class, $find)->name = $name";
82 my ( $alias, $name ) = splice( @_, 0, 2 );
83 unshift( @Alias, $alias => $name ); # newer one has precedence
86 # clear %Alias cache to allow overrides
89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90 DEBUG and warn "delete \$Alias\{$k\}";
93 elsif ( ref($alias) eq 'CODE' ) {
94 DEBUG and warn "delete \$Alias\{$k\}";
95 delete $Alias{ $alias->($name) };
100 DEBUG and warn "delete \$Alias\{$alias\}";
101 delete $Alias{$alias};
106 # Allow latin-1 style names as well
107 # 0 1 2 3 4 5 6 7 8 9 10
108 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
110 # Allow winlatin1 style names as well
120 'vietnamese' => 1258,
133 # Try all-lower-case version should all else fails
134 define_alias( qr/^(.*)$/ => '"\L$1"' );
137 define_alias( qr/^UTF-?7$/i => '"UTF-7"' );
138 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
140 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
141 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
142 qr/^iso-10646-1$/i => '"UCS-2BE"'
145 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
146 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
147 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
151 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
152 define_alias( 'C' => 'ascii' );
153 define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' );
155 # Allow variants of iso-8859-1 etc.
156 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
158 # At least HP-UX has these.
159 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
163 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
166 # The Official name of ASCII.
167 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
169 # This is a font issue, not an encoding issue.
170 # (The currency symbol of the Latin 1 upper half
171 # has been redefined as the euro symbol.)
172 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
174 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
175 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
179 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
180 hebrew|arabic|baltic|vietnamese)$/ix =>
181 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
184 # Common names for non-latin preferred MIME names
186 'ascii' => 'US-ascii',
187 'cyrillic' => 'iso-8859-5',
188 'arabic' => 'iso-8859-6',
189 'greek' => 'iso-8859-7',
190 'hebrew' => 'iso-8859-8',
191 'thai' => 'iso-8859-11',
192 'tis620' => 'iso-8859-11',
195 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
196 # And Microsoft has their own naming (again, surprisingly).
197 # And windows-* is registered in IANA!
199 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
201 # Sometimes seen with a leading zero.
202 # define_alias( qr/\bcp037\b/i => '"cp37"');
205 # predefined in *.ucm; unneeded
206 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
207 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
209 # Ououououou. gone. They are differente!
210 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
212 # Standardize on the dashed versions.
213 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
215 unless ($Encode::ON_EBCDIC) {
218 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
219 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
221 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
222 # CP936 doesn't have vendor-addon for GBK, so they're identical.
223 define_alias( qr/^gbk$/i => '"cp936"' );
225 # This fixes gb2312 vs. euc-cn confusion, practically
226 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
229 define_alias( qr/\bjis$/i => '"7bit-jis"' );
230 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
231 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
232 define_alias( qr/\bujis$/i => '"euc-jp"' );
233 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
234 define_alias( qr/\bsjis$/i => '"shiftjis"' );
235 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
238 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
239 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
241 # This fixes ksc5601 vs. euc-kr confusion, practically
242 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
243 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
244 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
247 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
248 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
249 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
250 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
251 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
255 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
257 # At last, Map white space and _ to '-'
258 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
264 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
265 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
266 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
267 # TODO: Armenian encoding ARMSCII-8
268 # TODO: Hebrew encoding ISO-8859-8-1
269 # TODO: Thai encoding TCVN
270 # TODO: Vietnamese encodings VPS
271 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
272 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
273 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
274 # Kannada Khmer Korean Laotian Malayalam Mongolian
275 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
279 Encode::Alias - alias definitions to encodings
285 define_alias( newName => ENCODING);
289 Allows newName to be used as an alias for ENCODING. ENCODING may be
290 either the name of an encoding or an encoding object (as described
293 Currently I<newName> can be specified in the following ways:
297 =item As a simple string.
299 =item As a qr// compiled regular expression, e.g.:
301 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
303 In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
304 in order to allow C<$1> etc. to be substituted. The example is one
305 way to alias names as used in X11 fonts to the MIME names for the
306 iso-8859-* family. Note the double quotes inside the single quotes.
308 (or, you don't have to do this yourself because this example is predefined)
310 If you are using a regex here, you have to use the quotes as shown or
311 it won't work. Also note that regex handling is tricky even for the
312 experienced. Use this feature with caution.
314 =item As a code reference, e.g.:
316 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
318 The same effect as the example above in a different way. The coderef
319 takes the alias name as an argument and returns a canonical name on
320 success or undef if not. Note the second argument is not required.
321 Use this with even more caution than the regex version.
325 =head3 Changes in code reference aliasing
327 As of Encode 1.87, the older form
329 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
333 Encode up to 1.86 internally used "local $_" to implement ths older
334 form. But consider the code below;
339 my $utf = decode('aliased-encoding-name', $1);
340 print "position:",pos,"\n";
343 Prior to Encode 1.86 this fails because of "local $_".
345 =head2 Alias overloading
347 You can override predefined aliases by simply applying define_alias().
348 The new alias is always evaluated first, and when necessary,
349 define_alias() flushes the internal cache to make the new definition
352 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
353 # superset of SHIFT_JIS
355 define_alias( qr/shift.*jis$/i => '"cp932"' );
356 define_alias( qr/sjis$/i => '"cp932"' );
358 If you want to zap all predefined aliases, you can use
360 Encode::Alias->undef_aliases;
364 Encode::Alias->init_aliases;
366 gets the factory settings back.
370 L<Encode>, L<Encode::Supported>