3 no warnings 'redefine';
5 our $VERSION = do { my @r = (q$Revision: 1.36 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
10 # Public, encouraged API is exported by default
18 our @Alias; # ordered matching list
19 our %Alias; # cached known aliases
25 unless (exists $Alias{$find})
27 $Alias{$find} = undef; # Recursion guard
28 for (my $i=0; $i < @Alias; $i += 2)
30 my $alias = $Alias[$i];
31 my $val = $Alias[$i+1];
33 if (ref($alias) eq 'Regexp' && $find =~ $alias)
35 $DEBUG and warn "eval $val";
37 $DEBUG and $@ and warn "$val, $@";
39 elsif (ref($alias) eq 'CODE')
41 $DEBUG and warn "$alias", "->", "($find)";
42 $new = $alias->($find);
44 elsif (lc($find) eq lc($alias))
50 next if $new eq $find; # avoid (direct) recursion on bugs
51 $DEBUG and warn "$alias, $new";
52 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
63 if (my $e = $Alias{$find}){
68 warn "find_alias($class, $find)->name = $name";
77 my ($alias,$name) = splice(@_,0,2);
78 unshift(@Alias, $alias => $name); # newer one has precedence
79 # clear %Alias cache to allow overrides
83 if (ref($alias) eq 'Regexp' && $k =~ $alias)
85 $DEBUG and warn "delete \$Alias\{$k\}";
88 elsif (ref($alias) eq 'CODE')
90 $DEBUG and warn "delete \$Alias\{$k\}";
91 delete $Alias{$alias->($name)};
95 $DEBUG and warn "delete \$Alias\{$alias\}";
96 delete $Alias{$alias};
101 # Allow latin-1 style names as well
102 # 0 1 2 3 4 5 6 7 8 9 10
103 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
104 # Allow winlatin1 style names as well
114 'vietnamese' => 1258,
128 # Try all-lower-case version should all else fails
129 define_alias( qr/^(.*)$/ => '"\L$1"' );
132 define_alias( qr/^UTF-?7$/i => '"UTF-7"');
133 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
134 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
135 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
136 qr/^iso-10646-1$/i => '"UCS-2BE"' );
137 define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
138 qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
139 qr/^UTF(16|32)$/i => '"UTF-$1"',
142 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
143 define_alias('C' => 'ascii');
144 define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
145 # Allow variants of iso-8859-1 etc.
146 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
148 # At least HP-UX has these.
149 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
152 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
154 # The Official name of ASCII.
155 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
157 # This is a font issue, not an encoding issue.
158 # (The currency symbol of the Latin 1 upper half
159 # has been redefined as the euro symbol.)
160 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
162 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
163 => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
165 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
166 hebrew|arabic|baltic|vietnamese)$/ix =>
167 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
169 # Common names for non-latin prefered MIME names
170 define_alias( 'ascii' => 'US-ascii',
171 'cyrillic' => 'iso-8859-5',
172 'arabic' => 'iso-8859-6',
173 'greek' => 'iso-8859-7',
174 'hebrew' => 'iso-8859-8',
175 'thai' => 'iso-8859-11',
176 'tis620' => 'iso-8859-11',
179 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
180 # And Microsoft has their own naming (again, surprisingly).
181 # And windows-* is registered in IANA!
182 define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
184 # Sometimes seen with a leading zero.
185 # define_alias( qr/\bcp037\b/i => '"cp37"');
188 # predefined in *.ucm; unneeded
189 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
190 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
191 # Ououououou. gone. They are differente!
192 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
194 # Standardize on the dashed versions.
195 # define_alias( qr/\butf8$/i => 'utf-8' );
196 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
197 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
199 unless ($Encode::ON_EBCDIC){
201 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
202 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
203 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
204 # CP936 doesn't have vendor-addon for GBK, so they're identical.
205 define_alias( qr/^gbk$/i => '"cp936"');
206 # This fixes gb2312 vs. euc-cn confusion, practically
207 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
209 define_alias( qr/\bjis$/i => '"7bit-jis"' );
210 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
211 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
212 define_alias( qr/\bujis$/i => '"euc-jp"' );
213 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
214 define_alias( qr/\bsjis$/i => '"shiftjis"' );
216 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
217 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
218 # This fixes ksc5601 vs. euc-kr confusion, practically
219 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
220 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
221 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
223 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
224 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
225 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
226 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
227 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
230 define_alias( qr/^UTF-8$/i => '"utf8"',);
231 # At last, Map white space and _ to '-'
232 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
238 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
239 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
240 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
241 # TODO: Armenian encoding ARMSCII-8
242 # TODO: Hebrew encoding ISO-8859-8-1
243 # TODO: Thai encoding TCVN
244 # TODO: Vietnamese encodings VPS
245 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
246 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
247 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
248 # Kannada Khmer Korean Laotian Malayalam Mongolian
249 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
253 Encode::Alias - alias definitions to encodings
259 define_alias( newName => ENCODING);
263 Allows newName to be used as an alias for ENCODING. ENCODING may be
264 either the name of an encoding or an encoding object (as described
267 Currently I<newName> can be specified in the following ways:
271 =item As a simple string.
273 =item As a qr// compiled regular expression, e.g.:
275 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
277 In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
278 in order to allow C<$1> etc. to be substituted. The example is one
279 way to alias names as used in X11 fonts to the MIME names for the
280 iso-8859-* family. Note the double quotes inside the single quotes.
282 (or, you don't have to do this yourself because this example is predefined)
284 If you are using a regex here, you have to use the quotes as shown or
285 it won't work. Also note that regex handling is tricky even for the
286 experienced. Use this feature with caution.
288 =item As a code reference, e.g.:
290 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
292 The same effect as the example above in a different way. The coderef
293 takes the alias name as an argument and returns a canonical name on
294 success or undef if not. Note the second argument is not required.
295 Use this with even more caution than the regex version.
299 =head3 Changes in code reference aliasing
301 As of Encode 1.87, the older form
303 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
307 Encode up to 1.86 internally used "local $_" to implement ths older
308 form. But consider the code below;
313 my $utf = decode('aliased-encoding-name', $1);
314 print "position:",pos,"\n";
317 Prior to Encode 1.86 this fails because of "local $_".
319 =head2 Alias overloading
321 You can override predefined aliases by simply applying define_alias().
322 The new alias is always evaluated first, and when neccessary,
323 define_alias() flushes the internal cache to make the new definition
326 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
327 # superset of SHIFT_JIS
329 define_alias( qr/shift.*jis$/i => '"cp932"' );
330 define_alias( qr/sjis$/i => '"cp932"' );
332 If you want to zap all predefined aliases, you can use
334 Encode::Alias->undef_aliases;
338 Encode::Alias->init_aliases;
340 gets the factory settings back.
344 L<Encode>, L<Encode::Supported>