3 no warnings 'redefine';
5 our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
10 # Public, encouraged API is exported by default
18 our @Alias; # ordered matching list
19 our %Alias; # cached known aliases
24 unless (exists $Alias{$find}) {
25 $Alias{$find} = undef; # Recursion guard
26 for (my $i=0; $i < @Alias; $i += 2){
27 my $alias = $Alias[$i];
28 my $val = $Alias[$i+1];
30 if (ref($alias) eq 'Regexp' && $find =~ $alias){
31 DEBUG and warn "eval $val";
33 DEBUG and $@ and warn "$val, $@";
34 }elsif (ref($alias) eq 'CODE'){
35 DEBUG and warn "$alias", "->", "($find)";
36 $new = $alias->($find);
37 }elsif (lc($find) eq lc($alias)){
41 next if $new eq $find; # avoid (direct) recursion on bugs
42 DEBUG and warn "$alias, $new";
43 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
50 # case insensitive search when canonical is not in all lowercase
52 unless ($Alias{$find}){
53 my $lcfind = lc($find);
54 for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){
55 $lcfind eq lc($name) or next;
56 $Alias{$find} = Encode::find_encoding($name);
57 DEBUG and warn "$find => $name";
63 if (my $e = $Alias{$find}){
68 warn "find_alias($class, $find)->name = $name";
75 my ($alias,$name) = splice(@_,0,2);
76 unshift(@Alias, $alias => $name); # newer one has precedence
78 # clear %Alias cache to allow overrides
81 if (ref($alias) eq 'Regexp' && $k =~ $alias){
82 DEBUG and warn "delete \$Alias\{$k\}";
85 elsif (ref($alias) eq 'CODE'){
86 DEBUG and warn "delete \$Alias\{$k\}";
87 delete $Alias{$alias->($name)};
91 DEBUG and warn "delete \$Alias\{$alias\}";
92 delete $Alias{$alias};
97 # Allow latin-1 style names as well
98 # 0 1 2 3 4 5 6 7 8 9 10
99 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
100 # Allow winlatin1 style names as well
110 'vietnamese' => 1258,
123 # Try all-lower-case version should all else fails
124 define_alias( qr/^(.*)$/ => '"\L$1"' );
127 define_alias( qr/^UTF-?7$/i => '"UTF-7"');
128 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
129 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
130 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
131 qr/^iso-10646-1$/i => '"UCS-2BE"' );
132 define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
133 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
134 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
137 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
138 define_alias('C' => 'ascii');
139 define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
140 # Allow variants of iso-8859-1 etc.
141 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
143 # At least HP-UX has these.
144 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
147 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
149 # The Official name of ASCII.
150 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
152 # This is a font issue, not an encoding issue.
153 # (The currency symbol of the Latin 1 upper half
154 # has been redefined as the euro symbol.)
155 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
157 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
158 => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
160 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
161 hebrew|arabic|baltic|vietnamese)$/ix =>
162 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
164 # Common names for non-latin prefered MIME names
165 define_alias( 'ascii' => 'US-ascii',
166 'cyrillic' => 'iso-8859-5',
167 'arabic' => 'iso-8859-6',
168 'greek' => 'iso-8859-7',
169 'hebrew' => 'iso-8859-8',
170 'thai' => 'iso-8859-11',
171 'tis620' => 'iso-8859-11',
174 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
175 # And Microsoft has their own naming (again, surprisingly).
176 # And windows-* is registered in IANA!
177 define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
179 # Sometimes seen with a leading zero.
180 # define_alias( qr/\bcp037\b/i => '"cp37"');
183 # predefined in *.ucm; unneeded
184 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
185 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
186 # Ououououou. gone. They are differente!
187 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
189 # Standardize on the dashed versions.
190 # define_alias( qr/\butf8$/i => '"utf-8"' );
191 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
193 unless ($Encode::ON_EBCDIC){
195 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
196 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
197 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
198 # CP936 doesn't have vendor-addon for GBK, so they're identical.
199 define_alias( qr/^gbk$/i => '"cp936"');
200 # This fixes gb2312 vs. euc-cn confusion, practically
201 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
203 define_alias( qr/\bjis$/i => '"7bit-jis"' );
204 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
205 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
206 define_alias( qr/\bujis$/i => '"euc-jp"' );
207 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
208 define_alias( qr/\bsjis$/i => '"shiftjis"' );
209 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
211 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
212 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
213 # This fixes ksc5601 vs. euc-kr confusion, practically
214 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
215 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
216 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
218 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
219 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
220 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
221 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
222 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
225 define_alias( qr/^UTF-8$/i => '"utf8"',);
226 # At last, Map white space and _ to '-'
227 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
233 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
234 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
235 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
236 # TODO: Armenian encoding ARMSCII-8
237 # TODO: Hebrew encoding ISO-8859-8-1
238 # TODO: Thai encoding TCVN
239 # TODO: Vietnamese encodings VPS
240 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
241 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
242 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
243 # Kannada Khmer Korean Laotian Malayalam Mongolian
244 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
248 Encode::Alias - alias definitions to encodings
254 define_alias( newName => ENCODING);
258 Allows newName to be used as an alias for ENCODING. ENCODING may be
259 either the name of an encoding or an encoding object (as described
262 Currently I<newName> can be specified in the following ways:
266 =item As a simple string.
268 =item As a qr// compiled regular expression, e.g.:
270 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
272 In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
273 in order to allow C<$1> etc. to be substituted. The example is one
274 way to alias names as used in X11 fonts to the MIME names for the
275 iso-8859-* family. Note the double quotes inside the single quotes.
277 (or, you don't have to do this yourself because this example is predefined)
279 If you are using a regex here, you have to use the quotes as shown or
280 it won't work. Also note that regex handling is tricky even for the
281 experienced. Use this feature with caution.
283 =item As a code reference, e.g.:
285 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
287 The same effect as the example above in a different way. The coderef
288 takes the alias name as an argument and returns a canonical name on
289 success or undef if not. Note the second argument is not required.
290 Use this with even more caution than the regex version.
294 =head3 Changes in code reference aliasing
296 As of Encode 1.87, the older form
298 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
302 Encode up to 1.86 internally used "local $_" to implement ths older
303 form. But consider the code below;
308 my $utf = decode('aliased-encoding-name', $1);
309 print "position:",pos,"\n";
312 Prior to Encode 1.86 this fails because of "local $_".
314 =head2 Alias overloading
316 You can override predefined aliases by simply applying define_alias().
317 The new alias is always evaluated first, and when neccessary,
318 define_alias() flushes the internal cache to make the new definition
321 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
322 # superset of SHIFT_JIS
324 define_alias( qr/shift.*jis$/i => '"cp932"' );
325 define_alias( qr/sjis$/i => '"cp932"' );
327 If you want to zap all predefined aliases, you can use
329 Encode::Alias->undef_aliases;
333 Encode::Alias->init_aliases;
335 gets the factory settings back.
339 L<Encode>, L<Encode::Supported>