4 our $VERSION = do { my @r = (q$Revision: 1.27 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8 our @ISA = qw(Exporter);
10 # Public, encouraged API is exported by default
18 our @Alias; # ordered matching list
19 our %Alias; # cached known aliases
25 unless (exists $Alias{$_})
27 $Alias{$_} = undef; # Recursion guard
28 for (my $i=0; $i < @Alias; $i += 2)
30 my $alias = $Alias[$i];
31 my $val = $Alias[$i+1];
33 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
35 $DEBUG and warn "eval $val";
37 # $@ and warn "$val, $@";
39 elsif (ref($alias) eq 'CODE')
41 $DEBUG and warn "$alias", "->", "($val)";
42 $new = $alias->($val);
44 elsif (lc($_) eq lc($alias))
50 next if $new eq $_; # avoid (direct) recursion on bugs
51 $DEBUG and warn "$alias, $new";
52 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
63 if (my $e = $Alias{$_}){
68 warn "find_alias($class, $_)->name = $name";
77 my ($alias,$name) = splice(@_,0,2);
78 unshift(@Alias, $alias => $name); # newer one has precedence
79 # clear %Alias cache to allow overrides
83 if (ref($alias) eq 'Regexp' && $k =~ $alias)
85 $DEBUG and warn "delete \$Alias\{$k\}";
88 elsif (ref($alias) eq 'CODE')
90 $DEBUG and warn "delete \$Alias\{$k\}";
91 delete $Alias{$alias->($name)};
95 $DEBUG and warn "delete \$Alias\{$alias\}";
96 delete $Alias{$alias};
101 # Allow latin-1 style names as well
102 # 0 1 2 3 4 5 6 7 8 9 10
103 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
104 # Allow winlatin1 style names as well
114 'vietnamese' => 1258,
128 # Try all-lower-case version should all else fails
129 define_alias( qr/^(.*)$/ => '"\L$1"' );
132 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
133 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
134 qr/^iso-10646-1$/i => '"UCS-2BE"' );
135 define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
136 qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
137 qr/^UTF(16|32)$/i => '"UTF-$1"',
140 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
141 define_alias('C' => 'ascii');
142 # Allow variants of iso-8859-1 etc.
143 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
145 # At least HP-UX has these.
146 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
149 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
151 # The Official name of ASCII.
152 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
154 # This is a font issue, not an encoding issue.
155 # (The currency symbol of the Latin 1 upper half
156 # has been redefined as the euro symbol.)
157 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
159 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
160 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
162 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
163 hebrew|arabic|baltic|vietnamese)$/ix =>
164 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
166 # Common names for non-latin prefered MIME names
167 define_alias( 'ascii' => 'US-ascii',
168 'cyrillic' => 'iso-8859-5',
169 'arabic' => 'iso-8859-6',
170 'greek' => 'iso-8859-7',
171 'hebrew' => 'iso-8859-8',
172 'thai' => 'iso-8859-11',
173 'tis620' => 'iso-8859-11',
176 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
177 # And Microsoft has their own naming (again, surprisingly).
178 # And windows-* is registered in IANA!
179 define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
181 # Sometimes seen with a leading zero.
182 # define_alias( qr/\bcp037\b/i => '"cp37"');
185 # predefined in *.ucm; unneeded
186 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
187 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
188 # Ououououou. gone. They are differente!
189 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
191 # Standardize on the dashed versions.
192 # define_alias( qr/\butf8$/i => 'utf-8' );
193 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
194 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
196 unless ($Encode::ON_EBCDIC){
198 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
199 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
200 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
201 # CP936 doesn't have vendor-addon for GBK, so they're identical.
202 define_alias( qr/^gbk$/i => '"cp936"');
203 # This fixes gb2312 vs. euc-cn confusion, practically
204 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
206 define_alias( qr/\bjis$/i => '"7bit-jis"' );
207 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
208 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
209 define_alias( qr/\bujis$/i => '"euc-jp"' );
210 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
211 define_alias( qr/\bsjis$/i => '"shiftjis"' );
213 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
214 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
215 # This fixes ksc5601 vs. euc-kr confusion, practically
216 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
217 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
218 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
220 define_alias( qr/\bbig-?5$/i => '"big5"' );
221 define_alias( qr/\bbig5-hk(?:scs)?$/i => '"big5-hkscs"' );
224 define_alias( qr/^UTF-8$/i => '"utf8"',);
225 # At last, Map white space and _ to '-'
226 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
232 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
233 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
234 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
235 # TODO: Armenian encoding ARMSCII-8
236 # TODO: Hebrew encoding ISO-8859-8-1
237 # TODO: Thai encoding TCVN
238 # TODO: Vietnamese encodings VPS
239 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
240 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
241 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
242 # Kannada Khmer Korean Laotian Malayalam Mongolian
243 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
247 Encode::Alias - alias definitions to encodings
253 define_alias( newName => ENCODING);
257 Allows newName to be used as an alias for ENCODING. ENCODING may be
258 either the name of an encoding or an encoding object (as described
261 Currently I<newName> can be specified in the following ways:
265 =item As a simple string.
267 =item As a qr// compiled regular expression, e.g.:
269 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
271 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
272 allow C<$1> etc. to be substituted. The example is one way to alias
273 names as used in X11 fonts to the MIME names for the iso-8859-*
274 family. Note the double quote inside the single quote.
276 If you are using a regex here, you have to use the quotes as shown or
277 it won't work. Also note that regex handling is tricky even for the
278 experienced. Use it with caution.
280 =item As a code reference, e.g.:
282 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
285 In this case C<$_> will be set to the name that is being looked up and
286 I<ENCODING> is passed to the sub as its first argument. The example
287 is another way to alias names as used in X11 fonts to the MIME names
288 for the iso-8859-* family.
292 =head2 Alias overloading
294 You can override predefined aliases by simply applying define_alias().
295 New alias is always evaluated first and when neccessary define_alias()
296 flushes internal cache to make new definition available.
298 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
299 # superset of SHIFT_JIS
301 define_alias( qr/shift.*jis$/i => '"cp932"' );
302 define_alias( qr/sjis$/i => '"cp932"' );
304 If you want to zap all predefined aliases, you can
306 Encode::Alias->undef_aliases;
310 Encode::Alias->init_aliases;
312 gets factory setting back.
317 L<Encode>, L<Encode::Supported>