4 our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8 our @ISA = qw(Exporter);
10 # Public, encouraged API is exported by default
18 our @Alias; # ordered matching list
19 our %Alias; # cached known aliases
25 unless (exists $Alias{$_})
27 for (my $i=0; $i < @Alias; $i += 2)
29 my $alias = $Alias[$i];
30 my $val = $Alias[$i+1];
32 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
34 $DEBUG and warn "eval $val";
36 # $@ and warn "$val, $@";
38 elsif (ref($alias) eq 'CODE')
40 $DEBUG and warn "$alias", "->", "($val)";
41 $new = $alias->($val);
43 elsif (lc($_) eq lc($alias))
49 next if $new eq $_; # avoid (direct) recursion on bugs
50 $DEBUG and warn "$alias, $new";
51 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
62 if (my $e = $Alias{$_}){
67 warn "find_alias($class, $_)->name = $name";
76 my ($alias,$name) = splice(@_,0,2);
77 unshift(@Alias, $alias => $name); # newer one has precedence
78 # clear %Alias cache to allow overrides
82 if (ref($alias) eq 'Regexp' && $k =~ $alias)
84 $DEBUG and warn "delete \$Alias\{$k\}";
87 elsif (ref($alias) eq 'CODE')
89 $DEBUG and warn "delete \$Alias\{$k\}";
90 delete $Alias{$alias->($name)};
94 $DEBUG and warn "delete \$Alias\{$alias\}";
95 delete $Alias{$alias};
100 # Allow latin-1 style names as well
101 # 0 1 2 3 4 5 6 7 8 9 10
102 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
103 # Allow winlatin1 style names as well
113 'vietnamese' => 1258,
126 # 'C' => 'US-ascii' so you can feed default locale directly.
127 define_alias('C' => 'US-ascii');
128 # Allow variants of iso-8859-1 etc.
129 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
131 # At least HP-UX has these.
132 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
135 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
137 # The Official name of ASCII.
138 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
140 # This is a font issue, not an encoding issue.
141 # (The currency symbol of the Latin 1 upper half
142 # has been redefined as the euro symbol.)
143 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
145 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
146 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
148 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
149 hebrew|arabic|baltic|vietnamese)$/ix =>
150 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
152 # Common names for non-latin prefered MIME names
153 define_alias( 'ascii' => 'US-ascii',
154 'cyrillic' => 'iso-8859-5',
155 'arabic' => 'iso-8859-6',
156 'greek' => 'iso-8859-7',
157 'hebrew' => 'iso-8859-8',
158 'thai' => 'iso-8859-11',
159 'tis620' => 'iso-8859-11',
162 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
163 # And Microsoft has their own naming (again, surprisingly).
164 # And windows-* is registered in IANA!
165 define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
167 # Sometimes seen with a leading zero.
168 define_alias( qr/\bcp037\b/i => '"cp37"');
171 define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
172 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
174 define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
176 # Standardize on the dashed versions.
177 # define_alias( qr/\butf8$/i => 'utf-8' );
178 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
179 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
181 unless ($Encode::ON_EBCDIC){
183 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
184 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
185 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
186 # CP936 doesn't have vendor-addon for GBK, so they're identical.
187 define_alias( qr/^gbk$/i => '"cp936"');
188 # This fixes gb2312 vs. euc-cn confusion, practically
189 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
191 define_alias( qr/\bjis$/i => '"7bit-jis"' );
192 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
193 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
194 define_alias( qr/\bujis$/i => '"euc-jp"' );
195 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
196 define_alias( qr/\bsjis$/i => '"shiftjis"' );
198 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
199 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
200 # This fixes ksc5601 vs. euc-kr confusion, practically
201 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
202 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
203 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
205 define_alias( qr/\bbig-?5$/i => '"big5"' );
206 define_alias( qr/\bbig5-hk(?:scs)?$/i => '"big5-hkscs"' );
209 # At last, Map white space and _ to '-'
210 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
216 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
217 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
218 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
219 # TODO: Armenian encoding ARMSCII-8
220 # TODO: Hebrew encoding ISO-8859-8-1
221 # TODO: Thai encoding TCVN
222 # TODO: Vietnamese encodings VPS
223 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
224 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
225 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
226 # Kannada Khmer Korean Laotian Malayalam Mongolian
227 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
231 Encode::Alias - alias defintions to encodings
237 define_alias( newName => ENCODING);
241 Allows newName to be used as an alias for ENCODING. ENCODING may be
242 either the name of an encoding or an encoding object (as described
245 Currently I<newName> can be specified in the following ways:
249 =item As a simple string.
251 =item As a qr// compiled regular expression, e.g.:
253 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
255 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
256 allow C<$1> etc. to be substituted. The example is one way to alias
257 names as used in X11 fonts to the MIME names for the iso-8859-*
258 family. Note the double quote inside the single quote.
260 If you are using a regex here, you have to use the quotes as shown or
261 it won't work. Also note that regex handling is tricky even for the
262 experienced. Use it with caution.
264 =item As a code reference, e.g.:
266 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
269 In this case C<$_> will be set to the name that is being looked up and
270 I<ENCODING> is passed to the sub as its first argument. The example
271 is another way to alias names as used in X11 fonts to the MIME names
272 for the iso-8859-* family.
276 =head2 Alias overloading
278 You can override predefined aliases by simply applying define_alias().
279 New alias is always evaluated first and when neccessary define_alias()
280 flushes internal cache to make new definition available.
282 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
283 # superset of SHIFT_JIS
285 define_alias( qr/shift.*jis$/i => '"cp932"' );
286 define_alias( qr/sjis$/i => '"cp932"' );
288 If you want to zap all predefined aliases, you can
290 Encode::Alias->undef_aliases;
294 Encode::Alias->init_aliases;
296 gets factory setting back.
301 L<Encode>, L<Encode::Supported>