3 use Encode qw(find_encoding);
4 our $VERSION = do { my @r = (q$Revision: 0.95 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8 our @ISA = qw(Exporter);
10 # Public, encouraged API is exported by default
16 our @Alias; # ordered matching list
17 our %Alias; # cached known aliases
23 unless (exists $Alias{$_})
25 for (my $i=0; $i < @Alias; $i += 2)
27 my $alias = $Alias[$i];
28 my $val = $Alias[$i+1];
30 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
33 # $@ and warn "$val, $@";
35 elsif (ref($alias) eq 'CODE')
37 $new = $alias->($val);
39 elsif (lc($_) eq lc($alias))
45 next if $new eq $_; # avoid (direct) recursion on bugs
46 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
62 my ($alias,$name) = splice(@_,0,2);
63 unshift(@Alias, $alias => $name); # newer one has precedence
64 # clear %Alias cache to allow overrides
66 for my $k (keys %Alias){
67 if (ref($alias) eq 'Regexp' && $k =~ $alias)
72 elsif (ref($alias) eq 'CODE')
74 delete $Alias{$alias->($name)};
78 delete $Alias{$alias};
84 # Allow variants of iso-8859-1 etc.
85 define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
87 # At least HP-UX has these.
88 define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
91 define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
93 # The Official name of ASCII.
94 define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
96 # This is a font issue, not an encoding issue.
97 # (The currency symbol of the Latin 1 upper half
98 # has been redefined as the euro symbol.)
99 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
101 # Allow latin-1 style names as well
102 # 0 1 2 3 4 5 6 7 8 9 10
103 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
104 define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i
105 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
107 # Allow winlatin1 style names as well
117 'vietnamese' => 1258,
120 define_alias( qr/win(latin[12]|cyrillic|baltic|greek|turkish|
121 hebrew|arabic|baltic|vietnamese)$/ix =>
122 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
124 # Common names for non-latin prefered MIME names
125 define_alias( 'ascii' => 'US-ascii',
126 'cyrillic' => 'iso-8859-5',
127 'arabic' => 'iso-8859-6',
128 'greek' => 'iso-8859-7',
129 'hebrew' => 'iso-8859-8',
130 'thai' => 'iso-8859-11',
131 'tis620' => 'iso-8859-11',
134 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
135 # And Microsoft has their own naming (again, surprisingly).
136 define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
138 # Sometimes seen with a leading zero.
139 define_alias( qr/^cp037$/i => '"cp37"');
142 define_alias( qr/^macRomanian$/i => '"macRumanian"');
144 # Standardize on the dashed versions.
145 define_alias( qr/^utf8$/i => 'utf-8' );
146 define_alias( qr/^koi8r$/i => 'koi8-r' );
147 define_alias( qr/^koi8u$/i => 'koi8-u' );
149 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
150 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
151 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
152 # TODO: Armenian encoding ARMSCII-8
153 # TODO: Hebrew encoding ISO-8859-8-1
154 # TODO: Thai encoding TCVN
155 # TODO: Korean encoding Johab
156 # TODO: Vietnamese encodings VPS
157 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
158 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
159 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
160 # Kannada Khmer Korean Laotian Malayalam Mongolian
161 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
163 # Map white space and _ to '-'
164 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
170 Encode::Alias - alias defintions to encodings
174 use Encode qw(define_alias);
175 define_alias( newName => ENCODING);
179 Allows newName to be used as am alias for ENCODING. ENCODING may be
180 either the name of an encoding or and encoding object (as described in L<Encode>).
182 Currently I<newName> can be specified in the following ways:
186 =item As a simple string.
188 =item As a qr// compiled regular expression, e.g.:
190 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
192 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
193 allow C<$1> etc. to be subsituted. The example is one way to names as
194 used in X11 font names to alias the MIME names for the iso-8859-*
195 family. Note the double quote inside the single quote.
197 If you are using regex here, you have to do so or it won't work in
198 this case. Also not regex is tricky even for the experienced. Use it
201 =item As a code reference, e.g.:
203 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
205 In this case C<$_> will be set to the name that is being looked up and
206 I<ENCODING> is passed to the sub as its first argument. The example
207 is another way to names as used in X11 font names to alias the MIME
208 names for the iso-8859-* family.
210 =item Alias overloading
212 You can override predefined aliases by simply applying define_alias().
213 New alias is always evaluated first and when neccessary define_alias()
214 flushes internal cache to make new definition available.
216 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
217 # superset of SHIFT_JIS
219 Encode::define_alias( qr/shift.*jis$/i => '"cp932"' );
220 Encode::define_alias( qr/sjis$/i => '"cp932"' );
224 L<Encode>, L<Encode::Supported>