Commit | Line | Data |
5d030b67 |
1 | package Encode::Alias; |
2 | use strict; |
5129552c |
3 | use Encode; |
4 | our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
5d030b67 |
5 | our $DEBUG = 0; |
6 | require Exporter; |
7 | |
8 | our @ISA = qw(Exporter); |
9 | |
10 | # Public, encouraged API is exported by default |
5129552c |
11 | |
12 | our @EXPORT = |
13 | qw ( |
14 | define_alias |
15 | find_alias |
16 | ); |
5d030b67 |
17 | |
18 | our @Alias; # ordered matching list |
19 | our %Alias; # cached known aliases |
20 | |
5129552c |
21 | sub find_alias |
5d030b67 |
22 | { |
23 | my $class = shift; |
24 | local $_ = shift; |
25 | unless (exists $Alias{$_}) |
26 | { |
27 | for (my $i=0; $i < @Alias; $i += 2) |
28 | { |
29 | my $alias = $Alias[$i]; |
30 | my $val = $Alias[$i+1]; |
31 | my $new; |
32 | if (ref($alias) eq 'Regexp' && $_ =~ $alias) |
33 | { |
34 | $new = eval $val; |
35 | # $@ and warn "$val, $@"; |
36 | } |
37 | elsif (ref($alias) eq 'CODE') |
38 | { |
39 | $new = $alias->($val); |
40 | } |
41 | elsif (lc($_) eq lc($alias)) |
42 | { |
43 | $new = $val; |
44 | } |
45 | if (defined($new)) |
46 | { |
47 | next if $new eq $_; # avoid (direct) recursion on bugs |
48 | my $enc = (ref($new)) ? $new : Encode::find_encoding($new); |
49 | if ($enc) |
50 | { |
51 | $Alias{$_} = $enc; |
52 | last; |
53 | } |
54 | } |
55 | } |
56 | } |
57 | return $Alias{$_}; |
58 | } |
59 | |
60 | sub define_alias |
61 | { |
62 | while (@_) |
63 | { |
64 | my ($alias,$name) = splice(@_,0,2); |
65 | unshift(@Alias, $alias => $name); # newer one has precedence |
66 | # clear %Alias cache to allow overrides |
67 | if (ref($alias)){ |
5129552c |
68 | my @a = keys %Alias; |
69 | for my $k (@a){ |
5d030b67 |
70 | if (ref($alias) eq 'Regexp' && $k =~ $alias) |
71 | { |
72 | $DEBUG and warn $k; |
73 | delete $Alias{$k}; |
74 | } |
75 | elsif (ref($alias) eq 'CODE') |
76 | { |
77 | delete $Alias{$alias->($name)}; |
78 | } |
79 | } |
80 | }else{ |
81 | delete $Alias{$alias}; |
82 | } |
83 | } |
84 | } |
85 | |
5d030b67 |
86 | # Allow latin-1 style names as well |
87 | # 0 1 2 3 4 5 6 7 8 9 10 |
88 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); |
5d030b67 |
89 | # Allow winlatin1 style names as well |
90 | our %Winlatin2cp = ( |
91 | 'latin1' => 1252, |
92 | 'latin2' => 1250, |
93 | 'cyrillic' => 1251, |
94 | 'greek' => 1253, |
95 | 'turkish' => 1254, |
96 | 'hebrew' => 1255, |
97 | 'arabic' => 1256, |
98 | 'baltic' => 1257, |
99 | 'vietnamese' => 1258, |
100 | ); |
101 | |
5129552c |
102 | init_aliases(); |
103 | |
104 | sub undef_aliases{ |
105 | @Alias = (); |
106 | %Alias = (); |
107 | } |
108 | |
109 | sub init_aliases |
110 | { |
111 | undef_aliases(); |
112 | # Allow variants of iso-8859-1 etc. |
113 | define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); |
114 | |
115 | # At least HP-UX has these. |
116 | define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' ); |
117 | |
118 | # More HP stuff. |
119 | define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' ); |
120 | |
121 | # The Official name of ASCII. |
122 | define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); |
123 | |
124 | # This is a font issue, not an encoding issue. |
125 | # (The currency symbol of the Latin 1 upper half |
126 | # has been redefined as the euro symbol.) |
127 | define_alias( qr/^(.+)\@euro$/i => '"$1"' ); |
128 | |
129 | define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i |
130 | => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' ); |
131 | |
132 | define_alias( qr/win(latin[12]|cyrillic|baltic|greek|turkish| |
133 | hebrew|arabic|baltic|vietnamese)$/ix => |
134 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); |
5d030b67 |
135 | |
136 | # Common names for non-latin prefered MIME names |
5129552c |
137 | define_alias( 'ascii' => 'US-ascii', |
138 | 'cyrillic' => 'iso-8859-5', |
139 | 'arabic' => 'iso-8859-6', |
140 | 'greek' => 'iso-8859-7', |
141 | 'hebrew' => 'iso-8859-8', |
142 | 'thai' => 'iso-8859-11', |
143 | 'tis620' => 'iso-8859-11', |
144 | ); |
5d030b67 |
145 | |
146 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. |
147 | # And Microsoft has their own naming (again, surprisingly). |
5129552c |
148 | define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"'); |
5d030b67 |
149 | |
150 | # Sometimes seen with a leading zero. |
5129552c |
151 | define_alias( qr/^cp037$/i => '"cp37"'); |
5d030b67 |
152 | |
153 | # Ououououou. |
5129552c |
154 | define_alias( qr/^macRomanian$/i => '"macRumanian"'); |
5d030b67 |
155 | |
156 | # Standardize on the dashed versions. |
5129552c |
157 | define_alias( qr/^utf8$/i => 'utf-8' ); |
158 | define_alias( qr/^koi8r$/i => 'koi8-r' ); |
159 | define_alias( qr/^koi8u$/i => 'koi8-u' ); |
160 | |
161 | # for Encode::CN |
162 | define_alias( qr/euc.*cn$/i => '"euc-cn"' ); |
163 | define_alias( qr/cn.*euc/i => '"euc-cn"' ); |
164 | |
165 | # for Encode::JP |
166 | define_alias( qr/euc.*jp$/i => '"euc-jp"' ); |
167 | define_alias( qr/jp.*euc/i => '"euc-jp"' ); |
168 | define_alias( qr/ujis$/i => '"euc-jp"' ); |
169 | define_alias( qr/shift.*jis$/i => '"shiftjis"' ); |
170 | define_alias( qr/sjis$/i => '"shiftjis"' ); |
171 | define_alias( qr/^jis$/i => '"7bit-jis"' ); |
172 | |
173 | # for Encode::KR |
174 | define_alias( qr/euc.*kr$/i => '"euc-kr"' ); |
175 | define_alias( qr/kr.*euc/i => '"euc-kr"' ); |
176 | |
177 | # for Encode::TW |
178 | define_alias( qr/big-?5$/i => '"big5"' ); |
179 | define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' ); |
180 | |
181 | # At last, Map white space and _ to '-' |
182 | define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); |
183 | } |
184 | |
185 | 1; |
186 | __END__ |
5d030b67 |
187 | |
188 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 |
189 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15 |
190 | # TODO: Cyrillic encoding ISO-IR-111 (useful?) |
191 | # TODO: Armenian encoding ARMSCII-8 |
192 | # TODO: Hebrew encoding ISO-8859-8-1 |
193 | # TODO: Thai encoding TCVN |
194 | # TODO: Korean encoding Johab |
195 | # TODO: Vietnamese encodings VPS |
196 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese |
197 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic |
198 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese |
199 | # Kannada Khmer Korean Laotian Malayalam Mongolian |
200 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese |
201 | |
5d030b67 |
202 | =head1 NAME |
203 | |
204 | Encode::Alias - alias defintions to encodings |
205 | |
206 | =head1 SYNOPSIS |
207 | |
5129552c |
208 | use Encode; |
209 | use Encode::Alias; |
5d030b67 |
210 | define_alias( newName => ENCODING); |
211 | |
212 | =head1 DESCRIPTION |
213 | |
214 | Allows newName to be used as am alias for ENCODING. ENCODING may be |
215 | either the name of an encoding or and encoding object (as described in L<Encode>). |
216 | |
217 | Currently I<newName> can be specified in the following ways: |
218 | |
219 | =over 4 |
220 | |
221 | =item As a simple string. |
222 | |
223 | =item As a qr// compiled regular expression, e.g.: |
224 | |
225 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); |
226 | |
227 | In this case if I<ENCODING> is not a reference it is C<eval>-ed to |
228 | allow C<$1> etc. to be subsituted. The example is one way to names as |
229 | used in X11 font names to alias the MIME names for the iso-8859-* |
230 | family. Note the double quote inside the single quote. |
231 | |
232 | If you are using regex here, you have to do so or it won't work in |
233 | this case. Also not regex is tricky even for the experienced. Use it |
234 | with caution. |
235 | |
236 | =item As a code reference, e.g.: |
237 | |
238 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , ''); |
239 | |
240 | In this case C<$_> will be set to the name that is being looked up and |
241 | I<ENCODING> is passed to the sub as its first argument. The example |
242 | is another way to names as used in X11 font names to alias the MIME |
243 | names for the iso-8859-* family. |
244 | |
5129552c |
245 | =back |
246 | |
247 | =head2 Alias overloading |
5d030b67 |
248 | |
249 | You can override predefined aliases by simply applying define_alias(). |
250 | New alias is always evaluated first and when neccessary define_alias() |
251 | flushes internal cache to make new definition available. |
252 | |
253 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a |
254 | # superset of SHIFT_JIS |
255 | |
5129552c |
256 | define_alias( qr/shift.*jis$/i => '"cp932"' ); |
257 | define_alias( qr/sjis$/i => '"cp932"' ); |
258 | |
259 | If you want to zap all predefined aliases, you can |
260 | |
261 | Encode::Alias->undef_aliases; |
262 | |
263 | to do so. And |
264 | |
265 | Encode::Alias->init_aliases; |
266 | |
267 | gets factory setting back. |
268 | |
5d030b67 |
269 | |
270 | =head1 SEE ALSO |
271 | |
272 | L<Encode>, L<Encode::Supported> |
273 | |
5129552c |
274 | =cut |
5d030b67 |
275 | |