Commit | Line | Data |
5d030b67 |
1 | package Encode::Alias; |
2 | use strict; |
1485817e |
3 | no warnings 'redefine'; |
5129552c |
4 | use Encode; |
cf9f87ce |
5 | our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
8f139f4c |
6 | sub DEBUG () { 0 } |
5d030b67 |
7 | |
10c5ecbb |
8 | use base qw(Exporter); |
5d030b67 |
9 | |
10 | # Public, encouraged API is exported by default |
5129552c |
11 | |
fcb875d4 |
12 | our @EXPORT = |
5129552c |
13 | qw ( |
14 | define_alias |
15 | find_alias |
16 | ); |
5d030b67 |
17 | |
18 | our @Alias; # ordered matching list |
19 | our %Alias; # cached known aliases |
20 | |
5129552c |
21 | sub find_alias |
5d030b67 |
22 | { |
23 | my $class = shift; |
151b5d36 |
24 | my $find = shift; |
25 | unless (exists $Alias{$find}) |
5d030b67 |
26 | { |
151b5d36 |
27 | $Alias{$find} = undef; # Recursion guard |
5d030b67 |
28 | for (my $i=0; $i < @Alias; $i += 2) |
29 | { |
30 | my $alias = $Alias[$i]; |
31 | my $val = $Alias[$i+1]; |
32 | my $new; |
151b5d36 |
33 | if (ref($alias) eq 'Regexp' && $find =~ $alias) |
5d030b67 |
34 | { |
8f139f4c |
35 | DEBUG and warn "eval $val"; |
5d030b67 |
36 | $new = eval $val; |
8f139f4c |
37 | DEBUG and $@ and warn "$val, $@"; |
5d030b67 |
38 | } |
39 | elsif (ref($alias) eq 'CODE') |
40 | { |
8f139f4c |
41 | DEBUG and warn "$alias", "->", "($find)"; |
151b5d36 |
42 | $new = $alias->($find); |
5d030b67 |
43 | } |
151b5d36 |
44 | elsif (lc($find) eq lc($alias)) |
5d030b67 |
45 | { |
46 | $new = $val; |
47 | } |
48 | if (defined($new)) |
49 | { |
151b5d36 |
50 | next if $new eq $find; # avoid (direct) recursion on bugs |
8f139f4c |
51 | DEBUG and warn "$alias, $new"; |
5d030b67 |
52 | my $enc = (ref($new)) ? $new : Encode::find_encoding($new); |
53 | if ($enc) |
54 | { |
151b5d36 |
55 | $Alias{$find} = $enc; |
5d030b67 |
56 | last; |
57 | } |
58 | } |
59 | } |
60 | } |
8f139f4c |
61 | if (DEBUG){ |
a63c962f |
62 | my $name; |
151b5d36 |
63 | if (my $e = $Alias{$find}){ |
a63c962f |
64 | $name = $e->name; |
65 | }else{ |
66 | $name = ""; |
67 | } |
151b5d36 |
68 | warn "find_alias($class, $find)->name = $name"; |
a63c962f |
69 | } |
151b5d36 |
70 | return $Alias{$find}; |
5d030b67 |
71 | } |
72 | |
73 | sub define_alias |
74 | { |
75 | while (@_) |
76 | { |
77 | my ($alias,$name) = splice(@_,0,2); |
78 | unshift(@Alias, $alias => $name); # newer one has precedence |
79 | # clear %Alias cache to allow overrides |
80 | if (ref($alias)){ |
5129552c |
81 | my @a = keys %Alias; |
82 | for my $k (@a){ |
5d030b67 |
83 | if (ref($alias) eq 'Regexp' && $k =~ $alias) |
84 | { |
8f139f4c |
85 | DEBUG and warn "delete \$Alias\{$k\}"; |
5d030b67 |
86 | delete $Alias{$k}; |
87 | } |
88 | elsif (ref($alias) eq 'CODE') |
89 | { |
8f139f4c |
90 | DEBUG and warn "delete \$Alias\{$k\}"; |
5d030b67 |
91 | delete $Alias{$alias->($name)}; |
92 | } |
93 | } |
94 | }else{ |
8f139f4c |
95 | DEBUG and warn "delete \$Alias\{$alias\}"; |
5d030b67 |
96 | delete $Alias{$alias}; |
97 | } |
98 | } |
99 | } |
100 | |
5d030b67 |
101 | # Allow latin-1 style names as well |
102 | # 0 1 2 3 4 5 6 7 8 9 10 |
103 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); |
5d030b67 |
104 | # Allow winlatin1 style names as well |
105 | our %Winlatin2cp = ( |
106 | 'latin1' => 1252, |
107 | 'latin2' => 1250, |
108 | 'cyrillic' => 1251, |
109 | 'greek' => 1253, |
110 | 'turkish' => 1254, |
111 | 'hebrew' => 1255, |
112 | 'arabic' => 1256, |
113 | 'baltic' => 1257, |
114 | 'vietnamese' => 1258, |
115 | ); |
116 | |
5129552c |
117 | init_aliases(); |
118 | |
119 | sub undef_aliases{ |
120 | @Alias = (); |
121 | %Alias = (); |
122 | } |
123 | |
124 | sub init_aliases |
125 | { |
126 | undef_aliases(); |
a999c27c |
127 | |
f2a2953c |
128 | # Try all-lower-case version should all else fails |
a999c27c |
129 | define_alias( qr/^(.*)$/ => '"\L$1"' ); |
130 | |
f2a2953c |
131 | # UTF/UCS stuff |
1485817e |
132 | define_alias( qr/^UTF-?7$/i => '"UTF-7"'); |
11067275 |
133 | define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); |
134 | define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', |
126bf8bf |
135 | qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', |
11067275 |
136 | qr/^iso-10646-1$/i => '"UCS-2BE"' ); |
137 | define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"', |
138 | qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"', |
139 | qr/^UTF(16|32)$/i => '"UTF-$1"', |
f2a2953c |
140 | ); |
141 | # ASCII |
a999c27c |
142 | define_alias(qr/^(?:US-?)ascii$/i => '"ascii"'); |
a999c27c |
143 | define_alias('C' => 'ascii'); |
2d06ad02 |
144 | define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"'); |
67d7b5ef |
145 | # Allow variants of iso-8859-1 etc. |
146 | define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); |
5129552c |
147 | |
67d7b5ef |
148 | # At least HP-UX has these. |
149 | define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); |
5129552c |
150 | |
67d7b5ef |
151 | # More HP stuff. |
152 | define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' ); |
5129552c |
153 | |
67d7b5ef |
154 | # The Official name of ASCII. |
155 | define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); |
5129552c |
156 | |
67d7b5ef |
157 | # This is a font issue, not an encoding issue. |
158 | # (The currency symbol of the Latin 1 upper half |
159 | # has been redefined as the euro symbol.) |
5129552c |
160 | define_alias( qr/^(.+)\@euro$/i => '"$1"' ); |
161 | |
fcb875d4 |
162 | define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i |
bb7af5ca |
163 | => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' ); |
5129552c |
164 | |
67d7b5ef |
165 | define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| |
fcb875d4 |
166 | hebrew|arabic|baltic|vietnamese)$/ix => |
5129552c |
167 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); |
5d030b67 |
168 | |
67d7b5ef |
169 | # Common names for non-latin prefered MIME names |
5129552c |
170 | define_alias( 'ascii' => 'US-ascii', |
171 | 'cyrillic' => 'iso-8859-5', |
172 | 'arabic' => 'iso-8859-6', |
173 | 'greek' => 'iso-8859-7', |
174 | 'hebrew' => 'iso-8859-8', |
175 | 'thai' => 'iso-8859-11', |
176 | 'tis620' => 'iso-8859-11', |
177 | ); |
5d030b67 |
178 | |
67d7b5ef |
179 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. |
180 | # And Microsoft has their own naming (again, surprisingly). |
fcb875d4 |
181 | # And windows-* is registered in IANA! |
2d06ad02 |
182 | define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"'); |
5d030b67 |
183 | |
67d7b5ef |
184 | # Sometimes seen with a leading zero. |
c731e18e |
185 | # define_alias( qr/\bcp037\b/i => '"cp37"'); |
5d030b67 |
186 | |
3ef515df |
187 | # Mac Mappings |
a999c27c |
188 | # predefined in *.ucm; unneeded |
189 | # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); |
3ef515df |
190 | define_alias( qr/^mac_(.*)$/i => '"mac$1"'); |
a999c27c |
191 | # Ououououou. gone. They are differente! |
192 | # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); |
fcb875d4 |
193 | |
f2a2953c |
194 | # Standardize on the dashed versions. |
78c56a8c |
195 | # define_alias( qr/\butf8$/i => '"utf-8"' ); |
cf9f87ce |
196 | define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); |
5129552c |
197 | |
a63c962f |
198 | unless ($Encode::ON_EBCDIC){ |
199 | # for Encode::CN |
67d7b5ef |
200 | define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); |
201 | define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); |
202 | # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) |
203 | # CP936 doesn't have vendor-addon for GBK, so they're identical. |
204 | define_alias( qr/^gbk$/i => '"cp936"'); |
205 | # This fixes gb2312 vs. euc-cn confusion, practically |
b9531c19 |
206 | define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); |
67d7b5ef |
207 | # for Encode::JP |
208 | define_alias( qr/\bjis$/i => '"7bit-jis"' ); |
209 | define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); |
210 | define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); |
211 | define_alias( qr/\bujis$/i => '"euc-jp"' ); |
212 | define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); |
213 | define_alias( qr/\bsjis$/i => '"shiftjis"' ); |
a63c962f |
214 | # for Encode::KR |
67d7b5ef |
215 | define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); |
216 | define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); |
217 | # This fixes ksc5601 vs. euc-kr confusion, practically |
218 | define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); |
219 | define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); |
220 | define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); |
a63c962f |
221 | # for Encode::TW |
b0b300a3 |
222 | define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); |
4b291ae6 |
223 | define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); |
2d06ad02 |
224 | define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); |
b0b300a3 |
225 | define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); |
2d06ad02 |
226 | define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); |
a63c962f |
227 | } |
f2a2953c |
228 | # utf8 is blessed :) |
229 | define_alias( qr/^UTF-8$/i => '"utf8"',); |
67d7b5ef |
230 | # At last, Map white space and _ to '-' |
5129552c |
231 | define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); |
232 | } |
233 | |
234 | 1; |
235 | __END__ |
5d030b67 |
236 | |
237 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 |
238 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15 |
239 | # TODO: Cyrillic encoding ISO-IR-111 (useful?) |
240 | # TODO: Armenian encoding ARMSCII-8 |
241 | # TODO: Hebrew encoding ISO-8859-8-1 |
242 | # TODO: Thai encoding TCVN |
5d030b67 |
243 | # TODO: Vietnamese encodings VPS |
244 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese |
245 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic |
246 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese |
247 | # Kannada Khmer Korean Laotian Malayalam Mongolian |
248 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese |
249 | |
5d030b67 |
250 | =head1 NAME |
251 | |
ce912cd4 |
252 | Encode::Alias - alias definitions to encodings |
5d030b67 |
253 | |
254 | =head1 SYNOPSIS |
255 | |
5129552c |
256 | use Encode; |
257 | use Encode::Alias; |
5d030b67 |
258 | define_alias( newName => ENCODING); |
259 | |
260 | =head1 DESCRIPTION |
261 | |
3ef515df |
262 | Allows newName to be used as an alias for ENCODING. ENCODING may be |
fcb875d4 |
263 | either the name of an encoding or an encoding object (as described |
3ef515df |
264 | in L<Encode>). |
5d030b67 |
265 | |
266 | Currently I<newName> can be specified in the following ways: |
267 | |
268 | =over 4 |
269 | |
270 | =item As a simple string. |
271 | |
272 | =item As a qr// compiled regular expression, e.g.: |
273 | |
274 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); |
275 | |
0ab8f81e |
276 | In this case, if I<ENCODING> is not a reference, it is C<eval>-ed |
277 | in order to allow C<$1> etc. to be substituted. The example is one |
278 | way to alias names as used in X11 fonts to the MIME names for the |
279 | iso-8859-* family. Note the double quotes inside the single quotes. |
5d030b67 |
280 | |
151b5d36 |
281 | (or, you don't have to do this yourself because this example is predefined) |
282 | |
3ef515df |
283 | If you are using a regex here, you have to use the quotes as shown or |
284 | it won't work. Also note that regex handling is tricky even for the |
151b5d36 |
285 | experienced. Use this feature with caution. |
5d030b67 |
286 | |
287 | =item As a code reference, e.g.: |
288 | |
151b5d36 |
289 | define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); |
5d030b67 |
290 | |
151b5d36 |
291 | The same effect as the example above in a different way. The coderef |
292 | takes the alias name as an argument and returns a canonical name on |
293 | success or undef if not. Note the second argument is not required. |
294 | Use this with even more caution than the regex version. |
5d030b67 |
295 | |
5129552c |
296 | =back |
297 | |
151b5d36 |
298 | =head3 Changes in code reference aliasing |
299 | |
300 | As of Encode 1.87, the older form |
301 | |
302 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); |
303 | |
304 | no longer works. |
305 | |
306 | Encode up to 1.86 internally used "local $_" to implement ths older |
307 | form. But consider the code below; |
308 | |
309 | use Encode; |
310 | $_ = "eeeee" ; |
311 | while (/(e)/g) { |
312 | my $utf = decode('aliased-encoding-name', $1); |
313 | print "position:",pos,"\n"; |
314 | } |
315 | |
316 | Prior to Encode 1.86 this fails because of "local $_". |
317 | |
0ab8f81e |
318 | =head2 Alias overloading |
5d030b67 |
319 | |
3ef515df |
320 | You can override predefined aliases by simply applying define_alias(). |
0ab8f81e |
321 | The new alias is always evaluated first, and when neccessary, |
322 | define_alias() flushes the internal cache to make the new definition |
323 | available. |
5d030b67 |
324 | |
0ab8f81e |
325 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a |
5d030b67 |
326 | # superset of SHIFT_JIS |
327 | |
5129552c |
328 | define_alias( qr/shift.*jis$/i => '"cp932"' ); |
329 | define_alias( qr/sjis$/i => '"cp932"' ); |
330 | |
0ab8f81e |
331 | If you want to zap all predefined aliases, you can use |
5129552c |
332 | |
333 | Encode::Alias->undef_aliases; |
334 | |
335 | to do so. And |
336 | |
337 | Encode::Alias->init_aliases; |
338 | |
0ab8f81e |
339 | gets the factory settings back. |
5d030b67 |
340 | |
341 | =head1 SEE ALSO |
342 | |
343 | L<Encode>, L<Encode::Supported> |
344 | |
5129552c |
345 | =cut |
5d030b67 |
346 | |