Commit | Line | Data |
6b6e008c |
1 | #----------------------------------------------------------------------- |
2 | |
3 | =head1 NAME |
4 | |
5 | Locale::Script - ISO codes for script identification (ISO 15924) |
6 | |
7 | =head1 SYNOPSIS |
8 | |
9 | use Locale::Script; |
10 | use Locale::Constants; |
11 | |
12 | $script = code2script('ph'); # 'Phoenician' |
13 | $code = script2code('Tibetan'); # 'bo' |
14 | $code3 = script2code('Tibetan', |
15 | LOCALE_CODE_ALPHA_3); # 'bod' |
16 | $codeN = script2code('Tibetan', |
17 | LOCALE_CODE_ALPHA_NUMERIC); # 330 |
18 | |
19 | @codes = all_script_codes(); |
20 | @scripts = all_script_names(); |
21 | |
22 | =cut |
23 | |
24 | #----------------------------------------------------------------------- |
25 | |
26 | package Locale::Script; |
27 | use strict; |
28 | require 5.002; |
29 | |
30 | #----------------------------------------------------------------------- |
31 | |
32 | =head1 DESCRIPTION |
33 | |
34 | The C<Locale::Script> module provides access to the ISO |
35 | codes for identifying scripts, as defined in ISO 15924. |
36 | For example, Egyptian hieroglyphs are denoted by the two-letter |
37 | code 'eg', the three-letter code 'egy', and the numeric code 050. |
38 | |
39 | You can either access the codes via the conversion routines |
40 | (described below), or with the two functions which return lists |
41 | of all script codes or all script names. |
42 | |
43 | There are three different code sets you can use for identifying |
44 | scripts: |
45 | |
46 | =over 4 |
47 | |
48 | =item B<alpha-2> |
49 | |
50 | Two letter codes, such as 'bo' for Tibetan. |
51 | This code set is identified with the symbol C<LOCALE_CODE_ALPHA_2>. |
52 | |
53 | =item B<alpha-3> |
54 | |
55 | Three letter codes, such as 'ell' for Greek. |
56 | This code set is identified with the symbol C<LOCALE_CODE_ALPHA_3>. |
57 | |
58 | =item B<numeric> |
59 | |
60 | Numeric codes, such as 410 for Hiragana. |
61 | This code set is identified with the symbol C<LOCALE_CODE_NUMERIC>. |
62 | |
63 | =back |
64 | |
65 | All of the routines take an optional additional argument |
66 | which specifies the code set to use. |
67 | If not specified, it defaults to the two-letter codes. |
68 | This is partly for backwards compatibility (previous versions |
69 | of Locale modules only supported the alpha-2 codes), and |
70 | partly because they are the most widely used codes. |
71 | |
72 | The alpha-2 and alpha-3 codes are not case-dependent, |
73 | so you can use 'BO', 'Bo', 'bO' or 'bo' for Tibetan. |
74 | When a code is returned by one of the functions in |
75 | this module, it will always be lower-case. |
76 | |
77 | =head2 SPECIAL CODES |
78 | |
79 | The standard defines various special codes. |
80 | |
81 | =over 4 |
82 | |
83 | =item * |
84 | |
85 | The standard reserves codes in the ranges B<qa> - B<qt>, |
86 | B<qaa> - B<qat>, and B<900> - B<919>, for private use. |
87 | |
88 | =item * |
89 | |
90 | B<zx>, B<zxx>, and B<997>, are the codes for unwritten languages. |
91 | |
92 | =item * |
93 | |
94 | B<zy>, B<zyy>, and B<998>, are the codes for an undetermined script. |
95 | |
96 | =item * |
97 | |
98 | B<zz>, B<zzz>, and B<999>, are the codes for an uncoded script. |
99 | |
100 | =back |
101 | |
102 | The private codes are not recognised by Locale::Script, |
103 | but the others are. |
104 | |
105 | =cut |
106 | |
107 | #----------------------------------------------------------------------- |
108 | |
109 | require Exporter; |
110 | use Carp; |
111 | use Locale::Constants; |
112 | |
113 | |
114 | #----------------------------------------------------------------------- |
115 | # Public Global Variables |
116 | #----------------------------------------------------------------------- |
117 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); |
118 | $VERSION = sprintf("%d.%02d", q$Revision: 2.0 $ =~ /(\d+)\.(\d+)/); |
119 | @ISA = qw(Exporter); |
120 | @EXPORT = qw(code2script script2code |
121 | all_script_codes all_script_names |
122 | script_code2code |
123 | LOCALE_CODE_ALPHA_2 LOCALE_CODE_ALPHA_3 LOCALE_CODE_NUMERIC); |
124 | |
125 | #----------------------------------------------------------------------- |
126 | # Private Global Variables |
127 | #----------------------------------------------------------------------- |
128 | my $CODES = []; |
129 | my $COUNTRIES = []; |
130 | |
131 | |
132 | #======================================================================= |
133 | |
134 | =head1 CONVERSION ROUTINES |
135 | |
136 | There are three conversion routines: C<code2script()>, C<script2code()>, |
137 | and C<script_code2code()>. |
138 | |
139 | =over 8 |
140 | |
141 | =item code2script( CODE, [ CODESET ] ) |
142 | |
143 | This function takes a script code and returns a string |
144 | which contains the name of the script identified. |
145 | If the code is not a valid script code, as defined by ISO 15924, |
146 | then C<undef> will be returned: |
147 | |
148 | $script = code2script('cy'); # Cyrillic |
149 | |
150 | =item script2code( STRING, [ CODESET ] ) |
151 | |
152 | This function takes a script name and returns the corresponding |
153 | script code, if such exists. |
154 | If the argument could not be identified as a script name, |
155 | then C<undef> will be returned: |
156 | |
157 | $code = script2code('Gothic', LOCALE_CODE_ALPHA_3); |
158 | # $code will now be 'gth' |
159 | |
160 | The case of the script name is not important. |
161 | See the section L<KNOWN BUGS AND LIMITATIONS> below. |
162 | |
163 | =item script_code2code( CODE, CODESET, CODESET ) |
164 | |
165 | This function takes a script code from one code set, |
166 | and returns the corresponding code from another code set. |
167 | |
168 | $alpha2 = script_code2code('jwi', |
169 | LOCALE_CODE_ALPHA_3 => LOCALE_CODE_ALPHA_2); |
170 | # $alpha2 will now be 'jw' (Javanese) |
171 | |
172 | If the code passed is not a valid script code in |
173 | the first code set, or if there isn't a code for the |
174 | corresponding script in the second code set, |
175 | then C<undef> will be returned. |
176 | |
177 | =back |
178 | |
179 | =cut |
180 | |
181 | #======================================================================= |
182 | sub code2script |
183 | { |
184 | my $code = shift; |
185 | my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT; |
186 | |
187 | |
188 | return undef unless defined $code; |
189 | |
190 | #------------------------------------------------------------------- |
191 | # Make sure the code is in the right form before we use it |
192 | # to look up the corresponding script. |
193 | # We have to sprintf because the codes are given as 3-digits, |
194 | # with leading 0's. Eg 070 for Egyptian demotic. |
195 | #------------------------------------------------------------------- |
196 | if ($codeset == LOCALE_CODE_NUMERIC) |
197 | { |
198 | return undef if ($code =~ /\D/); |
199 | $code = sprintf("%.3d", $code); |
200 | } |
201 | else |
202 | { |
203 | $code = lc($code); |
204 | } |
205 | |
206 | if (exists $CODES->[$codeset]->{$code}) |
207 | { |
208 | return $CODES->[$codeset]->{$code}; |
209 | } |
210 | else |
211 | { |
212 | #--------------------------------------------------------------- |
213 | # no such script code! |
214 | #--------------------------------------------------------------- |
215 | return undef; |
216 | } |
217 | } |
218 | |
219 | sub script2code |
220 | { |
221 | my $script = shift; |
222 | my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT; |
223 | |
224 | |
225 | return undef unless defined $script; |
226 | $script = lc($script); |
227 | if (exists $COUNTRIES->[$codeset]->{$script}) |
228 | { |
229 | return $COUNTRIES->[$codeset]->{$script}; |
230 | } |
231 | else |
232 | { |
233 | #--------------------------------------------------------------- |
234 | # no such script! |
235 | #--------------------------------------------------------------- |
236 | return undef; |
237 | } |
238 | } |
239 | |
240 | sub script_code2code |
241 | { |
242 | (@_ == 3) or croak "script_code2code() takes 3 arguments!"; |
243 | |
244 | my $code = shift; |
245 | my $inset = shift; |
246 | my $outset = shift; |
247 | my $outcode = shift; |
248 | my $script; |
249 | |
250 | |
251 | return undef if $inset == $outset; |
252 | $script = code2script($code, $inset); |
253 | return undef if not defined $script; |
254 | $outcode = script2code($script, $outset); |
255 | return $outcode; |
256 | } |
257 | |
258 | #======================================================================= |
259 | |
260 | =head1 QUERY ROUTINES |
261 | |
262 | There are two function which can be used to obtain a list of all codes, |
263 | or all script names: |
264 | |
265 | =over 8 |
266 | |
267 | =item C<all_script_codes ( [ CODESET ] )> |
268 | |
269 | Returns a list of all two-letter script codes. |
270 | The codes are guaranteed to be all lower-case, |
271 | and not in any particular order. |
272 | |
273 | =item C<all_script_names ( [ CODESET ] )> |
274 | |
275 | Returns a list of all script names for which there is a corresponding |
276 | script code in the specified code set. |
277 | The names are capitalised, and not returned in any particular order. |
278 | |
279 | =back |
280 | |
281 | =cut |
282 | |
283 | #======================================================================= |
284 | sub all_script_codes |
285 | { |
286 | my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT; |
287 | |
288 | return keys %{ $CODES->[$codeset] }; |
289 | } |
290 | |
291 | sub all_script_names |
292 | { |
293 | my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT; |
294 | |
295 | return values %{ $CODES->[$codeset] }; |
296 | } |
297 | |
298 | |
299 | #----------------------------------------------------------------------- |
300 | |
301 | =head1 EXAMPLES |
302 | |
303 | The following example illustrates use of the C<code2script()> function. |
304 | The user is prompted for a script code, and then told the corresponding |
305 | script name: |
306 | |
307 | $| = 1; # turn off buffering |
308 | |
309 | print "Enter script code: "; |
310 | chop($code = <STDIN>); |
311 | $script = code2script($code, LOCALE_CODE_ALPHA_2); |
312 | if (defined $script) |
313 | { |
314 | print "$code = $script\n"; |
315 | } |
316 | else |
317 | { |
318 | print "'$code' is not a valid script code!\n"; |
319 | } |
320 | |
321 | |
322 | =head1 KNOWN BUGS AND LIMITATIONS |
323 | |
324 | =over 4 |
325 | |
326 | =item * |
327 | |
328 | When using C<script2code()>, the script name must currently appear |
329 | exactly as it does in the source of the module. For example, |
330 | |
331 | script2code('Egyptian hieroglyphs') |
332 | |
333 | will return B<eg>, as expected. But the following will all return C<undef>: |
334 | |
335 | script2code('hieroglyphs') |
336 | script2code('Egyptian Hieroglypics') |
337 | |
338 | If there's need for it, a future version could have variants |
339 | for script names. |
340 | |
341 | =item * |
342 | |
343 | In the current implementation, all data is read in when the |
344 | module is loaded, and then held in memory. |
345 | A lazy implementation would be more memory friendly. |
346 | |
347 | =back |
348 | |
349 | =head1 SEE ALSO |
350 | |
351 | =over 4 |
352 | |
353 | =item Locale::Language |
354 | |
355 | ISO two letter codes for identification of language (ISO 639). |
356 | |
357 | =item Locale::Currency |
358 | |
359 | ISO three letter codes for identification of currencies |
360 | and funds (ISO 4217). |
361 | |
362 | =item Locale::Country |
363 | |
364 | ISO three letter codes for identification of countries (ISO 3166) |
365 | |
366 | =item ISO 15924 |
367 | |
368 | The ISO standard which defines these codes. |
369 | |
370 | =item http://www.evertype.com/standards/iso15924/ |
371 | |
372 | Home page for ISO 15924. |
373 | |
374 | |
375 | =back |
376 | |
377 | |
378 | =head1 AUTHOR |
379 | |
380 | Neil Bowers E<lt>neil@bowers.comE<gt> |
381 | |
382 | =head1 COPYRIGHT |
383 | |
384 | Copyright (c) 2002 Neil Bowers. |
385 | |
386 | This module is free software; you can redistribute it and/or |
387 | modify it under the same terms as Perl itself. |
388 | |
389 | =cut |
390 | |
391 | #----------------------------------------------------------------------- |
392 | |
393 | #======================================================================= |
394 | # initialisation code - stuff the DATA into the ALPHA2 hash |
395 | #======================================================================= |
396 | { |
397 | my ($alpha2, $alpha3, $numeric); |
398 | my $script; |
399 | |
400 | |
401 | while (<DATA>) |
402 | { |
403 | next unless /\S/; |
404 | chop; |
405 | ($alpha2, $alpha3, $numeric, $script) = split(/:/, $_, 4); |
406 | |
407 | $CODES->[LOCALE_CODE_ALPHA_2]->{$alpha2} = $script; |
408 | $COUNTRIES->[LOCALE_CODE_ALPHA_2]->{"\L$script"} = $alpha2; |
409 | |
410 | if ($alpha3) |
411 | { |
412 | $CODES->[LOCALE_CODE_ALPHA_3]->{$alpha3} = $script; |
413 | $COUNTRIES->[LOCALE_CODE_ALPHA_3]->{"\L$script"} = $alpha3; |
414 | } |
415 | |
416 | if ($numeric) |
417 | { |
418 | $CODES->[LOCALE_CODE_NUMERIC]->{$numeric} = $script; |
419 | $COUNTRIES->[LOCALE_CODE_NUMERIC]->{"\L$script"} = $numeric; |
420 | } |
421 | |
422 | } |
423 | } |
424 | |
425 | 1; |
426 | |
427 | __DATA__ |
428 | am:ama:130:Aramaic |
429 | ar:ara:160:Arabic |
430 | av:ave:151:Avestan |
431 | bh:bhm:300:Brahmi (Ashoka) |
432 | bi:bid:372:Buhid |
433 | bn:ben:325:Bengali |
434 | bo:bod:330:Tibetan |
435 | bp:bpm:285:Bopomofo |
436 | br:brl:570:Braille |
437 | bt:btk:365:Batak |
438 | bu:bug:367:Buginese (Makassar) |
439 | by:bys:550:Blissymbols |
440 | ca:cam:358:Cham |
441 | ch:chu:221:Old Church Slavonic |
442 | ci:cir:291:Cirth |
443 | cm:cmn:402:Cypro-Minoan |
444 | co:cop:205:Coptic |
445 | cp:cpr:403:Cypriote syllabary |
446 | cy:cyr:220:Cyrillic |
447 | ds:dsr:250:Deserel (Mormon) |
448 | dv:dvn:315:Devanagari (Nagari) |
449 | ed:egd:070:Egyptian demotic |
450 | eg:egy:050:Egyptian hieroglyphs |
451 | eh:egh:060:Egyptian hieratic |
452 | el:ell:200:Greek |
453 | eo:eos:210:Etruscan and Oscan |
454 | et:eth:430:Ethiopic |
455 | gl:glg:225:Glagolitic |
456 | gm:gmu:310:Gurmukhi |
457 | gt:gth:206:Gothic |
458 | gu:guj:320:Gujarati |
459 | ha:han:500:Han ideographs |
460 | he:heb:125:Hebrew |
461 | hg:hgl:420:Hangul |
462 | hm:hmo:450:Pahawh Hmong |
463 | ho:hoo:371:Hanunoo |
464 | hr:hrg:410:Hiragana |
465 | hu:hun:176:Old Hungarian runic |
466 | hv:hvn:175:Kok Turki runic |
467 | hy:hye:230:Armenian |
468 | iv:ivl:610:Indus Valley |
469 | ja:jap:930:(alias for Han + Hiragana + Katakana) |
470 | jl:jlg:445:Cherokee syllabary |
471 | jw:jwi:360:Javanese |
472 | ka:kam:241:Georgian (Mxedruli) |
473 | kh:khn:931:(alias for Hangul + Han) |
474 | kk:kkn:411:Katakana |
475 | km:khm:354:Khmer |
476 | kn:kan:345:Kannada |
477 | kr:krn:357:Karenni (Kayah Li) |
478 | ks:kst:305:Kharoshthi |
479 | kx:kax:240:Georgian (Xucuri) |
480 | la:lat:217:Latin |
481 | lf:laf:215:Latin (Fraktur variant) |
482 | lg:lag:216:Latin (Gaelic variant) |
483 | lo:lao:356:Lao |
484 | lp:lpc:335:Lepcha (Rong) |
485 | md:mda:140:Mandaean |
486 | me:mer:100:Meroitic |
487 | mh:may:090:Mayan hieroglyphs |
488 | ml:mlm:347:Malayalam |
489 | mn:mon:145:Mongolian |
490 | my:mya:350:Burmese |
491 | na:naa:400:Linear A |
492 | nb:nbb:401:Linear B |
493 | og:ogm:212:Ogham |
494 | or:ory:327:Oriya |
495 | os:osm:260:Osmanya |
496 | ph:phx:115:Phoenician |
497 | ph:pah:150:Pahlavi |
498 | pl:pld:282:Pollard Phonetic |
499 | pq:pqd:295:Klingon plQaD |
500 | pr:prm:227:Old Permic |
501 | ps:pst:600:Phaistos Disk |
502 | rn:rnr:211:Runic (Germanic) |
503 | rr:rro:620:Rongo-rongo |
504 | sa:sar:110:South Arabian |
505 | si:sin:348:Sinhala |
506 | sj:syj:137:Syriac (Jacobite variant) |
507 | sl:slb:440:Unified Canadian Aboriginal Syllabics |
508 | sn:syn:136:Syriac (Nestorian variant) |
509 | sw:sww:281:Shavian (Shaw) |
510 | sy:syr:135:Syriac (Estrangelo) |
511 | ta:tam:346:Tamil |
512 | tb:tbw:373:Tagbanwa |
513 | te:tel:340:Telugu |
514 | tf:tfn:120:Tifnagh |
515 | tg:tag:370:Tagalog |
516 | th:tha:352:Thai |
517 | tn:tna:170:Thaana |
518 | tw:twr:290:Tengwar |
519 | va:vai:470:Vai |
520 | vs:vsp:280:Visible Speech |
521 | xa:xas:000:Cuneiform, Sumero-Akkadian |
522 | xf:xfa:105:Cuneiform, Old Persian |
523 | xk:xkn:412:(alias for Hiragana + Katakana) |
524 | xu:xug:106:Cuneiform, Ugaritic |
525 | yi:yii:460:Yi |
526 | zx:zxx:997:Unwritten language |
527 | zy:zyy:998:Undetermined script |
528 | zz:zzz:999:Uncoded script |