1 #-----------------------------------------------------------------------
5 Locale::Script - ISO codes for script identification (ISO 15924)
10 use Locale::Constants;
12 $script = code2script('ph'); # 'Phoenician'
13 $code = script2code('Tibetan'); # 'bo'
14 $code3 = script2code('Tibetan',
15 LOCALE_CODE_ALPHA_3); # 'bod'
16 $codeN = script2code('Tibetan',
17 LOCALE_CODE_NUMERIC); # 330
19 @codes = all_script_codes();
20 @scripts = all_script_names();
24 #-----------------------------------------------------------------------
26 package Locale::Script;
30 #-----------------------------------------------------------------------
34 The C<Locale::Script> module provides access to the ISO
35 codes for identifying scripts, as defined in ISO 15924.
36 For example, Egyptian hieroglyphs are denoted by the two-letter
37 code 'eg', the three-letter code 'egy', and the numeric code 050.
39 You can either access the codes via the conversion routines
40 (described below), or with the two functions which return lists
41 of all script codes or all script names.
43 There are three different code sets you can use for identifying
50 Two letter codes, such as 'bo' for Tibetan.
51 This code set is identified with the symbol C<LOCALE_CODE_ALPHA_2>.
55 Three letter codes, such as 'ell' for Greek.
56 This code set is identified with the symbol C<LOCALE_CODE_ALPHA_3>.
60 Numeric codes, such as 410 for Hiragana.
61 This code set is identified with the symbol C<LOCALE_CODE_NUMERIC>.
65 All of the routines take an optional additional argument
66 which specifies the code set to use.
67 If not specified, it defaults to the two-letter codes.
68 This is partly for backwards compatibility (previous versions
69 of Locale modules only supported the alpha-2 codes), and
70 partly because they are the most widely used codes.
72 The alpha-2 and alpha-3 codes are not case-dependent,
73 so you can use 'BO', 'Bo', 'bO' or 'bo' for Tibetan.
74 When a code is returned by one of the functions in
75 this module, it will always be lower-case.
79 The standard defines various special codes.
85 The standard reserves codes in the ranges B<qa> - B<qt>,
86 B<qaa> - B<qat>, and B<900> - B<919>, for private use.
90 B<zx>, B<zxx>, and B<997>, are the codes for unwritten languages.
94 B<zy>, B<zyy>, and B<998>, are the codes for an undetermined script.
98 B<zz>, B<zzz>, and B<999>, are the codes for an uncoded script.
102 The private codes are not recognised by Locale::Script,
107 #-----------------------------------------------------------------------
111 use Locale::Constants;
114 #-----------------------------------------------------------------------
115 # Public Global Variables
116 #-----------------------------------------------------------------------
117 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
118 $VERSION = sprintf("%d.%02d", q$Revision: 2.0 $ =~ /(\d+)\.(\d+)/);
120 @EXPORT = qw(code2script script2code
121 all_script_codes all_script_names
123 LOCALE_CODE_ALPHA_2 LOCALE_CODE_ALPHA_3 LOCALE_CODE_NUMERIC);
125 #-----------------------------------------------------------------------
126 # Private Global Variables
127 #-----------------------------------------------------------------------
132 #=======================================================================
134 =head1 CONVERSION ROUTINES
136 There are three conversion routines: C<code2script()>, C<script2code()>,
137 and C<script_code2code()>.
141 =item code2script( CODE, [ CODESET ] )
143 This function takes a script code and returns a string
144 which contains the name of the script identified.
145 If the code is not a valid script code, as defined by ISO 15924,
146 then C<undef> will be returned:
148 $script = code2script('cy'); # Cyrillic
150 =item script2code( STRING, [ CODESET ] )
152 This function takes a script name and returns the corresponding
153 script code, if such exists.
154 If the argument could not be identified as a script name,
155 then C<undef> will be returned:
157 $code = script2code('Gothic', LOCALE_CODE_ALPHA_3);
158 # $code will now be 'gth'
160 The case of the script name is not important.
161 See the section L<KNOWN BUGS AND LIMITATIONS> below.
163 =item script_code2code( CODE, CODESET, CODESET )
165 This function takes a script code from one code set,
166 and returns the corresponding code from another code set.
168 $alpha2 = script_code2code('jwi',
169 LOCALE_CODE_ALPHA_3 => LOCALE_CODE_ALPHA_2);
170 # $alpha2 will now be 'jw' (Javanese)
172 If the code passed is not a valid script code in
173 the first code set, or if there isn't a code for the
174 corresponding script in the second code set,
175 then C<undef> will be returned.
181 #=======================================================================
185 my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
188 return undef unless defined $code;
190 #-------------------------------------------------------------------
191 # Make sure the code is in the right form before we use it
192 # to look up the corresponding script.
193 # We have to sprintf because the codes are given as 3-digits,
194 # with leading 0's. Eg 070 for Egyptian demotic.
195 #-------------------------------------------------------------------
196 if ($codeset == LOCALE_CODE_NUMERIC)
198 return undef if ($code =~ /\D/);
199 $code = sprintf("%.3d", $code);
206 if (exists $CODES->[$codeset]->{$code})
208 return $CODES->[$codeset]->{$code};
212 #---------------------------------------------------------------
213 # no such script code!
214 #---------------------------------------------------------------
222 my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
225 return undef unless defined $script;
226 $script = lc($script);
227 if (exists $COUNTRIES->[$codeset]->{$script})
229 return $COUNTRIES->[$codeset]->{$script};
233 #---------------------------------------------------------------
235 #---------------------------------------------------------------
242 (@_ == 3) or croak "script_code2code() takes 3 arguments!";
251 return undef if $inset == $outset;
252 $script = code2script($code, $inset);
253 return undef if not defined $script;
254 $outcode = script2code($script, $outset);
258 #=======================================================================
260 =head1 QUERY ROUTINES
262 There are two functions which can be used to obtain a list of all codes,
267 =item C<all_script_codes ( [ CODESET ] )>
269 Returns a list of all script codes in the specified code set (two-letter codes by default).
270 The codes are guaranteed to be all lower-case,
271 and not in any particular order.
273 =item C<all_script_names ( [ CODESET ] )>
275 Returns a list of all script names for which there is a corresponding
276 script code in the specified code set.
277 The names are capitalised, and not returned in any particular order.
283 #=======================================================================
286 my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
288 return keys %{ $CODES->[$codeset] };
293 my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
295 return values %{ $CODES->[$codeset] };
299 #-----------------------------------------------------------------------
303 The following example illustrates use of the C<code2script()> function.
304 The user is prompted for a script code, and then told the corresponding
307 $| = 1; # turn off buffering
309 print "Enter script code: ";
310 chomp($code = <STDIN>);
311 $script = code2script($code, LOCALE_CODE_ALPHA_2);
314 print "$code = $script\n";
318 print "'$code' is not a valid script code!\n";
322 =head1 KNOWN BUGS AND LIMITATIONS
328 When using C<script2code()>, the script name must currently appear
329 exactly as it does in the source of the module. For example,
331 script2code('Egyptian hieroglyphs')
333 will return B<eg>, as expected. But the following will all return C<undef>:
335 script2code('hieroglyphs')
336 script2code('Egyptian Hieroglypics')
338 If there's need for it, a future version could have variants
343 In the current implementation, all data is read in when the
344 module is loaded, and then held in memory.
345 A lazy implementation would be more memory friendly.
353 =item Locale::Language
355 ISO two letter codes for identification of language (ISO 639).
357 =item Locale::Currency
359 ISO three letter codes for identification of currencies
360 and funds (ISO 4217).
362 =item Locale::Country
364 ISO three letter codes for identification of countries (ISO 3166)
368 The ISO standard which defines these codes.
370 =item http://www.evertype.com/standards/iso15924/
372 Home page for ISO 15924.
380 Neil Bowers E<lt>neil@bowers.comE<gt>
384 Copyright (c) 2002 Neil Bowers.
386 This module is free software; you can redistribute it and/or
387 modify it under the same terms as Perl itself.
391 #-----------------------------------------------------------------------
393 #=======================================================================
394 # initialisation code - stuff the DATA into the $CODES and $COUNTRIES lookup tables
395 #=======================================================================
397 my ($alpha2, $alpha3, $numeric);
405 ($alpha2, $alpha3, $numeric, $script) = split(/:/, $_, 4);
407 $CODES->[LOCALE_CODE_ALPHA_2]->{$alpha2} = $script;
408 $COUNTRIES->[LOCALE_CODE_ALPHA_2]->{"\L$script"} = $alpha2;
412 $CODES->[LOCALE_CODE_ALPHA_3]->{$alpha3} = $script;
413 $COUNTRIES->[LOCALE_CODE_ALPHA_3]->{"\L$script"} = $alpha3;
418 $CODES->[LOCALE_CODE_NUMERIC]->{$numeric} = $script;
419 $COUNTRIES->[LOCALE_CODE_NUMERIC]->{"\L$script"} = $numeric;
431 bh:bhm:300:Brahmi (Ashoka)
438 bu:bug:367:Buginese (Makassar)
439 by:bys:550:Blissymbols
441 ch:chu:221:Old Church Slavonic
443 cm:cmn:402:Cypro-Minoan
445 cp:cpr:403:Cypriote syllabary
447 ds:dsr:250:Deserel (Mormon)
448 dv:dvn:315:Devanagari (Nagari)
449 ed:egd:070:Egyptian demotic
450 eg:egy:050:Egyptian hieroglyphs
451 eh:egh:060:Egyptian hieratic
453 eo:eos:210:Etruscan and Oscan
455 gl:glg:225:Glagolitic
459 ha:han:500:Han ideographs
462 hm:hmo:450:Pahawh Hmong
465 hu:hun:176:Old Hungarian runic
466 hv:hvn:175:Kok Turki runic
468 iv:ivl:610:Indus Valley
469 ja:jap:930:(alias for Han + Hiragana + Katakana)
470 jl:jlg:445:Cherokee syllabary
472 ka:kam:241:Georgian (Mxedruli)
473 kh:khn:931:(alias for Hangul + Han)
477 kr:krn:357:Karenni (Kayah Li)
478 ks:kst:305:Kharoshthi
479 kx:kax:240:Georgian (Xucuri)
481 lf:laf:215:Latin (Fraktur variant)
482 lg:lag:216:Latin (Gaelic variant)
484 lp:lpc:335:Lepcha (Rong)
487 mh:may:090:Mayan hieroglyphs
496 ph:phx:115:Phoenician
498 pl:pld:282:Pollard Phonetic
499 pq:pqd:295:Klingon plQaD
500 pr:prm:227:Old Permic
501 ps:pst:600:Phaistos Disk
502 rn:rnr:211:Runic (Germanic)
503 rr:rro:620:Rongo-rongo
504 sa:sar:110:South Arabian
506 sj:syj:137:Syriac (Jacobite variant)
507 sl:slb:440:Unified Canadian Aboriginal Syllabics
508 sn:syn:136:Syriac (Nestorian variant)
509 sw:sww:281:Shavian (Shaw)
510 sy:syr:135:Syriac (Estrangelo)
520 vs:vsp:280:Visible Speech
521 xa:xas:000:Cuneiform, Sumero-Akkadian
522 xf:xfa:105:Cuneiform, Old Persian
523 xk:xkn:412:(alias for Hiragana + Katakana)
524 xu:xug:106:Cuneiform, Ugaritic
526 zx:zxx:997:Unwritten language
527 zy:zyy:998:Undetermined script
528 zz:zzz:999:Uncoded script