2 # Time-stamp: "2001-05-27 19:53:11 MDT"
3 # Sean M. Burke <sburke@cpan.org>
6 package I18N::LangTags;
8 use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug
13 @EXPORT_OK = qw(is_language_tag same_language_tag
14 extract_language_tags super_languages
15 similarity_language_tag is_dialect_of
16 locale2language_tag alternate_language_tags
24 I18N::LangTags - functions for dealing with RFC3066-style language tags
28 use I18N::LangTags qw(is_language_tag same_language_tag
29 extract_language_tags super_languages
30 similarity_language_tag is_dialect_of
31 locale2language_tag alternate_language_tags
35 ...or whatever of those functions you want to import. Those are
36 all the exportable functions -- you're free to import only some,
37 or none at all. By default, none are imported.
39 If you don't import any of these functions, assume a C<&I18N::LangTags::>
40 in front of all the function names in the following examples.
44 Language tags are a formalism, described in RFC 3066 (obsoleting
45 1766), for declaring what language form (language and possibly
46 dialect) a given chunk of information is in.
48 This library provides functions for common tasks involving language
49 tags as they are needed in a variety of protocols and applications.
51 Please see the "See Also" references for a thorough explanation
52 of how to correctly use language tags.
58 ###########################################################################
60 =item * the function is_language_tag($lang1)
62 Returns true iff $lang1 is a formally valid language tag.
64 is_language_tag("fr") is TRUE
65 is_language_tag("x-jicarilla") is FALSE
66 (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
68 is_language_tag("sgn-US") is TRUE
69 (That's American Sign Language)
71 is_language_tag("i-Klikitat") is TRUE
72 (True without regard to the fact noone has actually
73 registered Klikitat -- it's a formally valid tag)
75 is_language_tag("fr-patois") is TRUE
76 (Formally valid -- altho descriptively weak!)
78 is_language_tag("Spanish") is FALSE
79 is_language_tag("french-patois") is FALSE
80 (No good -- first subtag has to match
81 /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
83 is_language_tag("x-borg-prot2532") is TRUE
84 (Yes, subtags can contain digits, as of RFC3066)
90 ## Changes in the language tagging standards may have to be reflected here.
94 return 0 if $tag eq "i" or $tag eq "x";
95 # Bad degenerate cases the following
96 # regexp would erroneously let pass
102 (?: # Subtags thereafter
104 [a-z0-9]{1,8} # subtag
109 ###########################################################################
111 =item * the function extract_language_tags($whatever)
113 Returns a list of whatever looks like formally valid language tags
114 in $whatever. Not very smart, so don't get too creative with
115 what you want to feed it.
117 extract_language_tags("fr, fr-ca, i-mingo")
118 returns: ('fr', 'fr-ca', 'i-mingo')
120 extract_language_tags("It's like this: I'm in fr -- French!")
121 returns: ('It', 'in', 'fr')
122 (So don't just feed it any old thing.)
124 The output is untainted. If you don't know what tainting is,
125 don't worry about it.
129 sub extract_language_tags {
131 ## Changes in the language tagging standards may have to be reflected here.
134 $_[0] =~ m/(.+)/ # to make for an untainted result
138 return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
143 [iIxX] | [a-zA-Z]{2,3}
145 (?: # Subtags thereafter
147 [a-zA-Z0-9]{1,8} # subtag
154 ###########################################################################
156 =item * the function same_language_tag($lang1, $lang2)
158 Returns true iff $lang1 and $lang2 are acceptable variant tags
159 representing the same language-form.
161 same_language_tag('x-kadara', 'i-kadara') is TRUE
162 (The x/i- alternation doesn't matter)
163 same_language_tag('X-KADARA', 'i-kadara') is TRUE
164 (...and neither does case)
165 same_language_tag('en', 'en-US') is FALSE
166 (all-English is not the SAME as US English)
167 same_language_tag('x-kadara', 'x-kadar') is FALSE
168 (these are totally unrelated tags)
170 C<same_language_tag> works by just seeing whether
171 C<encode_language_tag($lang1)> is the same as
172 C<encode_language_tag($lang2)>.
174 (Yes, I know this function is named a bit oddly. Call it historic
179 sub same_language_tag {
180 my $el1 = &encode_language_tag($_[0]);
181 return 0 unless defined $el1;
182 # this avoids the problem of
183 # encode_language_tag($lang1) eq and encode_language_tag($lang2)
184 # being true if $lang1 and $lang2 are both undef
186 return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
189 ###########################################################################
191 =item * the function similarity_language_tag($lang1, $lang2)
193 Returns an integer representing the degree of similarity between
194 tags $lang1 and $lang2 (the order of which does not matter), where
195 similarity is the number of common elements on the left,
196 without regard to case and to x/i- alternation.
198 similarity_language_tag('fr', 'fr-ca') is 1
199 (one element in common)
200 similarity_language_tag('fr-ca', 'fr-FR') is 1
201 (one element in common)
203 similarity_language_tag('fr-CA-joual',
205 similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
206 (two elements in common)
208 similarity_language_tag('x-kadara', 'i-kadara') is 1
209 (x/i- doesn't matter)
211 similarity_language_tag('en', 'x-kadar') is 0
212 similarity_language_tag('x-kadara', 'x-kadar') is 0
213 (unrelated tags -- no similarity)
215 similarity_language_tag('i-cree-syllabic',
216 'i-cherokee-syllabic') is 0
217 (no B<leftmost> elements in common!)
221 sub similarity_language_tag {
222 my $lang1 = &encode_language_tag($_[0]);
223 my $lang2 = &encode_language_tag($_[1]);
225 # NB: (i-sil-...)? (i-sgn-...)?
227 return undef if !defined($lang1) and !defined($lang2);
228 return 0 if !defined($lang1) or !defined($lang2);
230 my @l1_subtags = split('-', $lang1);
231 my @l2_subtags = split('-', $lang2);
234 while(@l1_subtags and @l2_subtags) {
235 if(shift(@l1_subtags) eq shift(@l2_subtags)) {
244 ###########################################################################
246 =item * the function is_dialect_of($lang1, $lang2)
248 Returns true iff language tag $lang1 represents a subdialect of
251 B<Get the order right! It doesn't work the other way around!>
253 is_dialect_of('en-US', 'en') is TRUE
254 (American English IS a dialect of all-English)
256 is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
257 is_dialect_of('fr-CA-joual', 'fr') is TRUE
258 (Joual is a dialect of (a dialect of) French)
260 is_dialect_of('en', 'en-US') is FALSE
261 (all-English is a NOT dialect of American English)
263 is_dialect_of('fr', 'en-CA') is FALSE
265 is_dialect_of('en', 'en' ) is TRUE
266 is_dialect_of('en-US', 'en-US') is TRUE
267 (B<Note:> these are degenerate cases)
269 is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
270 (the x/i thing doesn't matter, nor does case)
276 my $lang1 = &encode_language_tag($_[0]);
277 my $lang2 = &encode_language_tag($_[1]);
279 return undef if !defined($lang1) and !defined($lang2);
280 return 0 if !defined($lang1) or !defined($lang2);
282 return 1 if $lang1 eq $lang2;
283 return 0 if length($lang1) < length($lang2);
288 (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
291 ###########################################################################
293 =item * the function super_languages($lang1)
295 Returns a list of language tags that are superordinate tags to $lang1
296 -- it gets this by removing subtags from the end of $lang1 until
297 nothing (or just "i" or "x") is left.
299 super_languages("fr-CA-joual") is ("fr-CA", "fr")
301 super_languages("en-AU") is ("en")
303 super_languages("en") is empty-list, ()
305 super_languages("i-cherokee") is empty-list, ()
306 ...not ("i"), which would be illegal as well as pointless.
308 If $lang1 is not a valid language tag, returns empty-list in
309 a list context, undef in a scalar context.
311 A notable and rather unavoidable problem with this method:
312 "x-mingo-tom" has an "x" because the whole tag isn't an
313 IANA-registered tag -- but super_languages('x-mingo-tom') is
314 ('x-mingo') -- which isn't really right, since 'i-mingo' is
315 registered. But this module has no way of knowing that. (But note
316 that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
318 More importantly, you assume I<at your peril> that superordinates of
319 $lang1 are mutually intelligible with $lang1. Consider this
324 sub super_languages {
326 return() unless defined($lang1) && &is_language_tag($lang1);
327 my @l1_subtags = split('-', $lang1);
329 ## Changes in the language tagging standards may have to be reflected here.
334 foreach my $bit (@l1_subtags) {
336 scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
338 pop @supers if @supers;
339 shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
340 return reverse @supers;
343 ###########################################################################
345 =item * the function locale2language_tag($locale_identifier)
347 This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
348 and maps it to a language tag. If it's not mappable (as with,
349 notably, "C" and "POSIX"), this returns empty-list in a list context,
350 or undef in a scalar context.
352 locale2language_tag("en") is "en"
354 locale2language_tag("en_US") is "en-US"
356 locale2language_tag("en_US.ISO8859-1") is "en-US"
358 locale2language_tag("C") is undef or ()
360 locale2language_tag("POSIX") is undef or ()
362 locale2language_tag("POSIX") is undef or ()
364 I'm not totally sure that locale names map satisfactorily to language
365 tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
367 The output is untainted. If you don't know what tainting is,
368 don't worry about it.
372 sub locale2language_tag {
374 $_[0] =~ m/(.+)/ # to make for an untainted result
378 return $lang if &is_language_tag($lang); # like "en"
380 $lang =~ tr<_><->; # "en_US" -> en-US
381 $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US
383 return $lang if &is_language_tag($lang);
388 ###########################################################################
390 =item * the function encode_language_tag($lang1)
392 This function, if given a language tag, returns an encoding of it such
395 * tags representing different languages never get the same encoding.
397 * tags representing the same language always get the same encoding.
399 * an encoding of a formally valid language tag always is a string
400 value that is defined, has length, and is true if considered as a
403 Note that the encoding itself is B<not> a formally valid language tag.
404 Note also that you cannot, currently, go from an encoding back to a
405 language tag that it's an encoding of.
407 Note also that you B<must> consider the encoded value as atomic; i.e.,
408 you should not consider it as anything but an opaque, unanalysable
409 string value. (The internals of the encoding method may change in
410 future versions, as the language tagging standard changes over time.)
412 C<encode_language_tag> returns undef if given anything other than a
413 formally valid language tag.
415 The reason C<encode_language_tag> exists is because different language
416 tags may represent the same language; this is normally treatable with
417 C<same_language_tag>, but consider this situation:
419 You have a data file that expresses greetings in different languages.
420 Its format is "[language tag]=[how to say 'Hello']", like:
426 And suppose you write a program that reads that file and then runs as
427 a daemon, answering client requests that specify a language tag and
428 then expect the string that says how to greet in that language. So an
429 interaction looks like:
431 greeting-client asks: fr
432 greeting-server answers: Bonjour
434 So far so good. But suppose the way you're implementing this is:
437 die unless open(IN, "<in.dat");
440 next unless /^([^=]+)=(.+)/s;
441 my($lang, $expr) = ($1, $2);
442 $greetings{$lang} = $expr;
446 at which point %greetings has the contents:
452 And suppose then that you answer client requests for language $wanted
453 by just looking up $greetings{$wanted}.
455 If the client asks for "fr", that will look up successfully in
456 %greetings, to the value "Bonjour". And if the client asks for
457 "i-mingo", that will look up successfully in %greetings, to the value
460 But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
461 lookup in %greetings fails. That's the Wrong Thing.
463 You could instead do lookups on $wanted with:
465 use I18N::LangTags qw(same_language_tag);
467 foreach my $l2 (keys %greetings) {
468 if(same_language_tag($wanted, $l2)) {
469 $response = $greetings{$l2};
474 But that's rather inefficient. A better way to do it is to start your
477 use I18N::LangTags qw(encode_language_tag);
479 die unless open(IN, "<in.dat");
482 next unless /^([^=]+)=(.+)/s;
483 my($lang, $expr) = ($1, $2);
485 encode_language_tag($lang)
490 and then just answer client requests for language $wanted by just
493 $greetings{encode_language_tag($wanted)}
495 And that does the Right Thing.
499 sub encode_language_tag {
500 # Only similarity_language_tag() is allowed to analyse encodings!
502 ## Changes in the language tagging standards may have to be reflected here.
504 my($tag) = uc($_[0]); # smash case
505 return undef unless &is_language_tag($tag);
506 # If it's not a language tag, its encoding is undef
508 $tag =~ s/^[xiXI]-//s;
509 # Just lop off any leading "x/i-"
510 # Or I suppose I could do s/^[xiXI]-/_/s or something.
515 #--------------------------------------------------------------------------
517 =item * the function alternate_language_tags($lang1)
519 This function, if given a language tag, returns all language tags that
520 are alternate forms of this language tag. (There is little
521 alternation in the C<current> language tagging formalism, but
522 extensions to the formalism are under consideration which could add a
523 great deal of alternation.)
525 Examples from the current formalism:
527 alternate_language_tags('en') is ()
528 alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
529 alternate_language_tags('x-klikitat') is ('i-klikitat')
530 alternate_language_tags('i-klikitat') is ('x-klikitat')
532 This function returns undef if given anything other than a formally
537 my %alt = qw( i x x i I X X I );
538 sub alternate_language_tags {
539 ## Changes in the language tagging standards may have to be reflected here.
541 return() unless &is_language_tag($tag);
543 # might as well preserve case
545 if($tag =~ /^([XIxi])(-.+)/) {
546 # This handles all the alternation that exists CURRENTLY
547 return($alt{$1} . $2);
552 ###########################################################################
556 =head1 ABOUT LOWERCASING
558 I've considered making all the above functions that output language
559 tags return all those tags strictly in lowercase. Having all your
560 language tags in lowercase does make some things easier. But you
561 might as well just lowercase as you like, or call
562 C<encode_language_tag($lang1)> where appropriate.
564 =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
566 In some future version of I18N::LangTags, I plan to include support
567 for RFC2482-style language tags -- which are basically just normal
568 language tags with their ASCII characters shifted into Plane 14.
572 * L<I18N::LangTags::List|I18N::LangTags::List>
574 * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
575 Identification of Languages". (Obsoletes RFC 1766)
577 * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
578 Character Sets and Languages".
580 * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
581 Value and Encoded Word Extensions: Character Sets, Languages, and
584 * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
585 "Language Tagging in Unicode Plain Text".
588 C<http://www.perl.com/CPAN/modules/by-module/Locale/>
590 * ISO 639, "Code for the representation of names of languages",
591 C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>
593 * ISO 639-2, "Codes for the representation of names of languages",
594 including three-letter codes,
595 C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>
597 * The IANA list of registered languages (hopefully up-to-date),
598 C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>
602 Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.
604 This library is free software; you can redistribute it and/or
605 modify it under the same terms as Perl itself.
607 The programs and documentation in this dist are distributed in
608 the hope that they will be useful, but without any warranty; without
609 even the implied warranty of merchantability or fitness for a
614 Sean M. Burke C<sburke@cpan.org>