More forgiving version comparison in perlivp.
[p5sagit/p5-mst-13.2.git] / lib / I18N / LangTags.pm
CommitLineData
4b053158 1
537c2f98 2# Time-stamp: "2003-10-10 17:43:04 ADT"
4b053158 3# Sean M. Burke <sburke@cpan.org>
4
5require 5.000;
6package I18N::LangTags;
7use strict;
21aeefd5 8use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
4b053158 9require Exporter;
4b053158 10@ISA = qw(Exporter);
11@EXPORT = qw();
12@EXPORT_OK = qw(is_language_tag same_language_tag
13 extract_language_tags super_languages
14 similarity_language_tag is_dialect_of
15 locale2language_tag alternate_language_tags
21aeefd5 16 encode_language_tag panic_languages
4b053158 17 );
21aeefd5 18%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
4b053158 19
537c2f98 20$VERSION = "0.29";
4b053158 21
22=head1 NAME
23
24I18N::LangTags - functions for dealing with RFC3066-style language tags
25
26=head1 SYNOPSIS
27
28 use I18N::LangTags qw(is_language_tag same_language_tag
29 extract_language_tags super_languages
30 similarity_language_tag is_dialect_of
31 locale2language_tag alternate_language_tags
21aeefd5 32 encode_language_tag panic_languages
4b053158 33 );
34
35...or whatever of those functions you want to import. Those are
36all the exportable functions -- you're free to import only some,
21aeefd5 37or none at all. By default, none are imported. If you say:
38
39 use I18N::LangTags qw(:ALL)
40
41...then all are exported. (This saves you from having to use
42something less obvious like C<use I18N::LangTags qw(/./)>.)
4b053158 43
44If you don't import any of these functions, assume a C<&I18N::LangTags::>
45in front of all the function names in the following examples.
46
47=head1 DESCRIPTION
48
49Language tags are a formalism, described in RFC 3066 (obsoleting
501766), for declaring what language form (language and possibly
51dialect) a given chunk of information is in.
52
53This library provides functions for common tasks involving language
54tags as they are needed in a variety of protocols and applications.
55
56Please see the "See Also" references for a thorough explanation
57of how to correctly use language tags.
58
59=over
60
61=cut
62
63###########################################################################
64
65=item * the function is_language_tag($lang1)
66
67Returns true iff $lang1 is a formally valid language tag.
68
69 is_language_tag("fr") is TRUE
70 is_language_tag("x-jicarilla") is FALSE
71 (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
72
73 is_language_tag("sgn-US") is TRUE
74 (That's American Sign Language)
75
76 is_language_tag("i-Klikitat") is TRUE
77 (True without regard to the fact noone has actually
78 registered Klikitat -- it's a formally valid tag)
79
80 is_language_tag("fr-patois") is TRUE
81 (Formally valid -- altho descriptively weak!)
82
83 is_language_tag("Spanish") is FALSE
84 is_language_tag("french-patois") is FALSE
85 (No good -- first subtag has to match
86 /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
87
88 is_language_tag("x-borg-prot2532") is TRUE
89 (Yes, subtags can contain digits, as of RFC3066)
90
91=cut
92
93sub is_language_tag {
94
95 ## Changes in the language tagging standards may have to be reflected here.
96
97 my($tag) = lc($_[0]);
98
99 return 0 if $tag eq "i" or $tag eq "x";
21aeefd5 100 # Bad degenerate cases that the following
4b053158 101 # regexp would erroneously let pass
102
103 return $tag =~
104 /^(?: # First subtag
105 [xi] | [a-z]{2,3}
106 )
107 (?: # Subtags thereafter
108 - # separator
109 [a-z0-9]{1,8} # subtag
110 )*
111 $/xs ? 1 : 0;
112}
113
114###########################################################################
115
116=item * the function extract_language_tags($whatever)
117
118Returns a list of whatever looks like formally valid language tags
119in $whatever. Not very smart, so don't get too creative with
120what you want to feed it.
121
122 extract_language_tags("fr, fr-ca, i-mingo")
123 returns: ('fr', 'fr-ca', 'i-mingo')
124
125 extract_language_tags("It's like this: I'm in fr -- French!")
126 returns: ('It', 'in', 'fr')
127 (So don't just feed it any old thing.)
128
129The output is untainted. If you don't know what tainting is,
130don't worry about it.
131
132=cut
133
134sub extract_language_tags {
135
136 ## Changes in the language tagging standards may have to be reflected here.
137
138 my($text) =
139 $_[0] =~ m/(.+)/ # to make for an untainted result
140 ? $1 : ''
141 ;
142
143 return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
144 $text =~
145 m/
146 \b
147 (?: # First subtag
148 [iIxX] | [a-zA-Z]{2,3}
149 )
150 (?: # Subtags thereafter
151 - # separator
152 [a-zA-Z0-9]{1,8} # subtag
153 )*
154 \b
155 /xsg
156 );
157}
158
159###########################################################################
160
161=item * the function same_language_tag($lang1, $lang2)
162
163Returns true iff $lang1 and $lang2 are acceptable variant tags
164representing the same language-form.
165
166 same_language_tag('x-kadara', 'i-kadara') is TRUE
167 (The x/i- alternation doesn't matter)
168 same_language_tag('X-KADARA', 'i-kadara') is TRUE
169 (...and neither does case)
170 same_language_tag('en', 'en-US') is FALSE
171 (all-English is not the SAME as US English)
172 same_language_tag('x-kadara', 'x-kadar') is FALSE
173 (these are totally unrelated tags)
21aeefd5 174 same_language_tag('no-bok', 'nb') is TRUE
175 (no-bok is a legacy tag for nb (Norwegian Bokmal))
4b053158 176
177C<same_language_tag> works by just seeing whether
178C<encode_language_tag($lang1)> is the same as
179C<encode_language_tag($lang2)>.
180
181(Yes, I know this function is named a bit oddly. Call it historic
182reasons.)
183
184=cut
185
186sub same_language_tag {
187 my $el1 = &encode_language_tag($_[0]);
188 return 0 unless defined $el1;
189 # this avoids the problem of
190 # encode_language_tag($lang1) eq and encode_language_tag($lang2)
191 # being true if $lang1 and $lang2 are both undef
192
193 return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
194}
195
196###########################################################################
197
198=item * the function similarity_language_tag($lang1, $lang2)
199
200Returns an integer representing the degree of similarity between
201tags $lang1 and $lang2 (the order of which does not matter), where
202similarity is the number of common elements on the left,
203without regard to case and to x/i- alternation.
204
205 similarity_language_tag('fr', 'fr-ca') is 1
206 (one element in common)
207 similarity_language_tag('fr-ca', 'fr-FR') is 1
208 (one element in common)
209
210 similarity_language_tag('fr-CA-joual',
211 'fr-CA-PEI') is 2
212 similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
213 (two elements in common)
214
215 similarity_language_tag('x-kadara', 'i-kadara') is 1
216 (x/i- doesn't matter)
217
218 similarity_language_tag('en', 'x-kadar') is 0
219 similarity_language_tag('x-kadara', 'x-kadar') is 0
220 (unrelated tags -- no similarity)
221
222 similarity_language_tag('i-cree-syllabic',
223 'i-cherokee-syllabic') is 0
224 (no B<leftmost> elements in common!)
225
226=cut
227
228sub similarity_language_tag {
229 my $lang1 = &encode_language_tag($_[0]);
230 my $lang2 = &encode_language_tag($_[1]);
21aeefd5 231 # And encode_language_tag takes care of the whole
232 # no-nyn==nn, i-hakka==zh-hakka, etc, things
233
4b053158 234 # NB: (i-sil-...)? (i-sgn-...)?
235
236 return undef if !defined($lang1) and !defined($lang2);
237 return 0 if !defined($lang1) or !defined($lang2);
238
239 my @l1_subtags = split('-', $lang1);
240 my @l2_subtags = split('-', $lang2);
241 my $similarity = 0;
242
243 while(@l1_subtags and @l2_subtags) {
244 if(shift(@l1_subtags) eq shift(@l2_subtags)) {
245 ++$similarity;
246 } else {
247 last;
248 }
249 }
250 return $similarity;
251}
252
253###########################################################################
254
255=item * the function is_dialect_of($lang1, $lang2)
256
21aeefd5 257Returns true iff language tag $lang1 represents a subform of
4b053158 258language tag $lang2.
259
260B<Get the order right! It doesn't work the other way around!>
261
262 is_dialect_of('en-US', 'en') is TRUE
263 (American English IS a dialect of all-English)
264
265 is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
266 is_dialect_of('fr-CA-joual', 'fr') is TRUE
267 (Joual is a dialect of (a dialect of) French)
268
269 is_dialect_of('en', 'en-US') is FALSE
270 (all-English is a NOT dialect of American English)
271
272 is_dialect_of('fr', 'en-CA') is FALSE
273
21aeefd5 274 is_dialect_of('en', 'en' ) is TRUE
275 is_dialect_of('en-US', 'en-US') is TRUE
4b053158 276 (B<Note:> these are degenerate cases)
277
278 is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
279 (the x/i thing doesn't matter, nor does case)
280
21aeefd5 281 is_dialect_of('nn', 'no') is TRUE
282 (because 'nn' (New Norse) is aliased to 'no-nyn',
283 as a special legacy case, and 'no-nyn' is a
284 subform of 'no' (Norwegian))
285
4b053158 286=cut
287
288sub is_dialect_of {
289
290 my $lang1 = &encode_language_tag($_[0]);
291 my $lang2 = &encode_language_tag($_[1]);
292
293 return undef if !defined($lang1) and !defined($lang2);
294 return 0 if !defined($lang1) or !defined($lang2);
295
296 return 1 if $lang1 eq $lang2;
297 return 0 if length($lang1) < length($lang2);
298
299 $lang1 .= '-';
300 $lang2 .= '-';
301 return
302 (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
303}
304
305###########################################################################
306
307=item * the function super_languages($lang1)
308
309Returns a list of language tags that are superordinate tags to $lang1
310-- it gets this by removing subtags from the end of $lang1 until
311nothing (or just "i" or "x") is left.
312
313 super_languages("fr-CA-joual") is ("fr-CA", "fr")
314
315 super_languages("en-AU") is ("en")
316
317 super_languages("en") is empty-list, ()
318
319 super_languages("i-cherokee") is empty-list, ()
320 ...not ("i"), which would be illegal as well as pointless.
321
322If $lang1 is not a valid language tag, returns empty-list in
323a list context, undef in a scalar context.
324
325A notable and rather unavoidable problem with this method:
326"x-mingo-tom" has an "x" because the whole tag isn't an
327IANA-registered tag -- but super_languages('x-mingo-tom') is
328('x-mingo') -- which isn't really right, since 'i-mingo' is
329registered. But this module has no way of knowing that. (But note
330that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
331
332More importantly, you assume I<at your peril> that superordinates of
333$lang1 are mutually intelligible with $lang1. Consider this
334carefully.
335
537c2f98 336=cut
4b053158 337
338sub super_languages {
339 my $lang1 = $_[0];
340 return() unless defined($lang1) && &is_language_tag($lang1);
21aeefd5 341
342 # a hack for those annoying new (2001) tags:
343 $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
344 $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
345 $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
346 # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
347
4b053158 348 my @l1_subtags = split('-', $lang1);
349
350 ## Changes in the language tagging standards may have to be reflected here.
351
352 # NB: (i-sil-...)?
353
354 my @supers = ();
355 foreach my $bit (@l1_subtags) {
356 push @supers,
357 scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
358 }
359 pop @supers if @supers;
360 shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
361 return reverse @supers;
362}
363
364###########################################################################
365
366=item * the function locale2language_tag($locale_identifier)
367
368This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
369and maps it to a language tag. If it's not mappable (as with,
370notably, "C" and "POSIX"), this returns empty-list in a list context,
371or undef in a scalar context.
372
373 locale2language_tag("en") is "en"
374
375 locale2language_tag("en_US") is "en-US"
376
377 locale2language_tag("en_US.ISO8859-1") is "en-US"
378
379 locale2language_tag("C") is undef or ()
380
381 locale2language_tag("POSIX") is undef or ()
382
383 locale2language_tag("POSIX") is undef or ()
384
385I'm not totally sure that locale names map satisfactorily to language
386tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
387
388The output is untainted. If you don't know what tainting is,
389don't worry about it.
390
537c2f98 391=cut
4b053158 392
393sub locale2language_tag {
394 my $lang =
395 $_[0] =~ m/(.+)/ # to make for an untainted result
396 ? $1 : ''
397 ;
398
399 return $lang if &is_language_tag($lang); # like "en"
400
401 $lang =~ tr<_><->; # "en_US" -> en-US
402 $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US
403
404 return $lang if &is_language_tag($lang);
405
406 return;
407}
408
409###########################################################################
410
411=item * the function encode_language_tag($lang1)
412
413This function, if given a language tag, returns an encoding of it such
414that:
415
416* tags representing different languages never get the same encoding.
417
418* tags representing the same language always get the same encoding.
419
420* an encoding of a formally valid language tag always is a string
421value that is defined, has length, and is true if considered as a
422boolean.
423
424Note that the encoding itself is B<not> a formally valid language tag.
425Note also that you cannot, currently, go from an encoding back to a
426language tag that it's an encoding of.
427
428Note also that you B<must> consider the encoded value as atomic; i.e.,
429you should not consider it as anything but an opaque, unanalysable
430string value. (The internals of the encoding method may change in
431future versions, as the language tagging standard changes over time.)
432
433C<encode_language_tag> returns undef if given anything other than a
434formally valid language tag.
435
436The reason C<encode_language_tag> exists is because different language
437tags may represent the same language; this is normally treatable with
438C<same_language_tag>, but consider this situation:
439
440You have a data file that expresses greetings in different languages.
441Its format is "[language tag]=[how to say 'Hello']", like:
442
443 en-US=Hiho
444 fr=Bonjour
445 i-mingo=Hau'
446
447And suppose you write a program that reads that file and then runs as
448a daemon, answering client requests that specify a language tag and
449then expect the string that says how to greet in that language. So an
450interaction looks like:
451
452 greeting-client asks: fr
453 greeting-server answers: Bonjour
454
455So far so good. But suppose the way you're implementing this is:
456
457 my %greetings;
458 die unless open(IN, "<in.dat");
459 while(<IN>) {
460 chomp;
461 next unless /^([^=]+)=(.+)/s;
462 my($lang, $expr) = ($1, $2);
463 $greetings{$lang} = $expr;
464 }
465 close(IN);
466
467at which point %greetings has the contents:
468
469 "en-US" => "Hiho"
470 "fr" => "Bonjour"
471 "i-mingo" => "Hau'"
472
473And suppose then that you answer client requests for language $wanted
474by just looking up $greetings{$wanted}.
475
476If the client asks for "fr", that will look up successfully in
477%greetings, to the value "Bonjour". And if the client asks for
478"i-mingo", that will look up successfully in %greetings, to the value
479"Hau'".
480
481But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
482lookup in %greetings fails. That's the Wrong Thing.
483
484You could instead do lookups on $wanted with:
485
486 use I18N::LangTags qw(same_language_tag);
487 my $repsonse = '';
488 foreach my $l2 (keys %greetings) {
489 if(same_language_tag($wanted, $l2)) {
490 $response = $greetings{$l2};
491 last;
492 }
493 }
494
495But that's rather inefficient. A better way to do it is to start your
496program with:
497
498 use I18N::LangTags qw(encode_language_tag);
499 my %greetings;
500 die unless open(IN, "<in.dat");
501 while(<IN>) {
502 chomp;
503 next unless /^([^=]+)=(.+)/s;
504 my($lang, $expr) = ($1, $2);
505 $greetings{
506 encode_language_tag($lang)
507 } = $expr;
508 }
509 close(IN);
510
511and then just answer client requests for language $wanted by just
512looking up
513
514 $greetings{encode_language_tag($wanted)}
515
516And that does the Right Thing.
517
518=cut
519
520sub encode_language_tag {
521 # Only similarity_language_tag() is allowed to analyse encodings!
522
523 ## Changes in the language tagging standards may have to be reflected here.
524
21aeefd5 525 my($tag) = $_[0] || return undef;
4b053158 526 return undef unless &is_language_tag($tag);
21aeefd5 527
528 # For the moment, these legacy variances are few enough that
529 # we can just handle them here with regexps.
530 $tag =~ s/^iw\b/he/i; # Hebrew
531 $tag =~ s/^in\b/id/i; # Indonesian
aaf52a42 532 $tag =~ s/^cre\b/cr/i; # Cree
533 $tag =~ s/^jw\b/jv/i; # Javanese
21aeefd5 534 $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger
535 $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo
536 $tag =~ s/^ji\b/yi/i; # Yiddish
aaf52a42 537 # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now,
538 # but maybe they're all so obscure I can ignore them. "Obscure"
539 # meaning either that the language is obscure, and/or that the
540 # XXX form was extant so briefly that it's unlikely it was ever
541 # used. I hope.
21aeefd5 542 #
543 # These go FROM the simplex to complex form, to get
544 # similarity-comparison right. And that's okay, since
545 # similarity_language_tag is the only thing that
546 # analyzes our output.
547 $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka
548 $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal
549 $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk
4b053158 550
551 $tag =~ s/^[xiXI]-//s;
552 # Just lop off any leading "x/i-"
4b053158 553
21aeefd5 554 return "~" . uc($tag);
4b053158 555}
556
557#--------------------------------------------------------------------------
558
559=item * the function alternate_language_tags($lang1)
560
561This function, if given a language tag, returns all language tags that
21aeefd5 562are alternate forms of this language tag. (I.e., tags which refer to
563the same language.) This is meant to handle legacy tags caused by
564the minor changes in language tag standards over the years; and
565the x-/i- alternation is also dealt with.
566
567Note that this function does I<not> try to equate new (and never-used,
568and unusable)
569ISO639-2 three-letter tags to old (and still in use) ISO639-1
570two-letter equivalents -- like "ara" -> "ar" -- because
571"ara" has I<never> been in use as an Internet language tag,
572and RFC 3066 stipulates that it never should be, since a shorter
573tag ("ar") exists.
574
575Examples:
576
577 alternate_language_tags('no-bok') is ('nb')
578 alternate_language_tags('nb') is ('no-bok')
579 alternate_language_tags('he') is ('iw')
580 alternate_language_tags('iw') is ('he')
581 alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka')
582 alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka')
583 alternate_language_tags('en') is ()
584 alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
585 alternate_language_tags('x-klikitat') is ('i-klikitat')
586 alternate_language_tags('i-klikitat') is ('x-klikitat')
587
588This function returns empty-list if given anything other than a formally
4b053158 589valid language tag.
590
591=cut
592
593my %alt = qw( i x x i I X X I );
594sub alternate_language_tags {
4b053158 595 my $tag = $_[0];
596 return() unless &is_language_tag($tag);
597
21aeefd5 598 my @em; # push 'em real goood!
599
600 # For the moment, these legacy variances are few enough that
601 # we can just handle them here with regexps.
602
603 if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
604 } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1";
605
606 } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
607 } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
608
609 } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
610 } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
611
612 } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
613 } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1";
614
615 } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
616 } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1";
617
618 } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
619 } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
620
621 } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1";
622 } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
623
624 } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1";
625 } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
626 }
627
628 push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
629 return @em;
630}
631
632###########################################################################
633
634{
635 # Init %Panic...
636
637 my @panic = ( # MUST all be lowercase!
638 # Only large ("national") languages make it in this list.
639 # If you, as a user, are so bizarre that the /only/ language
640 # you claim to accept is Galician, then no, we won't do you
641 # the favor of providing Catalan as a panic-fallback for
642 # you. Because if I start trying to add "little languages" in
643 # here, I'll just go crazy.
644
4cf5bee0 645 # Scandinavian lgs. All based on opinion and hearsay.
646 'sv' => [qw(nb no da nn)],
647 'da' => [qw(nb no sv nn)], # I guess
648 [qw(no nn nb)], [qw(no nn nb sv da)],
649 'is' => [qw(da sv no nb nn)],
650 'fo' => [qw(da is no nb nn sv)], # I guess
21aeefd5 651
652 # I think this is about the extent of tolerable intelligibility
653 # among large modern Romance languages.
654 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
655 'ca' => [qw(es pt it fr)],
656 'es' => [qw(ca it fr pt)],
657 'it' => [qw(es fr ca pt)],
658 'fr' => [qw(es it ca pt)],
659
660 # Also assume that speakers of the main Indian languages prefer
661 # to read/hear Hindi over English
662 [qw(
663 as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
664 )] => 'hi',
665 # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
666 # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
667 # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
668 'hi' => [qw(bn pa as or)],
669 # I welcome finer data for the other Indian languages.
670 # E.g., what should Oriya's list be, besides just Hindi?
671
672 # And the panic languages for English is, of course, nil!
673
674 # My guesses at Slavic intelligibility:
675 ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian
676 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
677 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
678
679 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
680
681 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
682
683 #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
684
685 );
686 my($k,$v);
687 while(@panic) {
688 ($k,$v) = splice(@panic,0,2);
689 foreach my $k (ref($k) ? @$k : $k) {
690 foreach my $v (ref($v) ? @$v : $v) {
691 push @{$Panic{$k} ||= []}, $v unless $k eq $v;
692 }
693 }
694 }
695}
696
697=item * the function @langs = panic_languages(@accept_languages)
698
699This function takes a list of 0 or more language
700tags that constitute a given user's Accept-Language list, and
701returns a list of tags for I<other> (non-super)
702languages that are probably acceptable to the user, to be
703used I<if all else fails>.
704
705For example, if a user accepts only 'ca' (Catalan) and
706'es' (Spanish), and the documents/interfaces you have
707available are just in German, Italian, and Chinese, then
708the user will most likely want the Italian one (and not
709the Chinese or German one!), instead of getting
710nothing. So C<panic_languages('ca', 'es')> returns
711a list containing 'it' (Italian).
712
713English ('en') is I<always> in the return list, but
714whether it's at the very end or not depends
715on the input languages. This function works by consulting
716an internal table that stipulates what common
717languages are "close" to each other.
718
719A useful construct you might consider using is:
720
721 @fallbacks = super_languages(@accept_languages);
722 push @fallbacks, panic_languages(
723 @accept_languages, @fallbacks,
724 );
725
726=cut
4b053158 727
21aeefd5 728sub panic_languages {
729 # When in panic or in doubt, run in circles, scream, and shout!
730 my(@out, %seen);
731 foreach my $t (@_) {
732 next unless $t;
733 next if $seen{$t}++; # so we don't return it or hit it again
734 # push @out, super_languages($t); # nah, keep that separate
735 push @out, @{ $Panic{lc $t} || next };
4b053158 736 }
21aeefd5 737 return grep !$seen{$_}++, @out, 'en';
4b053158 738}
739
740###########################################################################
21aeefd5 7411;
742__END__
4b053158 743
744=back
745
746=head1 ABOUT LOWERCASING
747
748I've considered making all the above functions that output language
749tags return all those tags strictly in lowercase. Having all your
750language tags in lowercase does make some things easier. But you
751might as well just lowercase as you like, or call
752C<encode_language_tag($lang1)> where appropriate.
753
754=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
755
756In some future version of I18N::LangTags, I plan to include support
757for RFC2482-style language tags -- which are basically just normal
758language tags with their ASCII characters shifted into Plane 14.
759
760=head1 SEE ALSO
761
e7525a17 762* L<I18N::LangTags::List|I18N::LangTags::List>
763
4b053158 764* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
765Identification of Languages". (Obsoletes RFC 1766)
766
767* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
768Character Sets and Languages".
769
770* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
771Value and Encoded Word Extensions: Character Sets, Languages, and
772Continuations".
773
537c2f98 774* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
4b053158 775"Language Tagging in Unicode Plain Text".
776
777* Locale::Codes, in
f70da2ef 778C<http://www.perl.com/CPAN/modules/by-module/Locale/>
4b053158 779
4b053158 780* ISO 639-2, "Codes for the representation of names of languages",
aaf52a42 781including two-letter and three-letter codes,
782C<http://www.loc.gov/standards/iso639-2/langcodes.html>
4b053158 783
784* The IANA list of registered languages (hopefully up-to-date),
aaf52a42 785C<http://www.iana.org/assignments/language-tags>
4b053158 786
787=head1 COPYRIGHT
788
aaf52a42 789Copyright (c) 1998-2003 Sean M. Burke. All rights reserved.
4b053158 790
791This library is free software; you can redistribute it and/or
792modify it under the same terms as Perl itself.
793
794The programs and documentation in this dist are distributed in
795the hope that they will be useful, but without any warranty; without
796even the implied warranty of merchantability or fitness for a
797particular purpose.
798
799=head1 AUTHOR
800
801Sean M. Burke C<sburke@cpan.org>
802
803=cut
804