lib/I18N/LangTags.pm

   1
   2 # Time-stamp: "2003-07-20 07:44:42 ADT"
   3 # Sean M. Burke <sburke@cpan.org>
   4
   5 require 5.000;
   6 package I18N::LangTags;
   7 use strict;
   8 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
   9 require Exporter;
  10 @ISA = qw(Exporter);
  11 @EXPORT = qw();
  12 @EXPORT_OK = qw(is_language_tag same_language_tag
  13                 extract_language_tags super_languages
  14                 similarity_language_tag is_dialect_of
  15                 locale2language_tag alternate_language_tags
  16                 encode_language_tag panic_languages
  17                );
  18 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
  19
  20 $VERSION = "0.28";
  21
  22 =head1 NAME
  23
  24 I18N::LangTags - functions for dealing with RFC3066-style language tags
  25
  26 =head1 SYNOPSIS
  27
  28     use I18N::LangTags qw(is_language_tag same_language_tag
  29                           extract_language_tags super_languages
  30                           similarity_language_tag is_dialect_of
  31                           locale2language_tag alternate_language_tags
  32                           encode_language_tag panic_languages
  33                          );
  34
  35 ...or whatever of those functions you want to import.  Those are
  36 all the exportable functions -- you're free to import only some,
  37 or none at all.  By default, none are imported.  If you say:
  38
  39     use I18N::LangTags qw(:ALL)
  40
  41 ...then all are exported.  (This saves you from having to use
  42 something less obvious like C<use I18N::LangTags qw(/./)>.)
  43
  44 If you don't import any of these functions, assume a C<&I18N::LangTags::>
  45 in front of all the function names in the following examples.
  46
  47 =head1 DESCRIPTION
  48
  49 Language tags are a formalism, described in RFC 3066 (obsoleting
  50 1766), for declaring what language form (language and possibly
  51 dialect) a given chunk of information is in.
  52
  53 This library provides functions for common tasks involving language
  54 tags as they are needed in a variety of protocols and applications.
  55
  56 Please see the "See Also" references for a thorough explanation
  57 of how to correctly use language tags.
  58
  59 =over
  60
  61 =cut
  62
  63 ###########################################################################
  64
  65 =item * the function is_language_tag($lang1)
  66
  67 Returns true iff $lang1 is a formally valid language tag.
  68
  69    is_language_tag("fr")            is TRUE
  70    is_language_tag("x-jicarilla")   is FALSE
  71        (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
  72
  73    is_language_tag("sgn-US")    is TRUE
  74        (That's American Sign Language)
  75
  76    is_language_tag("i-Klikitat")    is TRUE
  77        (True without regard to the fact noone has actually
  78         registered Klikitat -- it's a formally valid tag)
  79
  80    is_language_tag("fr-patois")     is TRUE
  81        (Formally valid -- altho descriptively weak!)
  82
  83    is_language_tag("Spanish")       is FALSE
  84    is_language_tag("french-patois") is FALSE
  85        (No good -- first subtag has to match
  86         /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
  87
  88    is_language_tag("x-borg-prot2532") is TRUE
  89        (Yes, subtags can contain digits, as of RFC3066)
  90
  91 =cut
  92
  93 sub is_language_tag {
  94
  95   ## Changes in the language tagging standards may have to be reflected here.
  96
  97   my($tag) = lc($_[0]);
  98
  99   return 0 if $tag eq "i" or $tag eq "x";
 100   # Bad degenerate cases that the following
 101   #  regexp would erroneously let pass
 102
 103   return $tag =~
 104     /^(?:  # First subtag
 105          [xi] | [a-z]{2,3}
 106       )
 107       (?:  # Subtags thereafter
 108          -           # separator
 109          [a-z0-9]{1,8}  # subtag
 110       )*
 111     $/xs ? 1 : 0;
 112 }
 113
 114 ###########################################################################
 115
 116 =item * the function extract_language_tags($whatever)
 117
 118 Returns a list of whatever looks like formally valid language tags
 119 in $whatever.  Not very smart, so don't get too creative with
 120 what you want to feed it.
 121
 122   extract_language_tags("fr, fr-ca, i-mingo")
 123     returns:   ('fr', 'fr-ca', 'i-mingo')
 124
 125   extract_language_tags("It's like this: I'm in fr -- French!")
 126     returns:   ('It', 'in', 'fr')
 127   (So don't just feed it any old thing.)
 128
 129 The output is untainted.  If you don't know what tainting is,
 130 don't worry about it.
 131
 132 =cut
 133
 134 sub extract_language_tags {
 135
 136   ## Changes in the language tagging standards may have to be reflected here.
 137
 138   my($text) =
 139     $_[0] =~ m/(.+)/  # to make for an untainted result
 140     ? $1 : ''
 141   ;
 142
 143   return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
 144     $text =~
 145     m/
 146       \b
 147       (?:  # First subtag
 148          [iIxX] | [a-zA-Z]{2,3}
 149       )
 150       (?:  # Subtags thereafter
 151          -           # separator
 152          [a-zA-Z0-9]{1,8}  # subtag
 153       )*
 154       \b
 155     /xsg
 156   );
 157 }
 158
 159 ###########################################################################
 160
 161 =item * the function same_language_tag($lang1, $lang2)
 162
 163 Returns true iff $lang1 and $lang2 are acceptable variant tags
 164 representing the same language-form.
 165
 166    same_language_tag('x-kadara', 'i-kadara')  is TRUE
 167       (The x/i- alternation doesn't matter)
 168    same_language_tag('X-KADARA', 'i-kadara')  is TRUE
 169       (...and neither does case)
 170    same_language_tag('en',       'en-US')     is FALSE
 171       (all-English is not the SAME as US English)
 172    same_language_tag('x-kadara', 'x-kadar')   is FALSE
 173       (these are totally unrelated tags)
 174    same_language_tag('no-bok',    'nb')       is TRUE
 175       (no-bok is a legacy tag for nb (Norwegian Bokmal))
 176
 177 C<same_language_tag> works by just seeing whether
 178 C<encode_language_tag($lang1)> is the same as
 179 C<encode_language_tag($lang2)>.
 180
 181 (Yes, I know this function is named a bit oddly.  Call it historic
 182 reasons.)
 183
 184 =cut
 185
 186 sub same_language_tag {
 187   my $el1 = &encode_language_tag($_[0]);
 188   return 0 unless defined $el1;
 189    # this avoids the problem of
 190    # encode_language_tag($lang1) eq and encode_language_tag($lang2)
 191    # being true if $lang1 and $lang2 are both undef
 192
 193   return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
 194 }
 195
 196 ###########################################################################
 197
 198 =item * the function similarity_language_tag($lang1, $lang2)
 199
 200 Returns an integer representing the degree of similarity between
 201 tags $lang1 and $lang2 (the order of which does not matter), where
 202 similarity is the number of common elements on the left,
 203 without regard to case and to x/i- alternation.
 204
 205    similarity_language_tag('fr', 'fr-ca')           is 1
 206       (one element in common)
 207    similarity_language_tag('fr-ca', 'fr-FR')        is 1
 208       (one element in common)
 209
 210    similarity_language_tag('fr-CA-joual',
 211                            'fr-CA-PEI')             is 2
 212    similarity_language_tag('fr-CA-joual', 'fr-CA')  is 2
 213       (two elements in common)
 214
 215    similarity_language_tag('x-kadara', 'i-kadara')  is 1
 216       (x/i- doesn't matter)
 217
 218    similarity_language_tag('en',       'x-kadar')   is 0
 219    similarity_language_tag('x-kadara', 'x-kadar')   is 0
 220       (unrelated tags -- no similarity)
 221
 222    similarity_language_tag('i-cree-syllabic',
 223                            'i-cherokee-syllabic')   is 0
 224       (no B<leftmost> elements in common!)
 225
 226 =cut
 227
 228 sub similarity_language_tag {
 229   my $lang1 = &encode_language_tag($_[0]);
 230   my $lang2 = &encode_language_tag($_[1]);
 231    # And encode_language_tag takes care of the whole
 232    #  no-nyn==nn, i-hakka==zh-hakka, etc, things
 233
 234   # NB: (i-sil-...)?  (i-sgn-...)?
 235
 236   return undef if !defined($lang1) and !defined($lang2);
 237   return 0 if !defined($lang1) or !defined($lang2);
 238
 239   my @l1_subtags = split('-', $lang1);
 240   my @l2_subtags = split('-', $lang2);
 241   my $similarity = 0;
 242
 243   while(@l1_subtags and @l2_subtags) {
 244     if(shift(@l1_subtags) eq shift(@l2_subtags)) {
 245       ++$similarity;
 246     } else {
 247       last;
 248     }
 249   }
 250   return $similarity;
 251 }
 252
 253 ###########################################################################
 254
 255 =item * the function is_dialect_of($lang1, $lang2)
 256
 257 Returns true iff language tag $lang1 represents a subform of
 258 language tag $lang2.
 259
 260 B<Get the order right!  It doesn't work the other way around!>
 261
 262    is_dialect_of('en-US', 'en')            is TRUE
 263      (American English IS a dialect of all-English)
 264
 265    is_dialect_of('fr-CA-joual', 'fr-CA')   is TRUE
 266    is_dialect_of('fr-CA-joual', 'fr')      is TRUE
 267      (Joual is a dialect of (a dialect of) French)
 268
 269    is_dialect_of('en', 'en-US')            is FALSE
 270      (all-English is a NOT dialect of American English)
 271
 272    is_dialect_of('fr', 'en-CA')            is FALSE
 273
 274    is_dialect_of('en',    'en'   )         is TRUE
 275    is_dialect_of('en-US', 'en-US')         is TRUE
 276      (B<Note:> these are degenerate cases)
 277
 278    is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
 279      (the x/i thing doesn't matter, nor does case)
 280
 281    is_dialect_of('nn', 'no')               is TRUE
 282      (because 'nn' (New Norse) is aliased to 'no-nyn',
 283       as a special legacy case, and 'no-nyn' is a
 284       subform of 'no' (Norwegian))
 285
 286 =cut
 287
 288 sub is_dialect_of {
 289
 290   my $lang1 = &encode_language_tag($_[0]);
 291   my $lang2 = &encode_language_tag($_[1]);
 292
 293   return undef if !defined($lang1) and !defined($lang2);
 294   return 0 if !defined($lang1) or !defined($lang2);
 295
 296   return 1 if $lang1 eq $lang2;
 297   return 0 if length($lang1) < length($lang2);
 298
 299   $lang1 .= '-';
 300   $lang2 .= '-';
 301   return
 302     (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
 303 }
 304
 305 ###########################################################################
 306
 307 =item * the function super_languages($lang1)
 308
 309 Returns a list of language tags that are superordinate tags to $lang1
 310 -- it gets this by removing subtags from the end of $lang1 until
 311 nothing (or just "i" or "x") is left.
 312
 313    super_languages("fr-CA-joual")  is  ("fr-CA", "fr")
 314
 315    super_languages("en-AU")  is  ("en")
 316
 317    super_languages("en")  is  empty-list, ()
 318
 319    super_languages("i-cherokee")  is  empty-list, ()
 320     ...not ("i"), which would be illegal as well as pointless.
 321
 322 If $lang1 is not a valid language tag, returns empty-list in
 323 a list context, undef in a scalar context.
 324
 325 A notable and rather unavoidable problem with this method:
 326 "x-mingo-tom" has an "x" because the whole tag isn't an
 327 IANA-registered tag -- but super_languages('x-mingo-tom') is
 328 ('x-mingo') -- which isn't really right, since 'i-mingo' is
 329 registered.  But this module has no way of knowing that.  (But note
 330 that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
 331
 332 More importantly, you assume I<at your peril> that superordinates of
 333 $lang1 are mutually intelligible with $lang1.  Consider this
 334 carefully.
 335
 336 =cut
 337
 338 sub super_languages {
 339   my $lang1 = $_[0];
 340   return() unless defined($lang1) && &is_language_tag($lang1);
 341
 342   # a hack for those annoying new (2001) tags:
 343   $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
 344   $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
 345   $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
 346    # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
 347
 348   my @l1_subtags = split('-', $lang1);
 349
 350   ## Changes in the language tagging standards may have to be reflected here.
 351
 352   # NB: (i-sil-...)?
 353
 354   my @supers = ();
 355   foreach my $bit (@l1_subtags) {
 356     push @supers,
 357       scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
 358   }
 359   pop @supers if @supers;
 360   shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
 361   return reverse @supers;
 362 }
 363
 364 ###########################################################################
 365
 366 =item * the function locale2language_tag($locale_identifier)
 367
 368 This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
 369 and maps it to a language tag.  If it's not mappable (as with,
 370 notably, "C" and "POSIX"), this returns empty-list in a list context,
 371 or undef in a scalar context.
 372
 373    locale2language_tag("en") is "en"
 374
 375    locale2language_tag("en_US") is "en-US"
 376
 377    locale2language_tag("en_US.ISO8859-1") is "en-US"
 378
 379    locale2language_tag("C") is undef or ()
 380
 381    locale2language_tag("POSIX") is undef or ()
 382
 383    locale2language_tag("POSIX") is undef or ()
 384
 385 I'm not totally sure that locale names map satisfactorily to language
 386 tags.  Think REAL hard about how you use this.  YOU HAVE BEEN WARNED.
 387
 388 The output is untainted.  If you don't know what tainting is,
 389 don't worry about it.
 390
 391 =cut
 392
 393 sub locale2language_tag {
 394   my $lang =
 395     $_[0] =~ m/(.+)/  # to make for an untainted result
 396     ? $1 : ''
 397   ;
 398
 399   return $lang if &is_language_tag($lang); # like "en"
 400
 401   $lang =~ tr<_><->;  # "en_US" -> en-US
 402   $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s;  # "en_US.ISO8859-1" -> en-US
 403
 404   return $lang if &is_language_tag($lang);
 405
 406   return;
 407 }
 408
 409 ###########################################################################
 410
 411 =item * the function encode_language_tag($lang1)
 412
 413 This function, if given a language tag, returns an encoding of it such
 414 that:
 415
 416 * tags representing different languages never get the same encoding.
 417
 418 * tags representing the same language always get the same encoding.
 419
 420 * an encoding of a formally valid language tag always is a string
 421 value that is defined, has length, and is true if considered as a
 422 boolean.
 423
 424 Note that the encoding itself is B<not> a formally valid language tag.
 425 Note also that you cannot, currently, go from an encoding back to a
 426 language tag that it's an encoding of.
 427
 428 Note also that you B<must> consider the encoded value as atomic; i.e.,
 429 you should not consider it as anything but an opaque, unanalysable
 430 string value.  (The internals of the encoding method may change in
 431 future versions, as the language tagging standard changes over time.)
 432
 433 C<encode_language_tag> returns undef if given anything other than a
 434 formally valid language tag.
 435
 436 The reason C<encode_language_tag> exists is because different language
 437 tags may represent the same language; this is normally treatable with
 438 C<same_language_tag>, but consider this situation:
 439
 440 You have a data file that expresses greetings in different languages.
 441 Its format is "[language tag]=[how to say 'Hello']", like:
 442
 443           en-US=Hiho
 444           fr=Bonjour
 445           i-mingo=Hau'
 446
 447 And suppose you write a program that reads that file and then runs as
 448 a daemon, answering client requests that specify a language tag and
 449 then expect the string that says how to greet in that language.  So an
 450 interaction looks like:
 451
 452           greeting-client asks:    fr
 453           greeting-server answers: Bonjour
 454
 455 So far so good.  But suppose the way you're implementing this is:
 456
 457           my %greetings;
 458           die unless open(IN, "<in.dat");
 459           while(<IN>) {
 460             chomp;
 461             next unless /^([^=]+)=(.+)/s;
 462             my($lang, $expr) = ($1, $2);
 463             $greetings{$lang} = $expr;
 464           }
 465           close(IN);
 466
 467 at which point %greetings has the contents:
 468
 469           "en-US"   => "Hiho"
 470           "fr"      => "Bonjour"
 471           "i-mingo" => "Hau'"
 472
 473 And suppose then that you answer client requests for language $wanted
 474 by just looking up $greetings{$wanted}.
 475
 476 If the client asks for "fr", that will look up successfully in
 477 %greetings, to the value "Bonjour".  And if the client asks for
 478 "i-mingo", that will look up successfully in %greetings, to the value
 479 "Hau'".
 480
 481 But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
 482 lookup in %greetings fails.  That's the Wrong Thing.
 483
 484 You could instead do lookups on $wanted with:
 485
 486           use I18N::LangTags qw(same_language_tag);
 487           my $repsonse = '';
 488           foreach my $l2 (keys %greetings) {
 489             if(same_language_tag($wanted, $l2)) {
 490               $response = $greetings{$l2};
 491               last;
 492             }
 493           }
 494
 495 But that's rather inefficient.  A better way to do it is to start your
 496 program with:
 497
 498           use I18N::LangTags qw(encode_language_tag);
 499           my %greetings;
 500           die unless open(IN, "<in.dat");
 501           while(<IN>) {
 502             chomp;
 503             next unless /^([^=]+)=(.+)/s;
 504             my($lang, $expr) = ($1, $2);
 505             $greetings{
 506                         encode_language_tag($lang)
 507                       } = $expr;
 508           }
 509           close(IN);
 510
 511 and then just answer client requests for language $wanted by just
 512 looking up
 513
 514           $greetings{encode_language_tag($wanted)}
 515
 516 And that does the Right Thing.
 517
 518 =cut
 519
 520 sub encode_language_tag {
 521   # Only similarity_language_tag() is allowed to analyse encodings!
 522
 523   ## Changes in the language tagging standards may have to be reflected here.
 524
 525   my($tag) = $_[0] || return undef;
 526   return undef unless &is_language_tag($tag);
 527
 528   # For the moment, these legacy variances are few enough that
 529   #  we can just handle them here with regexps.
 530   $tag =~ s/^iw\b/he/i; # Hebrew
 531   $tag =~ s/^in\b/id/i; # Indonesian
 532   $tag =~ s/^cre\b/cr/i; # Cree
 533   $tag =~ s/^jw\b/jv/i; # Javanese
 534   $tag =~ s/^[ix]-lux\b/lb/i;  # Luxemburger
 535   $tag =~ s/^[ix]-navajo\b/nv/i;  # Navajo
 536   $tag =~ s/^ji\b/yi/i;  # Yiddish
 537   # SMB 2003 -- Hm.  There's a bunch of new XXX->YY variances now,
 538   #  but maybe they're all so obscure I can ignore them.   "Obscure"
 539   #  meaning either that the language is obscure, and/or that the
 540   #  XXX form was extant so briefly that it's unlikely it was ever
 541   #  used.  I hope.
 542   #
 543   # These go FROM the simplex to complex form, to get
 544   #  similarity-comparison right.  And that's okay, since
 545   #  similarity_language_tag is the only thing that
 546   #  analyzes our output.
 547   $tag =~ s/^[ix]-hakka\b/zh-hakka/i;  # Hakka
 548   $tag =~ s/^nb\b/no-bok/i;  # BACKWARDS for Bokmal
 549   $tag =~ s/^nn\b/no-nyn/i;  # BACKWARDS for Nynorsk
 550
 551   $tag =~ s/^[xiXI]-//s;
 552    # Just lop off any leading "x/i-"
 553
 554   return "~" . uc($tag);
 555 }
 556
 557 #--------------------------------------------------------------------------
 558
 559 =item * the function alternate_language_tags($lang1)
 560
 561 This function, if given a language tag, returns all language tags that
 562 are alternate forms of this language tag.  (I.e., tags which refer to
 563 the same language.)  This is meant to handle legacy tags caused by
 564 the minor changes in language tag standards over the years; and
 565 the x-/i- alternation is also dealt with.
 566
 567 Note that this function does I<not> try to equate new (and never-used,
 568 and unusable)
 569 ISO639-2 three-letter tags to old (and still in use) ISO639-1
 570 two-letter equivalents -- like "ara" -> "ar" -- because
 571 "ara" has I<never> been in use as an Internet language tag,
 572 and RFC 3066 stipulates that it never should be, since a shorter
 573 tag ("ar") exists.
 574
 575 Examples:
 576
 577           alternate_language_tags('no-bok')       is ('nb')
 578           alternate_language_tags('nb')           is ('no-bok')
 579           alternate_language_tags('he')           is ('iw')
 580           alternate_language_tags('iw')           is ('he')
 581           alternate_language_tags('i-hakka')      is ('zh-hakka', 'x-hakka')
 582           alternate_language_tags('zh-hakka')     is ('i-hakka', 'x-hakka')
 583           alternate_language_tags('en')           is ()
 584           alternate_language_tags('x-mingo-tom')  is ('i-mingo-tom')
 585           alternate_language_tags('x-klikitat')   is ('i-klikitat')
 586           alternate_language_tags('i-klikitat')   is ('x-klikitat')
 587
 588 This function returns empty-list if given anything other than a formally
 589 valid language tag.
 590
 591 =cut
 592
 593 my %alt = qw( i x   x i   I X   X I );
 594 sub alternate_language_tags {
 595   my $tag = $_[0];
 596   return() unless &is_language_tag($tag);
 597
 598   my @em; # push 'em real goood!
 599
 600   # For the moment, these legacy variances are few enough that
 601   #  we can just handle them here with regexps.
 602
 603   if(     $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
 604   } elsif($tag =~ m/^zh-hakka\b(.*)/i) {  push @em, "x-hakka$1", "i-hakka$1";
 605
 606   } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
 607   } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
 608
 609   } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
 610   } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
 611
 612   } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
 613   } elsif($tag =~ m/^lb\b(.*)/i) {       push @em, "i-lux$1", "x-lux$1";
 614
 615   } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
 616   } elsif($tag =~ m/^nv\b(.*)/i) {          push @em, "i-navajo$1", "x-navajo$1";
 617
 618   } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
 619   } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
 620
 621   } elsif($tag =~ m/^nb\b(.*)/i) {     push @em, "no-bok$1";
 622   } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
 623
 624   } elsif($tag =~ m/^nn\b(.*)/i) {     push @em, "no-nyn$1";
 625   } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
 626   }
 627
 628   push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
 629   return @em;
 630 }
 631
 632 ###########################################################################
 633
 634 {
 635   # Init %Panic...
 636
 637   my @panic = (  # MUST all be lowercase!
 638    # Only large ("national") languages make it in this list.
 639    #  If you, as a user, are so bizarre that the /only/ language
 640    #  you claim to accept is Galician, then no, we won't do you
 641    #  the favor of providing Catalan as a panic-fallback for
 642    #  you.  Because if I start trying to add "little languages" in
 643    #  here, I'll just go crazy.
 644
 645    # Scandinavian lgs.  All based on opinion and hearsay.
 646    'sv' => [qw(nb no da nn)],
 647    'da' => [qw(nb no sv nn)], # I guess
 648    [qw(no nn nb)], [qw(no nn nb sv da)],
 649    'is' => [qw(da sv no nb nn)],
 650    'fo' => [qw(da is no nb nn sv)], # I guess
 651
 652    # I think this is about the extent of tolerable intelligibility
 653    #  among large modern Romance languages.
 654    'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
 655    'ca' => [qw(es pt it fr)],
 656    'es' => [qw(ca it fr pt)],
 657    'it' => [qw(es fr ca pt)],
 658    'fr' => [qw(es it ca pt)],
 659
 660    # Also assume that speakers of the main Indian languages prefer
 661    #  to read/hear Hindi over English
 662    [qw(
 663      as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
 664    )] => 'hi',
 665     # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
 666     # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
 667     # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
 668    'hi' => [qw(bn pa as or)],
 669    # I welcome finer data for the other Indian languages.
 670    #  E.g., what should Oriya's list be, besides just Hindi?
 671
 672    # And the panic languages for English is, of course, nil!
 673
 674    # My guesses at Slavic intelligibility:
 675    ([qw(ru be uk)]) x 2,  # Russian, Belarusian, Ukranian
 676    'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
 677    'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
 678
 679    'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
 680
 681    'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
 682
 683    #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
 684
 685   );
 686   my($k,$v);
 687   while(@panic) {
 688     ($k,$v) = splice(@panic,0,2);
 689     foreach my $k (ref($k) ? @$k : $k) {
 690       foreach my $v (ref($v) ? @$v : $v) {
 691         push @{$Panic{$k} ||= []}, $v unless $k eq $v;
 692       }
 693     }
 694   }
 695 }
 696
 697 =item * the function @langs = panic_languages(@accept_languages)
 698
 699 This function takes a list of 0 or more language
 700 tags that constitute a given user's Accept-Language list, and
 701 returns a list of tags for I<other> (non-super)
 702 languages that are probably acceptable to the user, to be
 703 used I<if all else fails>.
 704
 705 For example, if a user accepts only 'ca' (Catalan) and
 706 'es' (Spanish), and the documents/interfaces you have
 707 available are just in German, Italian, and Chinese, then
 708 the user will most likely want the Italian one (and not
 709 the Chinese or German one!), instead of getting
 710 nothing.  So C<panic_languages('ca', 'es')> returns
 711 a list containing 'it' (Italian).
 712
 713 English ('en') is I<always> in the return list, but
 714 whether it's at the very end or not depends
 715 on the input languages.  This function works by consulting
 716 an internal table that stipulates what common
 717 languages are "close" to each other.
 718
 719 A useful construct you might consider using is:
 720
 721   @fallbacks = super_languages(@accept_languages);
 722   push @fallbacks, panic_languages(
 723     @accept_languages, @fallbacks,
 724   );
 725
 726 =cut
 727
 728 sub panic_languages {
 729   # When in panic or in doubt, run in circles, scream, and shout!
 730   my(@out, %seen);
 731   foreach my $t (@_) {
 732     next unless $t;
 733     next if $seen{$t}++; # so we don't return it or hit it again
 734     # push @out, super_languages($t); # nah, keep that separate
 735     push @out, @{ $Panic{lc $t} || next };
 736   }
 737   return grep !$seen{$_}++,  @out, 'en';
 738 }
 739
 740 ###########################################################################
 741 1;
 742 __END__
 743
 744 =back
 745
 746 =head1 ABOUT LOWERCASING
 747
 748 I've considered making all the above functions that output language
 749 tags return all those tags strictly in lowercase.  Having all your
 750 language tags in lowercase does make some things easier.  But you
 751 might as well just lowercase as you like, or call
 752 C<encode_language_tag($lang1)> where appropriate.
 753
 754 =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
 755
 756 In some future version of I18N::LangTags, I plan to include support
 757 for RFC2482-style language tags -- which are basically just normal
 758 language tags with their ASCII characters shifted into Plane 14.
 759
 760 =head1 SEE ALSO
 761
 762 * L<I18N::LangTags::List|I18N::LangTags::List>
 763
 764 * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
 765 Identification of Languages".  (Obsoletes RFC 1766)
 766
 767 * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
 768 Character Sets and Languages".
 769
 770 * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
 771 Value and Encoded Word Extensions: Character Sets, Languages, and
 772 Continuations".
 773
 774 * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
 775 "Language Tagging in Unicode Plain Text".
 776
 777 * Locale::Codes, in
 778 C<http://www.perl.com/CPAN/modules/by-module/Locale/>
 779
 780 * ISO 639-2, "Codes for the representation of names of languages",
 781 including two-letter and three-letter codes,
 782 C<http://www.loc.gov/standards/iso639-2/langcodes.html>
 783
 784 * The IANA list of registered languages (hopefully up-to-date),
 785 C<http://www.iana.org/assignments/language-tags>
 786
 787 =head1 COPYRIGHT
 788
 789 Copyright (c) 1998-2003 Sean M. Burke. All rights reserved.
 790
 791 This library is free software; you can redistribute it and/or
 792 modify it under the same terms as Perl itself.
 793
 794 The programs and documentation in this dist are distributed in
 795 the hope that they will be useful, but without any warranty; without
 796 even the implied warranty of merchantability or fitness for a
 797 particular purpose.
 798
 799 =head1 AUTHOR
 800
 801 Sean M. Burke C<sburke@cpan.org>
 802
 803 =cut
 804