Temporary fix for usemallocwrap problems on IRIX (was Re: usemallocwrap problems...
[p5sagit/p5-mst-13.2.git] / lib / I18N / LangTags.pm
CommitLineData
4b053158 1
66ee8a1c 2# Time-stamp: "2004-07-01 14:34:40 ADT"
4b053158 3# Sean M. Burke <sburke@cpan.org>
4
5require 5.000;
6package I18N::LangTags;
7use strict;
21aeefd5 8use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
4b053158 9require Exporter;
4b053158 10@ISA = qw(Exporter);
11@EXPORT = qw();
12@EXPORT_OK = qw(is_language_tag same_language_tag
13 extract_language_tags super_languages
14 similarity_language_tag is_dialect_of
15 locale2language_tag alternate_language_tags
21aeefd5 16 encode_language_tag panic_languages
8000a3fa 17 implicate_supers
18 implicate_supers_strictly
4b053158 19 );
21aeefd5 20%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
4b053158 21
66ee8a1c 22$VERSION = "0.33";
8000a3fa 23
24sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
25
4b053158 26
27=head1 NAME
28
29I18N::LangTags - functions for dealing with RFC3066-style language tags
30
31=head1 SYNOPSIS
32
8000a3fa 33 use I18N::LangTags();
34
35...or specify whichever of those functions you want to import, like so:
4b053158 36
8000a3fa 37 use I18N::LangTags qw(implicate_supers similarity_language_tag);
38
39All the exportable functions are listed below -- you're free to import
40only some, or none at all. By default, none are imported. If you
41say:
21aeefd5 42
43 use I18N::LangTags qw(:ALL)
44
45...then all are exported. (This saves you from having to use
46something less obvious like C<use I18N::LangTags qw(/./)>.)
4b053158 47
48If you don't import any of these functions, assume a C<&I18N::LangTags::>
49in front of all the function names in the following examples.
50
51=head1 DESCRIPTION
52
53Language tags are a formalism, described in RFC 3066 (obsoleting
541766), for declaring what language form (language and possibly
55dialect) a given chunk of information is in.
56
57This library provides functions for common tasks involving language
58tags as they are needed in a variety of protocols and applications.
59
60Please see the "See Also" references for a thorough explanation
61of how to correctly use language tags.
62
63=over
64
65=cut
66
67###########################################################################
68
69=item * the function is_language_tag($lang1)
70
71Returns true iff $lang1 is a formally valid language tag.
72
73 is_language_tag("fr") is TRUE
74 is_language_tag("x-jicarilla") is FALSE
75 (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
76
77 is_language_tag("sgn-US") is TRUE
78 (That's American Sign Language)
79
80 is_language_tag("i-Klikitat") is TRUE
81 (True without regard to the fact noone has actually
82 registered Klikitat -- it's a formally valid tag)
83
84 is_language_tag("fr-patois") is TRUE
85 (Formally valid -- altho descriptively weak!)
86
87 is_language_tag("Spanish") is FALSE
88 is_language_tag("french-patois") is FALSE
89 (No good -- first subtag has to match
90 /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
91
92 is_language_tag("x-borg-prot2532") is TRUE
93 (Yes, subtags can contain digits, as of RFC3066)
94
95=cut
96
97sub is_language_tag {
98
99 ## Changes in the language tagging standards may have to be reflected here.
100
101 my($tag) = lc($_[0]);
102
103 return 0 if $tag eq "i" or $tag eq "x";
21aeefd5 104 # Bad degenerate cases that the following
4b053158 105 # regexp would erroneously let pass
106
107 return $tag =~
108 /^(?: # First subtag
109 [xi] | [a-z]{2,3}
110 )
111 (?: # Subtags thereafter
112 - # separator
113 [a-z0-9]{1,8} # subtag
114 )*
115 $/xs ? 1 : 0;
116}
117
118###########################################################################
119
120=item * the function extract_language_tags($whatever)
121
122Returns a list of whatever looks like formally valid language tags
123in $whatever. Not very smart, so don't get too creative with
124what you want to feed it.
125
126 extract_language_tags("fr, fr-ca, i-mingo")
127 returns: ('fr', 'fr-ca', 'i-mingo')
128
129 extract_language_tags("It's like this: I'm in fr -- French!")
130 returns: ('It', 'in', 'fr')
131 (So don't just feed it any old thing.)
132
133The output is untainted. If you don't know what tainting is,
134don't worry about it.
135
136=cut
137
138sub extract_language_tags {
139
140 ## Changes in the language tagging standards may have to be reflected here.
141
142 my($text) =
143 $_[0] =~ m/(.+)/ # to make for an untainted result
144 ? $1 : ''
145 ;
146
147 return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
148 $text =~
149 m/
150 \b
151 (?: # First subtag
152 [iIxX] | [a-zA-Z]{2,3}
153 )
154 (?: # Subtags thereafter
155 - # separator
156 [a-zA-Z0-9]{1,8} # subtag
157 )*
158 \b
159 /xsg
160 );
161}
162
163###########################################################################
164
165=item * the function same_language_tag($lang1, $lang2)
166
167Returns true iff $lang1 and $lang2 are acceptable variant tags
168representing the same language-form.
169
170 same_language_tag('x-kadara', 'i-kadara') is TRUE
171 (The x/i- alternation doesn't matter)
172 same_language_tag('X-KADARA', 'i-kadara') is TRUE
173 (...and neither does case)
174 same_language_tag('en', 'en-US') is FALSE
175 (all-English is not the SAME as US English)
176 same_language_tag('x-kadara', 'x-kadar') is FALSE
177 (these are totally unrelated tags)
21aeefd5 178 same_language_tag('no-bok', 'nb') is TRUE
179 (no-bok is a legacy tag for nb (Norwegian Bokmal))
4b053158 180
181C<same_language_tag> works by just seeing whether
182C<encode_language_tag($lang1)> is the same as
183C<encode_language_tag($lang2)>.
184
185(Yes, I know this function is named a bit oddly. Call it historic
186reasons.)
187
188=cut
189
190sub same_language_tag {
191 my $el1 = &encode_language_tag($_[0]);
192 return 0 unless defined $el1;
193 # this avoids the problem of
194 # encode_language_tag($lang1) eq and encode_language_tag($lang2)
195 # being true if $lang1 and $lang2 are both undef
196
197 return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
198}
199
200###########################################################################
201
202=item * the function similarity_language_tag($lang1, $lang2)
203
204Returns an integer representing the degree of similarity between
205tags $lang1 and $lang2 (the order of which does not matter), where
206similarity is the number of common elements on the left,
207without regard to case and to x/i- alternation.
208
209 similarity_language_tag('fr', 'fr-ca') is 1
210 (one element in common)
211 similarity_language_tag('fr-ca', 'fr-FR') is 1
212 (one element in common)
213
214 similarity_language_tag('fr-CA-joual',
215 'fr-CA-PEI') is 2
216 similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
217 (two elements in common)
218
219 similarity_language_tag('x-kadara', 'i-kadara') is 1
220 (x/i- doesn't matter)
221
222 similarity_language_tag('en', 'x-kadar') is 0
223 similarity_language_tag('x-kadara', 'x-kadar') is 0
224 (unrelated tags -- no similarity)
225
226 similarity_language_tag('i-cree-syllabic',
227 'i-cherokee-syllabic') is 0
228 (no B<leftmost> elements in common!)
229
230=cut
231
232sub similarity_language_tag {
233 my $lang1 = &encode_language_tag($_[0]);
234 my $lang2 = &encode_language_tag($_[1]);
21aeefd5 235 # And encode_language_tag takes care of the whole
236 # no-nyn==nn, i-hakka==zh-hakka, etc, things
237
4b053158 238 # NB: (i-sil-...)? (i-sgn-...)?
239
240 return undef if !defined($lang1) and !defined($lang2);
241 return 0 if !defined($lang1) or !defined($lang2);
242
243 my @l1_subtags = split('-', $lang1);
244 my @l2_subtags = split('-', $lang2);
245 my $similarity = 0;
246
247 while(@l1_subtags and @l2_subtags) {
248 if(shift(@l1_subtags) eq shift(@l2_subtags)) {
249 ++$similarity;
250 } else {
251 last;
252 }
253 }
254 return $similarity;
255}
256
257###########################################################################
258
259=item * the function is_dialect_of($lang1, $lang2)
260
21aeefd5 261Returns true iff language tag $lang1 represents a subform of
4b053158 262language tag $lang2.
263
264B<Get the order right! It doesn't work the other way around!>
265
266 is_dialect_of('en-US', 'en') is TRUE
267 (American English IS a dialect of all-English)
268
269 is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
270 is_dialect_of('fr-CA-joual', 'fr') is TRUE
271 (Joual is a dialect of (a dialect of) French)
272
273 is_dialect_of('en', 'en-US') is FALSE
274 (all-English is a NOT dialect of American English)
275
276 is_dialect_of('fr', 'en-CA') is FALSE
277
21aeefd5 278 is_dialect_of('en', 'en' ) is TRUE
279 is_dialect_of('en-US', 'en-US') is TRUE
4b053158 280 (B<Note:> these are degenerate cases)
281
282 is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
283 (the x/i thing doesn't matter, nor does case)
284
21aeefd5 285 is_dialect_of('nn', 'no') is TRUE
286 (because 'nn' (New Norse) is aliased to 'no-nyn',
287 as a special legacy case, and 'no-nyn' is a
288 subform of 'no' (Norwegian))
289
4b053158 290=cut
291
292sub is_dialect_of {
293
294 my $lang1 = &encode_language_tag($_[0]);
295 my $lang2 = &encode_language_tag($_[1]);
296
297 return undef if !defined($lang1) and !defined($lang2);
298 return 0 if !defined($lang1) or !defined($lang2);
299
300 return 1 if $lang1 eq $lang2;
301 return 0 if length($lang1) < length($lang2);
302
303 $lang1 .= '-';
304 $lang2 .= '-';
305 return
306 (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
307}
308
309###########################################################################
310
311=item * the function super_languages($lang1)
312
313Returns a list of language tags that are superordinate tags to $lang1
314-- it gets this by removing subtags from the end of $lang1 until
315nothing (or just "i" or "x") is left.
316
317 super_languages("fr-CA-joual") is ("fr-CA", "fr")
318
319 super_languages("en-AU") is ("en")
320
321 super_languages("en") is empty-list, ()
322
323 super_languages("i-cherokee") is empty-list, ()
324 ...not ("i"), which would be illegal as well as pointless.
325
326If $lang1 is not a valid language tag, returns empty-list in
327a list context, undef in a scalar context.
328
329A notable and rather unavoidable problem with this method:
330"x-mingo-tom" has an "x" because the whole tag isn't an
331IANA-registered tag -- but super_languages('x-mingo-tom') is
332('x-mingo') -- which isn't really right, since 'i-mingo' is
333registered. But this module has no way of knowing that. (But note
334that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
335
336More importantly, you assume I<at your peril> that superordinates of
337$lang1 are mutually intelligible with $lang1. Consider this
338carefully.
339
8000a3fa 340=cut
4b053158 341
342sub super_languages {
343 my $lang1 = $_[0];
344 return() unless defined($lang1) && &is_language_tag($lang1);
21aeefd5 345
346 # a hack for those annoying new (2001) tags:
347 $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
348 $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
349 $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
350 # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
351
4b053158 352 my @l1_subtags = split('-', $lang1);
353
354 ## Changes in the language tagging standards may have to be reflected here.
355
356 # NB: (i-sil-...)?
357
358 my @supers = ();
359 foreach my $bit (@l1_subtags) {
360 push @supers,
361 scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
362 }
363 pop @supers if @supers;
364 shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
365 return reverse @supers;
366}
367
368###########################################################################
369
370=item * the function locale2language_tag($locale_identifier)
371
372This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
373and maps it to a language tag. If it's not mappable (as with,
374notably, "C" and "POSIX"), this returns empty-list in a list context,
375or undef in a scalar context.
376
377 locale2language_tag("en") is "en"
378
379 locale2language_tag("en_US") is "en-US"
380
381 locale2language_tag("en_US.ISO8859-1") is "en-US"
382
383 locale2language_tag("C") is undef or ()
384
385 locale2language_tag("POSIX") is undef or ()
386
387 locale2language_tag("POSIX") is undef or ()
388
389I'm not totally sure that locale names map satisfactorily to language
390tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
391
392The output is untainted. If you don't know what tainting is,
393don't worry about it.
394
8000a3fa 395=cut
4b053158 396
397sub locale2language_tag {
398 my $lang =
399 $_[0] =~ m/(.+)/ # to make for an untainted result
400 ? $1 : ''
401 ;
402
403 return $lang if &is_language_tag($lang); # like "en"
404
405 $lang =~ tr<_><->; # "en_US" -> en-US
406 $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US
407
408 return $lang if &is_language_tag($lang);
409
410 return;
411}
412
413###########################################################################
414
415=item * the function encode_language_tag($lang1)
416
417This function, if given a language tag, returns an encoding of it such
418that:
419
420* tags representing different languages never get the same encoding.
421
422* tags representing the same language always get the same encoding.
423
424* an encoding of a formally valid language tag always is a string
425value that is defined, has length, and is true if considered as a
426boolean.
427
428Note that the encoding itself is B<not> a formally valid language tag.
429Note also that you cannot, currently, go from an encoding back to a
430language tag that it's an encoding of.
431
432Note also that you B<must> consider the encoded value as atomic; i.e.,
433you should not consider it as anything but an opaque, unanalysable
434string value. (The internals of the encoding method may change in
435future versions, as the language tagging standard changes over time.)
436
437C<encode_language_tag> returns undef if given anything other than a
438formally valid language tag.
439
440The reason C<encode_language_tag> exists is because different language
441tags may represent the same language; this is normally treatable with
442C<same_language_tag>, but consider this situation:
443
444You have a data file that expresses greetings in different languages.
445Its format is "[language tag]=[how to say 'Hello']", like:
446
447 en-US=Hiho
448 fr=Bonjour
449 i-mingo=Hau'
450
451And suppose you write a program that reads that file and then runs as
452a daemon, answering client requests that specify a language tag and
453then expect the string that says how to greet in that language. So an
454interaction looks like:
455
456 greeting-client asks: fr
457 greeting-server answers: Bonjour
458
459So far so good. But suppose the way you're implementing this is:
460
461 my %greetings;
462 die unless open(IN, "<in.dat");
463 while(<IN>) {
464 chomp;
465 next unless /^([^=]+)=(.+)/s;
466 my($lang, $expr) = ($1, $2);
467 $greetings{$lang} = $expr;
468 }
469 close(IN);
470
471at which point %greetings has the contents:
472
473 "en-US" => "Hiho"
474 "fr" => "Bonjour"
475 "i-mingo" => "Hau'"
476
477And suppose then that you answer client requests for language $wanted
478by just looking up $greetings{$wanted}.
479
480If the client asks for "fr", that will look up successfully in
481%greetings, to the value "Bonjour". And if the client asks for
482"i-mingo", that will look up successfully in %greetings, to the value
483"Hau'".
484
485But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
486lookup in %greetings fails. That's the Wrong Thing.
487
488You could instead do lookups on $wanted with:
489
490 use I18N::LangTags qw(same_language_tag);
491 my $repsonse = '';
492 foreach my $l2 (keys %greetings) {
493 if(same_language_tag($wanted, $l2)) {
494 $response = $greetings{$l2};
495 last;
496 }
497 }
498
499But that's rather inefficient. A better way to do it is to start your
500program with:
501
502 use I18N::LangTags qw(encode_language_tag);
503 my %greetings;
504 die unless open(IN, "<in.dat");
505 while(<IN>) {
506 chomp;
507 next unless /^([^=]+)=(.+)/s;
508 my($lang, $expr) = ($1, $2);
509 $greetings{
510 encode_language_tag($lang)
511 } = $expr;
512 }
513 close(IN);
514
515and then just answer client requests for language $wanted by just
516looking up
517
518 $greetings{encode_language_tag($wanted)}
519
520And that does the Right Thing.
521
522=cut
523
524sub encode_language_tag {
525 # Only similarity_language_tag() is allowed to analyse encodings!
526
527 ## Changes in the language tagging standards may have to be reflected here.
528
21aeefd5 529 my($tag) = $_[0] || return undef;
4b053158 530 return undef unless &is_language_tag($tag);
21aeefd5 531
532 # For the moment, these legacy variances are few enough that
533 # we can just handle them here with regexps.
534 $tag =~ s/^iw\b/he/i; # Hebrew
535 $tag =~ s/^in\b/id/i; # Indonesian
aaf52a42 536 $tag =~ s/^cre\b/cr/i; # Cree
537 $tag =~ s/^jw\b/jv/i; # Javanese
21aeefd5 538 $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger
539 $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo
540 $tag =~ s/^ji\b/yi/i; # Yiddish
aaf52a42 541 # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now,
542 # but maybe they're all so obscure I can ignore them. "Obscure"
543 # meaning either that the language is obscure, and/or that the
544 # XXX form was extant so briefly that it's unlikely it was ever
545 # used. I hope.
21aeefd5 546 #
547 # These go FROM the simplex to complex form, to get
548 # similarity-comparison right. And that's okay, since
549 # similarity_language_tag is the only thing that
550 # analyzes our output.
551 $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka
552 $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal
553 $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk
4b053158 554
555 $tag =~ s/^[xiXI]-//s;
556 # Just lop off any leading "x/i-"
4b053158 557
21aeefd5 558 return "~" . uc($tag);
4b053158 559}
560
561#--------------------------------------------------------------------------
562
563=item * the function alternate_language_tags($lang1)
564
565This function, if given a language tag, returns all language tags that
21aeefd5 566are alternate forms of this language tag. (I.e., tags which refer to
567the same language.) This is meant to handle legacy tags caused by
568the minor changes in language tag standards over the years; and
569the x-/i- alternation is also dealt with.
570
571Note that this function does I<not> try to equate new (and never-used,
572and unusable)
573ISO639-2 three-letter tags to old (and still in use) ISO639-1
574two-letter equivalents -- like "ara" -> "ar" -- because
575"ara" has I<never> been in use as an Internet language tag,
576and RFC 3066 stipulates that it never should be, since a shorter
577tag ("ar") exists.
578
579Examples:
580
581 alternate_language_tags('no-bok') is ('nb')
582 alternate_language_tags('nb') is ('no-bok')
583 alternate_language_tags('he') is ('iw')
584 alternate_language_tags('iw') is ('he')
585 alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka')
586 alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka')
587 alternate_language_tags('en') is ()
588 alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
589 alternate_language_tags('x-klikitat') is ('i-klikitat')
590 alternate_language_tags('i-klikitat') is ('x-klikitat')
591
592This function returns empty-list if given anything other than a formally
4b053158 593valid language tag.
594
595=cut
596
597my %alt = qw( i x x i I X X I );
598sub alternate_language_tags {
4b053158 599 my $tag = $_[0];
600 return() unless &is_language_tag($tag);
601
21aeefd5 602 my @em; # push 'em real goood!
603
604 # For the moment, these legacy variances are few enough that
605 # we can just handle them here with regexps.
606
607 if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
608 } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1";
609
610 } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
611 } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
612
613 } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
614 } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
615
616 } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
617 } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1";
618
619 } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
620 } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1";
621
622 } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
623 } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
624
625 } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1";
626 } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
627
628 } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1";
629 } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
630 }
631
632 push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
633 return @em;
634}
635
636###########################################################################
637
638{
639 # Init %Panic...
640
641 my @panic = ( # MUST all be lowercase!
642 # Only large ("national") languages make it in this list.
643 # If you, as a user, are so bizarre that the /only/ language
644 # you claim to accept is Galician, then no, we won't do you
645 # the favor of providing Catalan as a panic-fallback for
646 # you. Because if I start trying to add "little languages" in
647 # here, I'll just go crazy.
648
4cf5bee0 649 # Scandinavian lgs. All based on opinion and hearsay.
650 'sv' => [qw(nb no da nn)],
651 'da' => [qw(nb no sv nn)], # I guess
652 [qw(no nn nb)], [qw(no nn nb sv da)],
653 'is' => [qw(da sv no nb nn)],
654 'fo' => [qw(da is no nb nn sv)], # I guess
21aeefd5 655
656 # I think this is about the extent of tolerable intelligibility
657 # among large modern Romance languages.
658 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
659 'ca' => [qw(es pt it fr)],
660 'es' => [qw(ca it fr pt)],
661 'it' => [qw(es fr ca pt)],
662 'fr' => [qw(es it ca pt)],
663
664 # Also assume that speakers of the main Indian languages prefer
665 # to read/hear Hindi over English
666 [qw(
667 as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
668 )] => 'hi',
669 # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
670 # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
671 # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
672 'hi' => [qw(bn pa as or)],
673 # I welcome finer data for the other Indian languages.
674 # E.g., what should Oriya's list be, besides just Hindi?
675
676 # And the panic languages for English is, of course, nil!
677
678 # My guesses at Slavic intelligibility:
679 ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian
680 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
681 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
682
683 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
684
685 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
686
687 #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
688
689 );
690 my($k,$v);
691 while(@panic) {
692 ($k,$v) = splice(@panic,0,2);
693 foreach my $k (ref($k) ? @$k : $k) {
694 foreach my $v (ref($v) ? @$v : $v) {
695 push @{$Panic{$k} ||= []}, $v unless $k eq $v;
696 }
697 }
698 }
699}
700
701=item * the function @langs = panic_languages(@accept_languages)
702
703This function takes a list of 0 or more language
704tags that constitute a given user's Accept-Language list, and
705returns a list of tags for I<other> (non-super)
706languages that are probably acceptable to the user, to be
707used I<if all else fails>.
708
709For example, if a user accepts only 'ca' (Catalan) and
710'es' (Spanish), and the documents/interfaces you have
711available are just in German, Italian, and Chinese, then
712the user will most likely want the Italian one (and not
713the Chinese or German one!), instead of getting
714nothing. So C<panic_languages('ca', 'es')> returns
715a list containing 'it' (Italian).
716
717English ('en') is I<always> in the return list, but
718whether it's at the very end or not depends
719on the input languages. This function works by consulting
720an internal table that stipulates what common
721languages are "close" to each other.
722
723A useful construct you might consider using is:
724
725 @fallbacks = super_languages(@accept_languages);
726 push @fallbacks, panic_languages(
727 @accept_languages, @fallbacks,
728 );
729
730=cut
4b053158 731
21aeefd5 732sub panic_languages {
733 # When in panic or in doubt, run in circles, scream, and shout!
734 my(@out, %seen);
735 foreach my $t (@_) {
736 next unless $t;
737 next if $seen{$t}++; # so we don't return it or hit it again
738 # push @out, super_languages($t); # nah, keep that separate
739 push @out, @{ $Panic{lc $t} || next };
4b053158 740 }
21aeefd5 741 return grep !$seen{$_}++, @out, 'en';
4b053158 742}
743
8000a3fa 744#---------------------------------------------------------------------------
745#---------------------------------------------------------------------------
746
747=item * the function implicate_supers( ...languages... )
748
749This takes a list of strings (which are presumed to be language-tags;
750strings that aren't, are ignored); and after each one, this function
751inserts super-ordinate forms that don't already appear in the list.
752The original list, plus these insertions, is returned.
753
754In other words, it takes this:
755
756 pt-br de-DE en-US fr pt-br-janeiro
757
758and returns this:
759
760 pt-br pt de-DE de en-US en fr pt-br-janeiro
761
762This function is most useful in the idiom
763
764 implicate_supers( I18N::LangTags::Detect::detect() );
765
766(See L<I18N::LangTags::Detect>.)
767
768
769=item * the function implicate_supers_strictly( ...languages... )
770
771This works like C<implicate_supers> except that the implicated
772forms are added to the end of the return list.
773
774In other words, implicate_supers_strictly takes a list of strings
775(which are presumed to be language-tags; strings that aren't, are
776ignored) and after the whole given list, it inserts the super-ordinate forms
777of all given tags, minus any tags that already appear in the input list.
778
779In other words, it takes this:
780
781 pt-br de-DE en-US fr pt-br-janeiro
782
783and returns this:
784
785 pt-br de-DE en-US fr pt-br-janeiro pt de en
786
787The reason this function has "_strictly" in its name is that when
788you're processing an Accept-Language list according to the RFCs, if
789you interpret the RFCs quite strictly, then you would use
790implicate_supers_strictly, but for normal use (i.e., common-sense use,
791as far as I'm concerned) you'd use implicate_supers.
792
793=cut
794
795sub implicate_supers {
796 my @languages = grep is_language_tag($_), @_;
797 my %seen_encoded;
798 foreach my $lang (@languages) {
799 $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
800 }
801
802 my(@output_languages);
803 foreach my $lang (@languages) {
804 push @output_languages, $lang;
805 foreach my $s ( I18N::LangTags::super_languages($lang) ) {
806 # Note that super_languages returns the longest first.
807 last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
808 push @output_languages, $s;
809 }
810 }
811 return uniq( @output_languages );
812
813}
814
815sub implicate_supers_strictly {
816 my @tags = grep is_language_tag($_), @_;
817 return uniq( @_, map super_languages($_), @_ );
818}
819
820
821
4b053158 822###########################################################################
21aeefd5 8231;
824__END__
4b053158 825
826=back
827
828=head1 ABOUT LOWERCASING
829
830I've considered making all the above functions that output language
831tags return all those tags strictly in lowercase. Having all your
832language tags in lowercase does make some things easier. But you
833might as well just lowercase as you like, or call
834C<encode_language_tag($lang1)> where appropriate.
835
836=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
837
838In some future version of I18N::LangTags, I plan to include support
839for RFC2482-style language tags -- which are basically just normal
840language tags with their ASCII characters shifted into Plane 14.
841
842=head1 SEE ALSO
843
e7525a17 844* L<I18N::LangTags::List|I18N::LangTags::List>
845
4b053158 846* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
847Identification of Languages". (Obsoletes RFC 1766)
848
849* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
850Character Sets and Languages".
851
852* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
853Value and Encoded Word Extensions: Character Sets, Languages, and
854Continuations".
855
8000a3fa 856* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
4b053158 857"Language Tagging in Unicode Plain Text".
858
859* Locale::Codes, in
f70da2ef 860C<http://www.perl.com/CPAN/modules/by-module/Locale/>
4b053158 861
4b053158 862* ISO 639-2, "Codes for the representation of names of languages",
aaf52a42 863including two-letter and three-letter codes,
864C<http://www.loc.gov/standards/iso639-2/langcodes.html>
4b053158 865
866* The IANA list of registered languages (hopefully up-to-date),
aaf52a42 867C<http://www.iana.org/assignments/language-tags>
4b053158 868
869=head1 COPYRIGHT
870
8000a3fa 871Copyright (c) 1998-2004 Sean M. Burke. All rights reserved.
4b053158 872
873This library is free software; you can redistribute it and/or
874modify it under the same terms as Perl itself.
875
876The programs and documentation in this dist are distributed in
877the hope that they will be useful, but without any warranty; without
878even the implied warranty of merchantability or fitness for a
879particular purpose.
880
881=head1 AUTHOR
882
883Sean M. Burke C<sburke@cpan.org>
884
885=cut
886