Commit | Line | Data |
4b053158 |
1 | |
77b20956 |
2 | # Time-stamp: "2004-10-06 23:26:33 ADT" |
4b053158 |
3 | # Sean M. Burke <sburke@cpan.org> |
4 | |
5 | require 5.000; |
6 | package I18N::LangTags; |
7 | use strict; |
21aeefd5 |
8 | use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); |
4b053158 |
9 | require Exporter; |
4b053158 |
10 | @ISA = qw(Exporter); |
11 | @EXPORT = qw(); |
12 | @EXPORT_OK = qw(is_language_tag same_language_tag |
13 | extract_language_tags super_languages |
14 | similarity_language_tag is_dialect_of |
15 | locale2language_tag alternate_language_tags |
21aeefd5 |
16 | encode_language_tag panic_languages |
8000a3fa |
17 | implicate_supers |
18 | implicate_supers_strictly |
4b053158 |
19 | ); |
21aeefd5 |
20 | %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); |
4b053158 |
21 | |
77b20956 |
22 | $VERSION = "0.35"; |
8000a3fa |
23 | |
24 | sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function |
25 | |
4b053158 |
26 | |
27 | =head1 NAME |
28 | |
29 | I18N::LangTags - functions for dealing with RFC3066-style language tags |
30 | |
31 | =head1 SYNOPSIS |
32 | |
8000a3fa |
33 | use I18N::LangTags(); |
34 | |
35 | ...or specify whichever of those functions you want to import, like so: |
4b053158 |
36 | |
8000a3fa |
37 | use I18N::LangTags qw(implicate_supers similarity_language_tag); |
38 | |
39 | All the exportable functions are listed below -- you're free to import |
40 | only some, or none at all. By default, none are imported. If you |
41 | say: |
21aeefd5 |
42 | |
43 | use I18N::LangTags qw(:ALL) |
44 | |
45 | ...then all are exported. (This saves you from having to use |
46 | something less obvious like C<use I18N::LangTags qw(/./)>.) |
4b053158 |
47 | |
48 | If you don't import any of these functions, assume a C<&I18N::LangTags::> |
49 | in front of all the function names in the following examples. |
50 | |
51 | =head1 DESCRIPTION |
52 | |
53 | Language tags are a formalism, described in RFC 3066 (obsoleting |
54 | 1766), for declaring what language form (language and possibly |
55 | dialect) a given chunk of information is in. |
56 | |
57 | This library provides functions for common tasks involving language |
58 | tags as they are needed in a variety of protocols and applications. |
59 | |
60 | Please see the "See Also" references for a thorough explanation |
61 | of how to correctly use language tags. |
62 | |
63 | =over |
64 | |
65 | =cut |
66 | |
67 | ########################################################################### |
68 | |
69 | =item * the function is_language_tag($lang1) |
70 | |
71 | Returns true iff $lang1 is a formally valid language tag. |
72 | |
73 | is_language_tag("fr") is TRUE |
74 | is_language_tag("x-jicarilla") is FALSE |
75 | (Subtags can be 8 chars long at most -- 'jicarilla' is 9) |
76 | |
77 | is_language_tag("sgn-US") is TRUE |
78 | (That's American Sign Language) |
79 | |
80 | is_language_tag("i-Klikitat") is TRUE |
81 | (True without regard to the fact noone has actually |
82 | registered Klikitat -- it's a formally valid tag) |
83 | |
84 | is_language_tag("fr-patois") is TRUE |
85 | (Formally valid -- altho descriptively weak!) |
86 | |
87 | is_language_tag("Spanish") is FALSE |
88 | is_language_tag("french-patois") is FALSE |
89 | (No good -- first subtag has to match |
90 | /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) |
91 | |
92 | is_language_tag("x-borg-prot2532") is TRUE |
93 | (Yes, subtags can contain digits, as of RFC3066) |
94 | |
95 | =cut |
96 | |
97 | sub is_language_tag { |
98 | |
99 | ## Changes in the language tagging standards may have to be reflected here. |
100 | |
101 | my($tag) = lc($_[0]); |
102 | |
103 | return 0 if $tag eq "i" or $tag eq "x"; |
21aeefd5 |
104 | # Bad degenerate cases that the following |
4b053158 |
105 | # regexp would erroneously let pass |
106 | |
107 | return $tag =~ |
108 | /^(?: # First subtag |
109 | [xi] | [a-z]{2,3} |
110 | ) |
111 | (?: # Subtags thereafter |
112 | - # separator |
113 | [a-z0-9]{1,8} # subtag |
114 | )* |
115 | $/xs ? 1 : 0; |
116 | } |
117 | |
118 | ########################################################################### |
119 | |
120 | =item * the function extract_language_tags($whatever) |
121 | |
122 | Returns a list of whatever looks like formally valid language tags |
123 | in $whatever. Not very smart, so don't get too creative with |
124 | what you want to feed it. |
125 | |
126 | extract_language_tags("fr, fr-ca, i-mingo") |
127 | returns: ('fr', 'fr-ca', 'i-mingo') |
128 | |
129 | extract_language_tags("It's like this: I'm in fr -- French!") |
130 | returns: ('It', 'in', 'fr') |
131 | (So don't just feed it any old thing.) |
132 | |
133 | The output is untainted. If you don't know what tainting is, |
134 | don't worry about it. |
135 | |
136 | =cut |
137 | |
138 | sub extract_language_tags { |
139 | |
140 | ## Changes in the language tagging standards may have to be reflected here. |
141 | |
142 | my($text) = |
143 | $_[0] =~ m/(.+)/ # to make for an untainted result |
144 | ? $1 : '' |
145 | ; |
146 | |
147 | return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags |
148 | $text =~ |
149 | m/ |
150 | \b |
151 | (?: # First subtag |
152 | [iIxX] | [a-zA-Z]{2,3} |
153 | ) |
154 | (?: # Subtags thereafter |
155 | - # separator |
156 | [a-zA-Z0-9]{1,8} # subtag |
157 | )* |
158 | \b |
159 | /xsg |
160 | ); |
161 | } |
162 | |
163 | ########################################################################### |
164 | |
165 | =item * the function same_language_tag($lang1, $lang2) |
166 | |
167 | Returns true iff $lang1 and $lang2 are acceptable variant tags |
168 | representing the same language-form. |
169 | |
170 | same_language_tag('x-kadara', 'i-kadara') is TRUE |
171 | (The x/i- alternation doesn't matter) |
172 | same_language_tag('X-KADARA', 'i-kadara') is TRUE |
173 | (...and neither does case) |
174 | same_language_tag('en', 'en-US') is FALSE |
175 | (all-English is not the SAME as US English) |
176 | same_language_tag('x-kadara', 'x-kadar') is FALSE |
177 | (these are totally unrelated tags) |
21aeefd5 |
178 | same_language_tag('no-bok', 'nb') is TRUE |
179 | (no-bok is a legacy tag for nb (Norwegian Bokmal)) |
4b053158 |
180 | |
181 | C<same_language_tag> works by just seeing whether |
182 | C<encode_language_tag($lang1)> is the same as |
183 | C<encode_language_tag($lang2)>. |
184 | |
185 | (Yes, I know this function is named a bit oddly. Call it historic |
186 | reasons.) |
187 | |
188 | =cut |
189 | |
190 | sub same_language_tag { |
191 | my $el1 = &encode_language_tag($_[0]); |
192 | return 0 unless defined $el1; |
193 | # this avoids the problem of |
194 | # encode_language_tag($lang1) eq and encode_language_tag($lang2) |
195 | # being true if $lang1 and $lang2 are both undef |
196 | |
197 | return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; |
198 | } |
199 | |
200 | ########################################################################### |
201 | |
202 | =item * the function similarity_language_tag($lang1, $lang2) |
203 | |
204 | Returns an integer representing the degree of similarity between |
205 | tags $lang1 and $lang2 (the order of which does not matter), where |
206 | similarity is the number of common elements on the left, |
207 | without regard to case and to x/i- alternation. |
208 | |
209 | similarity_language_tag('fr', 'fr-ca') is 1 |
210 | (one element in common) |
211 | similarity_language_tag('fr-ca', 'fr-FR') is 1 |
212 | (one element in common) |
213 | |
214 | similarity_language_tag('fr-CA-joual', |
215 | 'fr-CA-PEI') is 2 |
216 | similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 |
217 | (two elements in common) |
218 | |
219 | similarity_language_tag('x-kadara', 'i-kadara') is 1 |
220 | (x/i- doesn't matter) |
221 | |
222 | similarity_language_tag('en', 'x-kadar') is 0 |
223 | similarity_language_tag('x-kadara', 'x-kadar') is 0 |
224 | (unrelated tags -- no similarity) |
225 | |
226 | similarity_language_tag('i-cree-syllabic', |
227 | 'i-cherokee-syllabic') is 0 |
228 | (no B<leftmost> elements in common!) |
229 | |
230 | =cut |
231 | |
232 | sub similarity_language_tag { |
233 | my $lang1 = &encode_language_tag($_[0]); |
234 | my $lang2 = &encode_language_tag($_[1]); |
21aeefd5 |
235 | # And encode_language_tag takes care of the whole |
236 | # no-nyn==nn, i-hakka==zh-hakka, etc, things |
237 | |
4b053158 |
238 | # NB: (i-sil-...)? (i-sgn-...)? |
239 | |
240 | return undef if !defined($lang1) and !defined($lang2); |
241 | return 0 if !defined($lang1) or !defined($lang2); |
242 | |
243 | my @l1_subtags = split('-', $lang1); |
244 | my @l2_subtags = split('-', $lang2); |
245 | my $similarity = 0; |
246 | |
247 | while(@l1_subtags and @l2_subtags) { |
248 | if(shift(@l1_subtags) eq shift(@l2_subtags)) { |
249 | ++$similarity; |
250 | } else { |
251 | last; |
252 | } |
253 | } |
254 | return $similarity; |
255 | } |
256 | |
257 | ########################################################################### |
258 | |
259 | =item * the function is_dialect_of($lang1, $lang2) |
260 | |
21aeefd5 |
261 | Returns true iff language tag $lang1 represents a subform of |
4b053158 |
262 | language tag $lang2. |
263 | |
264 | B<Get the order right! It doesn't work the other way around!> |
265 | |
266 | is_dialect_of('en-US', 'en') is TRUE |
267 | (American English IS a dialect of all-English) |
268 | |
269 | is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE |
270 | is_dialect_of('fr-CA-joual', 'fr') is TRUE |
271 | (Joual is a dialect of (a dialect of) French) |
272 | |
273 | is_dialect_of('en', 'en-US') is FALSE |
274 | (all-English is a NOT dialect of American English) |
275 | |
276 | is_dialect_of('fr', 'en-CA') is FALSE |
277 | |
21aeefd5 |
278 | is_dialect_of('en', 'en' ) is TRUE |
279 | is_dialect_of('en-US', 'en-US') is TRUE |
4b053158 |
280 | (B<Note:> these are degenerate cases) |
281 | |
282 | is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE |
283 | (the x/i thing doesn't matter, nor does case) |
284 | |
21aeefd5 |
285 | is_dialect_of('nn', 'no') is TRUE |
286 | (because 'nn' (New Norse) is aliased to 'no-nyn', |
287 | as a special legacy case, and 'no-nyn' is a |
288 | subform of 'no' (Norwegian)) |
289 | |
4b053158 |
290 | =cut |
291 | |
292 | sub is_dialect_of { |
293 | |
294 | my $lang1 = &encode_language_tag($_[0]); |
295 | my $lang2 = &encode_language_tag($_[1]); |
296 | |
297 | return undef if !defined($lang1) and !defined($lang2); |
298 | return 0 if !defined($lang1) or !defined($lang2); |
299 | |
300 | return 1 if $lang1 eq $lang2; |
301 | return 0 if length($lang1) < length($lang2); |
302 | |
303 | $lang1 .= '-'; |
304 | $lang2 .= '-'; |
305 | return |
306 | (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; |
307 | } |
308 | |
309 | ########################################################################### |
310 | |
311 | =item * the function super_languages($lang1) |
312 | |
313 | Returns a list of language tags that are superordinate tags to $lang1 |
314 | -- it gets this by removing subtags from the end of $lang1 until |
315 | nothing (or just "i" or "x") is left. |
316 | |
317 | super_languages("fr-CA-joual") is ("fr-CA", "fr") |
318 | |
319 | super_languages("en-AU") is ("en") |
320 | |
321 | super_languages("en") is empty-list, () |
322 | |
323 | super_languages("i-cherokee") is empty-list, () |
324 | ...not ("i"), which would be illegal as well as pointless. |
325 | |
326 | If $lang1 is not a valid language tag, returns empty-list in |
327 | a list context, undef in a scalar context. |
328 | |
329 | A notable and rather unavoidable problem with this method: |
330 | "x-mingo-tom" has an "x" because the whole tag isn't an |
331 | IANA-registered tag -- but super_languages('x-mingo-tom') is |
332 | ('x-mingo') -- which isn't really right, since 'i-mingo' is |
333 | registered. But this module has no way of knowing that. (But note |
334 | that same_language_tag('x-mingo', 'i-mingo') is TRUE.) |
335 | |
336 | More importantly, you assume I<at your peril> that superordinates of |
337 | $lang1 are mutually intelligible with $lang1. Consider this |
338 | carefully. |
339 | |
8000a3fa |
340 | =cut |
4b053158 |
341 | |
342 | sub super_languages { |
343 | my $lang1 = $_[0]; |
344 | return() unless defined($lang1) && &is_language_tag($lang1); |
21aeefd5 |
345 | |
346 | # a hack for those annoying new (2001) tags: |
347 | $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards |
348 | $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards |
349 | $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way |
350 | # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark |
351 | |
4b053158 |
352 | my @l1_subtags = split('-', $lang1); |
353 | |
354 | ## Changes in the language tagging standards may have to be reflected here. |
355 | |
356 | # NB: (i-sil-...)? |
357 | |
358 | my @supers = (); |
359 | foreach my $bit (@l1_subtags) { |
360 | push @supers, |
361 | scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; |
362 | } |
363 | pop @supers if @supers; |
364 | shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; |
365 | return reverse @supers; |
366 | } |
367 | |
368 | ########################################################################### |
369 | |
370 | =item * the function locale2language_tag($locale_identifier) |
371 | |
372 | This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") |
373 | and maps it to a language tag. If it's not mappable (as with, |
374 | notably, "C" and "POSIX"), this returns empty-list in a list context, |
375 | or undef in a scalar context. |
376 | |
377 | locale2language_tag("en") is "en" |
378 | |
379 | locale2language_tag("en_US") is "en-US" |
380 | |
381 | locale2language_tag("en_US.ISO8859-1") is "en-US" |
382 | |
383 | locale2language_tag("C") is undef or () |
384 | |
385 | locale2language_tag("POSIX") is undef or () |
386 | |
387 | locale2language_tag("POSIX") is undef or () |
388 | |
389 | I'm not totally sure that locale names map satisfactorily to language |
390 | tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. |
391 | |
392 | The output is untainted. If you don't know what tainting is, |
393 | don't worry about it. |
394 | |
8000a3fa |
395 | =cut |
4b053158 |
396 | |
397 | sub locale2language_tag { |
398 | my $lang = |
399 | $_[0] =~ m/(.+)/ # to make for an untainted result |
400 | ? $1 : '' |
401 | ; |
402 | |
403 | return $lang if &is_language_tag($lang); # like "en" |
404 | |
405 | $lang =~ tr<_><->; # "en_US" -> en-US |
77b20956 |
406 | $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US |
407 | # it_IT.utf8@euro => it-IT |
4b053158 |
408 | |
409 | return $lang if &is_language_tag($lang); |
410 | |
411 | return; |
412 | } |
413 | |
414 | ########################################################################### |
415 | |
416 | =item * the function encode_language_tag($lang1) |
417 | |
418 | This function, if given a language tag, returns an encoding of it such |
419 | that: |
420 | |
421 | * tags representing different languages never get the same encoding. |
422 | |
423 | * tags representing the same language always get the same encoding. |
424 | |
425 | * an encoding of a formally valid language tag always is a string |
426 | value that is defined, has length, and is true if considered as a |
427 | boolean. |
428 | |
429 | Note that the encoding itself is B<not> a formally valid language tag. |
430 | Note also that you cannot, currently, go from an encoding back to a |
431 | language tag that it's an encoding of. |
432 | |
433 | Note also that you B<must> consider the encoded value as atomic; i.e., |
434 | you should not consider it as anything but an opaque, unanalysable |
435 | string value. (The internals of the encoding method may change in |
436 | future versions, as the language tagging standard changes over time.) |
437 | |
438 | C<encode_language_tag> returns undef if given anything other than a |
439 | formally valid language tag. |
440 | |
441 | The reason C<encode_language_tag> exists is because different language |
442 | tags may represent the same language; this is normally treatable with |
443 | C<same_language_tag>, but consider this situation: |
444 | |
445 | You have a data file that expresses greetings in different languages. |
446 | Its format is "[language tag]=[how to say 'Hello']", like: |
447 | |
448 | en-US=Hiho |
449 | fr=Bonjour |
450 | i-mingo=Hau' |
451 | |
452 | And suppose you write a program that reads that file and then runs as |
453 | a daemon, answering client requests that specify a language tag and |
454 | then expect the string that says how to greet in that language. So an |
455 | interaction looks like: |
456 | |
457 | greeting-client asks: fr |
458 | greeting-server answers: Bonjour |
459 | |
460 | So far so good. But suppose the way you're implementing this is: |
461 | |
462 | my %greetings; |
463 | die unless open(IN, "<in.dat"); |
464 | while(<IN>) { |
465 | chomp; |
466 | next unless /^([^=]+)=(.+)/s; |
467 | my($lang, $expr) = ($1, $2); |
468 | $greetings{$lang} = $expr; |
469 | } |
470 | close(IN); |
471 | |
472 | at which point %greetings has the contents: |
473 | |
474 | "en-US" => "Hiho" |
475 | "fr" => "Bonjour" |
476 | "i-mingo" => "Hau'" |
477 | |
478 | And suppose then that you answer client requests for language $wanted |
479 | by just looking up $greetings{$wanted}. |
480 | |
481 | If the client asks for "fr", that will look up successfully in |
482 | %greetings, to the value "Bonjour". And if the client asks for |
483 | "i-mingo", that will look up successfully in %greetings, to the value |
484 | "Hau'". |
485 | |
486 | But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the |
487 | lookup in %greetings fails. That's the Wrong Thing. |
488 | |
489 | You could instead do lookups on $wanted with: |
490 | |
491 | use I18N::LangTags qw(same_language_tag); |
3c4b39be |
492 | my $response = ''; |
4b053158 |
493 | foreach my $l2 (keys %greetings) { |
494 | if(same_language_tag($wanted, $l2)) { |
495 | $response = $greetings{$l2}; |
496 | last; |
497 | } |
498 | } |
499 | |
500 | But that's rather inefficient. A better way to do it is to start your |
501 | program with: |
502 | |
503 | use I18N::LangTags qw(encode_language_tag); |
504 | my %greetings; |
505 | die unless open(IN, "<in.dat"); |
506 | while(<IN>) { |
507 | chomp; |
508 | next unless /^([^=]+)=(.+)/s; |
509 | my($lang, $expr) = ($1, $2); |
510 | $greetings{ |
511 | encode_language_tag($lang) |
512 | } = $expr; |
513 | } |
514 | close(IN); |
515 | |
516 | and then just answer client requests for language $wanted by just |
517 | looking up |
518 | |
519 | $greetings{encode_language_tag($wanted)} |
520 | |
521 | And that does the Right Thing. |
522 | |
523 | =cut |
524 | |
525 | sub encode_language_tag { |
526 | # Only similarity_language_tag() is allowed to analyse encodings! |
527 | |
528 | ## Changes in the language tagging standards may have to be reflected here. |
529 | |
21aeefd5 |
530 | my($tag) = $_[0] || return undef; |
4b053158 |
531 | return undef unless &is_language_tag($tag); |
21aeefd5 |
532 | |
533 | # For the moment, these legacy variances are few enough that |
534 | # we can just handle them here with regexps. |
535 | $tag =~ s/^iw\b/he/i; # Hebrew |
536 | $tag =~ s/^in\b/id/i; # Indonesian |
aaf52a42 |
537 | $tag =~ s/^cre\b/cr/i; # Cree |
538 | $tag =~ s/^jw\b/jv/i; # Javanese |
21aeefd5 |
539 | $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger |
540 | $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo |
541 | $tag =~ s/^ji\b/yi/i; # Yiddish |
aaf52a42 |
542 | # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now, |
543 | # but maybe they're all so obscure I can ignore them. "Obscure" |
544 | # meaning either that the language is obscure, and/or that the |
545 | # XXX form was extant so briefly that it's unlikely it was ever |
546 | # used. I hope. |
21aeefd5 |
547 | # |
548 | # These go FROM the simplex to complex form, to get |
549 | # similarity-comparison right. And that's okay, since |
550 | # similarity_language_tag is the only thing that |
551 | # analyzes our output. |
552 | $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka |
553 | $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal |
554 | $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk |
4b053158 |
555 | |
556 | $tag =~ s/^[xiXI]-//s; |
557 | # Just lop off any leading "x/i-" |
4b053158 |
558 | |
21aeefd5 |
559 | return "~" . uc($tag); |
4b053158 |
560 | } |
561 | |
562 | #-------------------------------------------------------------------------- |
563 | |
564 | =item * the function alternate_language_tags($lang1) |
565 | |
566 | This function, if given a language tag, returns all language tags that |
21aeefd5 |
567 | are alternate forms of this language tag. (I.e., tags which refer to |
568 | the same language.) This is meant to handle legacy tags caused by |
569 | the minor changes in language tag standards over the years; and |
570 | the x-/i- alternation is also dealt with. |
571 | |
572 | Note that this function does I<not> try to equate new (and never-used, |
573 | and unusable) |
574 | ISO639-2 three-letter tags to old (and still in use) ISO639-1 |
575 | two-letter equivalents -- like "ara" -> "ar" -- because |
576 | "ara" has I<never> been in use as an Internet language tag, |
577 | and RFC 3066 stipulates that it never should be, since a shorter |
578 | tag ("ar") exists. |
579 | |
580 | Examples: |
581 | |
582 | alternate_language_tags('no-bok') is ('nb') |
583 | alternate_language_tags('nb') is ('no-bok') |
584 | alternate_language_tags('he') is ('iw') |
585 | alternate_language_tags('iw') is ('he') |
586 | alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') |
587 | alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') |
588 | alternate_language_tags('en') is () |
589 | alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') |
590 | alternate_language_tags('x-klikitat') is ('i-klikitat') |
591 | alternate_language_tags('i-klikitat') is ('x-klikitat') |
592 | |
593 | This function returns empty-list if given anything other than a formally |
4b053158 |
594 | valid language tag. |
595 | |
596 | =cut |
597 | |
598 | my %alt = qw( i x x i I X X I ); |
599 | sub alternate_language_tags { |
4b053158 |
600 | my $tag = $_[0]; |
601 | return() unless &is_language_tag($tag); |
602 | |
21aeefd5 |
603 | my @em; # push 'em real goood! |
604 | |
605 | # For the moment, these legacy variances are few enough that |
606 | # we can just handle them here with regexps. |
607 | |
608 | if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; |
609 | } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; |
610 | |
611 | } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; |
612 | } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; |
613 | |
614 | } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; |
615 | } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; |
616 | |
617 | } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; |
618 | } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; |
619 | |
620 | } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; |
621 | } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; |
622 | |
623 | } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; |
624 | } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; |
625 | |
626 | } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; |
627 | } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; |
628 | |
629 | } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; |
630 | } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; |
631 | } |
632 | |
633 | push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; |
634 | return @em; |
635 | } |
636 | |
637 | ########################################################################### |
638 | |
639 | { |
640 | # Init %Panic... |
641 | |
642 | my @panic = ( # MUST all be lowercase! |
643 | # Only large ("national") languages make it in this list. |
644 | # If you, as a user, are so bizarre that the /only/ language |
645 | # you claim to accept is Galician, then no, we won't do you |
646 | # the favor of providing Catalan as a panic-fallback for |
647 | # you. Because if I start trying to add "little languages" in |
648 | # here, I'll just go crazy. |
649 | |
4cf5bee0 |
650 | # Scandinavian lgs. All based on opinion and hearsay. |
651 | 'sv' => [qw(nb no da nn)], |
652 | 'da' => [qw(nb no sv nn)], # I guess |
653 | [qw(no nn nb)], [qw(no nn nb sv da)], |
654 | 'is' => [qw(da sv no nb nn)], |
655 | 'fo' => [qw(da is no nb nn sv)], # I guess |
21aeefd5 |
656 | |
657 | # I think this is about the extent of tolerable intelligibility |
658 | # among large modern Romance languages. |
659 | 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French |
660 | 'ca' => [qw(es pt it fr)], |
661 | 'es' => [qw(ca it fr pt)], |
662 | 'it' => [qw(es fr ca pt)], |
663 | 'fr' => [qw(es it ca pt)], |
664 | |
665 | # Also assume that speakers of the main Indian languages prefer |
666 | # to read/hear Hindi over English |
667 | [qw( |
668 | as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur |
669 | )] => 'hi', |
670 | # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, |
671 | # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, |
672 | # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. |
673 | 'hi' => [qw(bn pa as or)], |
674 | # I welcome finer data for the other Indian languages. |
675 | # E.g., what should Oriya's list be, besides just Hindi? |
676 | |
677 | # And the panic languages for English is, of course, nil! |
678 | |
679 | # My guesses at Slavic intelligibility: |
680 | ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian |
681 | 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat |
682 | 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak |
683 | |
684 | 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian |
685 | |
686 | 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish |
687 | |
688 | #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai |
689 | |
690 | ); |
691 | my($k,$v); |
692 | while(@panic) { |
693 | ($k,$v) = splice(@panic,0,2); |
694 | foreach my $k (ref($k) ? @$k : $k) { |
695 | foreach my $v (ref($v) ? @$v : $v) { |
696 | push @{$Panic{$k} ||= []}, $v unless $k eq $v; |
697 | } |
698 | } |
699 | } |
700 | } |
701 | |
702 | =item * the function @langs = panic_languages(@accept_languages) |
703 | |
704 | This function takes a list of 0 or more language |
705 | tags that constitute a given user's Accept-Language list, and |
706 | returns a list of tags for I<other> (non-super) |
707 | languages that are probably acceptable to the user, to be |
708 | used I<if all else fails>. |
709 | |
710 | For example, if a user accepts only 'ca' (Catalan) and |
711 | 'es' (Spanish), and the documents/interfaces you have |
712 | available are just in German, Italian, and Chinese, then |
713 | the user will most likely want the Italian one (and not |
714 | the Chinese or German one!), instead of getting |
715 | nothing. So C<panic_languages('ca', 'es')> returns |
716 | a list containing 'it' (Italian). |
717 | |
718 | English ('en') is I<always> in the return list, but |
719 | whether it's at the very end or not depends |
720 | on the input languages. This function works by consulting |
721 | an internal table that stipulates what common |
722 | languages are "close" to each other. |
723 | |
724 | A useful construct you might consider using is: |
725 | |
726 | @fallbacks = super_languages(@accept_languages); |
727 | push @fallbacks, panic_languages( |
728 | @accept_languages, @fallbacks, |
729 | ); |
730 | |
731 | =cut |
4b053158 |
732 | |
21aeefd5 |
733 | sub panic_languages { |
734 | # When in panic or in doubt, run in circles, scream, and shout! |
735 | my(@out, %seen); |
736 | foreach my $t (@_) { |
737 | next unless $t; |
738 | next if $seen{$t}++; # so we don't return it or hit it again |
739 | # push @out, super_languages($t); # nah, keep that separate |
740 | push @out, @{ $Panic{lc $t} || next }; |
4b053158 |
741 | } |
21aeefd5 |
742 | return grep !$seen{$_}++, @out, 'en'; |
4b053158 |
743 | } |
744 | |
8000a3fa |
745 | #--------------------------------------------------------------------------- |
746 | #--------------------------------------------------------------------------- |
747 | |
748 | =item * the function implicate_supers( ...languages... ) |
749 | |
750 | This takes a list of strings (which are presumed to be language-tags; |
751 | strings that aren't, are ignored); and after each one, this function |
752 | inserts super-ordinate forms that don't already appear in the list. |
753 | The original list, plus these insertions, is returned. |
754 | |
755 | In other words, it takes this: |
756 | |
757 | pt-br de-DE en-US fr pt-br-janeiro |
758 | |
759 | and returns this: |
760 | |
761 | pt-br pt de-DE de en-US en fr pt-br-janeiro |
762 | |
763 | This function is most useful in the idiom |
764 | |
765 | implicate_supers( I18N::LangTags::Detect::detect() ); |
766 | |
767 | (See L<I18N::LangTags::Detect>.) |
768 | |
769 | |
770 | =item * the function implicate_supers_strictly( ...languages... ) |
771 | |
772 | This works like C<implicate_supers> except that the implicated |
773 | forms are added to the end of the return list. |
774 | |
775 | In other words, implicate_supers_strictly takes a list of strings |
776 | (which are presumed to be language-tags; strings that aren't, are |
777 | ignored) and after the whole given list, it inserts the super-ordinate forms |
778 | of all given tags, minus any tags that already appear in the input list. |
779 | |
780 | In other words, it takes this: |
781 | |
782 | pt-br de-DE en-US fr pt-br-janeiro |
783 | |
784 | and returns this: |
785 | |
786 | pt-br de-DE en-US fr pt-br-janeiro pt de en |
787 | |
788 | The reason this function has "_strictly" in its name is that when |
789 | you're processing an Accept-Language list according to the RFCs, if |
790 | you interpret the RFCs quite strictly, then you would use |
791 | implicate_supers_strictly, but for normal use (i.e., common-sense use, |
792 | as far as I'm concerned) you'd use implicate_supers. |
793 | |
794 | =cut |
795 | |
796 | sub implicate_supers { |
797 | my @languages = grep is_language_tag($_), @_; |
798 | my %seen_encoded; |
799 | foreach my $lang (@languages) { |
800 | $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1 |
801 | } |
802 | |
803 | my(@output_languages); |
804 | foreach my $lang (@languages) { |
805 | push @output_languages, $lang; |
806 | foreach my $s ( I18N::LangTags::super_languages($lang) ) { |
807 | # Note that super_languages returns the longest first. |
808 | last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) }; |
809 | push @output_languages, $s; |
810 | } |
811 | } |
812 | return uniq( @output_languages ); |
813 | |
814 | } |
815 | |
816 | sub implicate_supers_strictly { |
817 | my @tags = grep is_language_tag($_), @_; |
818 | return uniq( @_, map super_languages($_), @_ ); |
819 | } |
820 | |
821 | |
822 | |
4b053158 |
823 | ########################################################################### |
21aeefd5 |
824 | 1; |
825 | __END__ |
4b053158 |
826 | |
827 | =back |
828 | |
829 | =head1 ABOUT LOWERCASING |
830 | |
831 | I've considered making all the above functions that output language |
832 | tags return all those tags strictly in lowercase. Having all your |
833 | language tags in lowercase does make some things easier. But you |
834 | might as well just lowercase as you like, or call |
835 | C<encode_language_tag($lang1)> where appropriate. |
836 | |
837 | =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS |
838 | |
839 | In some future version of I18N::LangTags, I plan to include support |
840 | for RFC2482-style language tags -- which are basically just normal |
841 | language tags with their ASCII characters shifted into Plane 14. |
842 | |
843 | =head1 SEE ALSO |
844 | |
e7525a17 |
845 | * L<I18N::LangTags::List|I18N::LangTags::List> |
846 | |
4b053158 |
847 | * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the |
848 | Identification of Languages". (Obsoletes RFC 1766) |
849 | |
850 | * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on |
851 | Character Sets and Languages". |
852 | |
853 | * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter |
854 | Value and Encoded Word Extensions: Character Sets, Languages, and |
855 | Continuations". |
856 | |
8000a3fa |
857 | * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, |
4b053158 |
858 | "Language Tagging in Unicode Plain Text". |
859 | |
860 | * Locale::Codes, in |
f70da2ef |
861 | C<http://www.perl.com/CPAN/modules/by-module/Locale/> |
4b053158 |
862 | |
4b053158 |
863 | * ISO 639-2, "Codes for the representation of names of languages", |
aaf52a42 |
864 | including two-letter and three-letter codes, |
865 | C<http://www.loc.gov/standards/iso639-2/langcodes.html> |
4b053158 |
866 | |
867 | * The IANA list of registered languages (hopefully up-to-date), |
aaf52a42 |
868 | C<http://www.iana.org/assignments/language-tags> |
4b053158 |
869 | |
870 | =head1 COPYRIGHT |
871 | |
77b20956 |
872 | Copyright (c) 1998+ Sean M. Burke. All rights reserved. |
4b053158 |
873 | |
874 | This library is free software; you can redistribute it and/or |
875 | modify it under the same terms as Perl itself. |
876 | |
877 | The programs and documentation in this dist are distributed in |
878 | the hope that they will be useful, but without any warranty; without |
879 | even the implied warranty of merchantability or fitness for a |
880 | particular purpose. |
881 | |
882 | =head1 AUTHOR |
883 | |
884 | Sean M. Burke C<sburke@cpan.org> |
885 | |
886 | =cut |
887 | |