Commit | Line | Data |
4b053158 |
1 | |
2 | # Time-stamp: "2001-05-25 07:36:55 MDT" |
3 | # Sean M. Burke <sburke@cpan.org> |
4 | |
5 | require 5.000; |
6 | package I18N::LangTags; |
7 | use strict; |
8 | use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug |
9 | require Exporter; |
10 | # $Debug = 0; |
11 | @ISA = qw(Exporter); |
12 | @EXPORT = qw(); |
13 | @EXPORT_OK = qw(is_language_tag same_language_tag |
14 | extract_language_tags super_languages |
15 | similarity_language_tag is_dialect_of |
16 | locale2language_tag alternate_language_tags |
17 | encode_language_tag |
18 | ); |
19 | |
20 | $VERSION = "0.21"; |
21 | |
22 | =head1 NAME |
23 | |
24 | I18N::LangTags - functions for dealing with RFC3066-style language tags |
25 | |
26 | =head1 SYNOPSIS |
27 | |
28 | use I18N::LangTags qw(is_language_tag same_language_tag |
29 | extract_language_tags super_languages |
30 | similarity_language_tag is_dialect_of |
31 | locale2language_tag alternate_language_tags |
32 | encode_language_tag |
33 | ); |
34 | |
35 | ...or whatever of those functions you want to import. Those are |
36 | all the exportable functions -- you're free to import only some, |
37 | or none at all. By default, none are imported. |
38 | |
39 | If you don't import any of these functions, assume a C<&I18N::LangTags::> |
40 | in front of all the function names in the following examples. |
41 | |
42 | =head1 DESCRIPTION |
43 | |
44 | Language tags are a formalism, described in RFC 3066 (obsoleting |
45 | 1766), for declaring what language form (language and possibly |
46 | dialect) a given chunk of information is in. |
47 | |
48 | This library provides functions for common tasks involving language |
49 | tags as they are needed in a variety of protocols and applications. |
50 | |
51 | Please see the "See Also" references for a thorough explanation |
52 | of how to correctly use language tags. |
53 | |
54 | =over |
55 | |
56 | =cut |
57 | |
58 | ########################################################################### |
59 | |
60 | =item * the function is_language_tag($lang1) |
61 | |
62 | Returns true iff $lang1 is a formally valid language tag. |
63 | |
64 | is_language_tag("fr") is TRUE |
65 | is_language_tag("x-jicarilla") is FALSE |
66 | (Subtags can be 8 chars long at most -- 'jicarilla' is 9) |
67 | |
68 | is_language_tag("sgn-US") is TRUE |
69 | (That's American Sign Language) |
70 | |
71 | is_language_tag("i-Klikitat") is TRUE |
72 | (True without regard to the fact noone has actually |
73 | registered Klikitat -- it's a formally valid tag) |
74 | |
75 | is_language_tag("fr-patois") is TRUE |
76 | (Formally valid -- altho descriptively weak!) |
77 | |
78 | is_language_tag("Spanish") is FALSE |
79 | is_language_tag("french-patois") is FALSE |
80 | (No good -- first subtag has to match |
81 | /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) |
82 | |
83 | is_language_tag("x-borg-prot2532") is TRUE |
84 | (Yes, subtags can contain digits, as of RFC3066) |
85 | |
86 | =cut |
87 | |
88 | sub is_language_tag { |
89 | |
90 | ## Changes in the language tagging standards may have to be reflected here. |
91 | |
92 | my($tag) = lc($_[0]); |
93 | |
94 | return 0 if $tag eq "i" or $tag eq "x"; |
95 | # Bad degenerate cases the following |
96 | # regexp would erroneously let pass |
97 | |
98 | return $tag =~ |
99 | /^(?: # First subtag |
100 | [xi] | [a-z]{2,3} |
101 | ) |
102 | (?: # Subtags thereafter |
103 | - # separator |
104 | [a-z0-9]{1,8} # subtag |
105 | )* |
106 | $/xs ? 1 : 0; |
107 | } |
108 | |
109 | ########################################################################### |
110 | |
111 | =item * the function extract_language_tags($whatever) |
112 | |
113 | Returns a list of whatever looks like formally valid language tags |
114 | in $whatever. Not very smart, so don't get too creative with |
115 | what you want to feed it. |
116 | |
117 | extract_language_tags("fr, fr-ca, i-mingo") |
118 | returns: ('fr', 'fr-ca', 'i-mingo') |
119 | |
120 | extract_language_tags("It's like this: I'm in fr -- French!") |
121 | returns: ('It', 'in', 'fr') |
122 | (So don't just feed it any old thing.) |
123 | |
124 | The output is untainted. If you don't know what tainting is, |
125 | don't worry about it. |
126 | |
127 | =cut |
128 | |
129 | sub extract_language_tags { |
130 | |
131 | ## Changes in the language tagging standards may have to be reflected here. |
132 | |
133 | my($text) = |
134 | $_[0] =~ m/(.+)/ # to make for an untainted result |
135 | ? $1 : '' |
136 | ; |
137 | |
138 | return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags |
139 | $text =~ |
140 | m/ |
141 | \b |
142 | (?: # First subtag |
143 | [iIxX] | [a-zA-Z]{2,3} |
144 | ) |
145 | (?: # Subtags thereafter |
146 | - # separator |
147 | [a-zA-Z0-9]{1,8} # subtag |
148 | )* |
149 | \b |
150 | /xsg |
151 | ); |
152 | } |
153 | |
154 | ########################################################################### |
155 | |
156 | =item * the function same_language_tag($lang1, $lang2) |
157 | |
158 | Returns true iff $lang1 and $lang2 are acceptable variant tags |
159 | representing the same language-form. |
160 | |
161 | same_language_tag('x-kadara', 'i-kadara') is TRUE |
162 | (The x/i- alternation doesn't matter) |
163 | same_language_tag('X-KADARA', 'i-kadara') is TRUE |
164 | (...and neither does case) |
165 | same_language_tag('en', 'en-US') is FALSE |
166 | (all-English is not the SAME as US English) |
167 | same_language_tag('x-kadara', 'x-kadar') is FALSE |
168 | (these are totally unrelated tags) |
169 | |
170 | C<same_language_tag> works by just seeing whether |
171 | C<encode_language_tag($lang1)> is the same as |
172 | C<encode_language_tag($lang2)>. |
173 | |
174 | (Yes, I know this function is named a bit oddly. Call it historic |
175 | reasons.) |
176 | |
177 | =cut |
178 | |
179 | sub same_language_tag { |
180 | my $el1 = &encode_language_tag($_[0]); |
181 | return 0 unless defined $el1; |
182 | # this avoids the problem of |
183 | # encode_language_tag($lang1) eq and encode_language_tag($lang2) |
184 | # being true if $lang1 and $lang2 are both undef |
185 | |
186 | return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; |
187 | } |
188 | |
189 | ########################################################################### |
190 | |
191 | =item * the function similarity_language_tag($lang1, $lang2) |
192 | |
193 | Returns an integer representing the degree of similarity between |
194 | tags $lang1 and $lang2 (the order of which does not matter), where |
195 | similarity is the number of common elements on the left, |
196 | without regard to case and to x/i- alternation. |
197 | |
198 | similarity_language_tag('fr', 'fr-ca') is 1 |
199 | (one element in common) |
200 | similarity_language_tag('fr-ca', 'fr-FR') is 1 |
201 | (one element in common) |
202 | |
203 | similarity_language_tag('fr-CA-joual', |
204 | 'fr-CA-PEI') is 2 |
205 | similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 |
206 | (two elements in common) |
207 | |
208 | similarity_language_tag('x-kadara', 'i-kadara') is 1 |
209 | (x/i- doesn't matter) |
210 | |
211 | similarity_language_tag('en', 'x-kadar') is 0 |
212 | similarity_language_tag('x-kadara', 'x-kadar') is 0 |
213 | (unrelated tags -- no similarity) |
214 | |
215 | similarity_language_tag('i-cree-syllabic', |
216 | 'i-cherokee-syllabic') is 0 |
217 | (no B<leftmost> elements in common!) |
218 | |
219 | =cut |
220 | |
221 | sub similarity_language_tag { |
222 | my $lang1 = &encode_language_tag($_[0]); |
223 | my $lang2 = &encode_language_tag($_[1]); |
224 | |
225 | # NB: (i-sil-...)? (i-sgn-...)? |
226 | |
227 | return undef if !defined($lang1) and !defined($lang2); |
228 | return 0 if !defined($lang1) or !defined($lang2); |
229 | |
230 | my @l1_subtags = split('-', $lang1); |
231 | my @l2_subtags = split('-', $lang2); |
232 | my $similarity = 0; |
233 | |
234 | while(@l1_subtags and @l2_subtags) { |
235 | if(shift(@l1_subtags) eq shift(@l2_subtags)) { |
236 | ++$similarity; |
237 | } else { |
238 | last; |
239 | } |
240 | } |
241 | return $similarity; |
242 | } |
243 | |
244 | ########################################################################### |
245 | |
246 | =item * the function is_dialect_of($lang1, $lang2) |
247 | |
248 | Returns true iff language tag $lang1 represents a subdialect of |
249 | language tag $lang2. |
250 | |
251 | B<Get the order right! It doesn't work the other way around!> |
252 | |
253 | is_dialect_of('en-US', 'en') is TRUE |
254 | (American English IS a dialect of all-English) |
255 | |
256 | is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE |
257 | is_dialect_of('fr-CA-joual', 'fr') is TRUE |
258 | (Joual is a dialect of (a dialect of) French) |
259 | |
260 | is_dialect_of('en', 'en-US') is FALSE |
261 | (all-English is a NOT dialect of American English) |
262 | |
263 | is_dialect_of('fr', 'en-CA') is FALSE |
264 | |
265 | is_dialect_of('en', 'en' ) is TRUE |
266 | is_dialect_of('en-US', 'en-US') is TRUE |
267 | (B<Note:> these are degenerate cases) |
268 | |
269 | is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE |
270 | (the x/i thing doesn't matter, nor does case) |
271 | |
272 | =cut |
273 | |
274 | sub is_dialect_of { |
275 | |
276 | my $lang1 = &encode_language_tag($_[0]); |
277 | my $lang2 = &encode_language_tag($_[1]); |
278 | |
279 | return undef if !defined($lang1) and !defined($lang2); |
280 | return 0 if !defined($lang1) or !defined($lang2); |
281 | |
282 | return 1 if $lang1 eq $lang2; |
283 | return 0 if length($lang1) < length($lang2); |
284 | |
285 | $lang1 .= '-'; |
286 | $lang2 .= '-'; |
287 | return |
288 | (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; |
289 | } |
290 | |
291 | ########################################################################### |
292 | |
293 | =item * the function super_languages($lang1) |
294 | |
295 | Returns a list of language tags that are superordinate tags to $lang1 |
296 | -- it gets this by removing subtags from the end of $lang1 until |
297 | nothing (or just "i" or "x") is left. |
298 | |
299 | super_languages("fr-CA-joual") is ("fr-CA", "fr") |
300 | |
301 | super_languages("en-AU") is ("en") |
302 | |
303 | super_languages("en") is empty-list, () |
304 | |
305 | super_languages("i-cherokee") is empty-list, () |
306 | ...not ("i"), which would be illegal as well as pointless. |
307 | |
308 | If $lang1 is not a valid language tag, returns empty-list in |
309 | a list context, undef in a scalar context. |
310 | |
311 | A notable and rather unavoidable problem with this method: |
312 | "x-mingo-tom" has an "x" because the whole tag isn't an |
313 | IANA-registered tag -- but super_languages('x-mingo-tom') is |
314 | ('x-mingo') -- which isn't really right, since 'i-mingo' is |
315 | registered. But this module has no way of knowing that. (But note |
316 | that same_language_tag('x-mingo', 'i-mingo') is TRUE.) |
317 | |
318 | More importantly, you assume I<at your peril> that superordinates of |
319 | $lang1 are mutually intelligible with $lang1. Consider this |
320 | carefully. |
321 | |
322 | =cut |
323 | |
324 | sub super_languages { |
325 | my $lang1 = $_[0]; |
326 | return() unless defined($lang1) && &is_language_tag($lang1); |
327 | my @l1_subtags = split('-', $lang1); |
328 | |
329 | ## Changes in the language tagging standards may have to be reflected here. |
330 | |
331 | # NB: (i-sil-...)? |
332 | |
333 | my @supers = (); |
334 | foreach my $bit (@l1_subtags) { |
335 | push @supers, |
336 | scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; |
337 | } |
338 | pop @supers if @supers; |
339 | shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; |
340 | return reverse @supers; |
341 | } |
342 | |
343 | ########################################################################### |
344 | |
345 | =item * the function locale2language_tag($locale_identifier) |
346 | |
347 | This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") |
348 | and maps it to a language tag. If it's not mappable (as with, |
349 | notably, "C" and "POSIX"), this returns empty-list in a list context, |
350 | or undef in a scalar context. |
351 | |
352 | locale2language_tag("en") is "en" |
353 | |
354 | locale2language_tag("en_US") is "en-US" |
355 | |
356 | locale2language_tag("en_US.ISO8859-1") is "en-US" |
357 | |
358 | locale2language_tag("C") is undef or () |
359 | |
360 | locale2language_tag("POSIX") is undef or () |
361 | |
362 | locale2language_tag("POSIX") is undef or () |
363 | |
364 | I'm not totally sure that locale names map satisfactorily to language |
365 | tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. |
366 | |
367 | The output is untainted. If you don't know what tainting is, |
368 | don't worry about it. |
369 | |
370 | =cut |
371 | |
372 | sub locale2language_tag { |
373 | my $lang = |
374 | $_[0] =~ m/(.+)/ # to make for an untainted result |
375 | ? $1 : '' |
376 | ; |
377 | |
378 | return $lang if &is_language_tag($lang); # like "en" |
379 | |
380 | $lang =~ tr<_><->; # "en_US" -> en-US |
381 | $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US |
382 | |
383 | return $lang if &is_language_tag($lang); |
384 | |
385 | return; |
386 | } |
387 | |
388 | ########################################################################### |
389 | |
390 | =item * the function encode_language_tag($lang1) |
391 | |
392 | This function, if given a language tag, returns an encoding of it such |
393 | that: |
394 | |
395 | * tags representing different languages never get the same encoding. |
396 | |
397 | * tags representing the same language always get the same encoding. |
398 | |
399 | * an encoding of a formally valid language tag always is a string |
400 | value that is defined, has length, and is true if considered as a |
401 | boolean. |
402 | |
403 | Note that the encoding itself is B<not> a formally valid language tag. |
404 | Note also that you cannot, currently, go from an encoding back to a |
405 | language tag that it's an encoding of. |
406 | |
407 | Note also that you B<must> consider the encoded value as atomic; i.e., |
408 | you should not consider it as anything but an opaque, unanalysable |
409 | string value. (The internals of the encoding method may change in |
410 | future versions, as the language tagging standard changes over time.) |
411 | |
412 | C<encode_language_tag> returns undef if given anything other than a |
413 | formally valid language tag. |
414 | |
415 | The reason C<encode_language_tag> exists is because different language |
416 | tags may represent the same language; this is normally treatable with |
417 | C<same_language_tag>, but consider this situation: |
418 | |
419 | You have a data file that expresses greetings in different languages. |
420 | Its format is "[language tag]=[how to say 'Hello']", like: |
421 | |
422 | en-US=Hiho |
423 | fr=Bonjour |
424 | i-mingo=Hau' |
425 | |
426 | And suppose you write a program that reads that file and then runs as |
427 | a daemon, answering client requests that specify a language tag and |
428 | then expect the string that says how to greet in that language. So an |
429 | interaction looks like: |
430 | |
431 | greeting-client asks: fr |
432 | greeting-server answers: Bonjour |
433 | |
434 | So far so good. But suppose the way you're implementing this is: |
435 | |
436 | my %greetings; |
437 | die unless open(IN, "<in.dat"); |
438 | while(<IN>) { |
439 | chomp; |
440 | next unless /^([^=]+)=(.+)/s; |
441 | my($lang, $expr) = ($1, $2); |
442 | $greetings{$lang} = $expr; |
443 | } |
444 | close(IN); |
445 | |
446 | at which point %greetings has the contents: |
447 | |
448 | "en-US" => "Hiho" |
449 | "fr" => "Bonjour" |
450 | "i-mingo" => "Hau'" |
451 | |
452 | And suppose then that you answer client requests for language $wanted |
453 | by just looking up $greetings{$wanted}. |
454 | |
455 | If the client asks for "fr", that will look up successfully in |
456 | %greetings, to the value "Bonjour". And if the client asks for |
457 | "i-mingo", that will look up successfully in %greetings, to the value |
458 | "Hau'". |
459 | |
460 | But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the |
461 | lookup in %greetings fails. That's the Wrong Thing. |
462 | |
463 | You could instead do lookups on $wanted with: |
464 | |
465 | use I18N::LangTags qw(same_language_tag); |
466 | my $repsonse = ''; |
467 | foreach my $l2 (keys %greetings) { |
468 | if(same_language_tag($wanted, $l2)) { |
469 | $response = $greetings{$l2}; |
470 | last; |
471 | } |
472 | } |
473 | |
474 | But that's rather inefficient. A better way to do it is to start your |
475 | program with: |
476 | |
477 | use I18N::LangTags qw(encode_language_tag); |
478 | my %greetings; |
479 | die unless open(IN, "<in.dat"); |
480 | while(<IN>) { |
481 | chomp; |
482 | next unless /^([^=]+)=(.+)/s; |
483 | my($lang, $expr) = ($1, $2); |
484 | $greetings{ |
485 | encode_language_tag($lang) |
486 | } = $expr; |
487 | } |
488 | close(IN); |
489 | |
490 | and then just answer client requests for language $wanted by just |
491 | looking up |
492 | |
493 | $greetings{encode_language_tag($wanted)} |
494 | |
495 | And that does the Right Thing. |
496 | |
497 | =cut |
498 | |
499 | sub encode_language_tag { |
500 | # Only similarity_language_tag() is allowed to analyse encodings! |
501 | |
502 | ## Changes in the language tagging standards may have to be reflected here. |
503 | |
504 | my($tag) = uc($_[0]); # smash case |
505 | return undef unless &is_language_tag($tag); |
506 | # If it's not a language tag, its encoding is undef |
507 | |
508 | $tag =~ s/^[xiXI]-//s; |
509 | # Just lop off any leading "x/i-" |
510 | # Or I suppose I could do s/^[xiXI]-/_/s or something. |
511 | |
512 | return "~$tag"; |
513 | } |
514 | |
515 | #-------------------------------------------------------------------------- |
516 | |
517 | =item * the function alternate_language_tags($lang1) |
518 | |
519 | This function, if given a language tag, returns all language tags that |
520 | are alternate forms of this language tag. (There is little |
521 | alternation in the C<current> language tagging formalism, but |
522 | extensions to the formalism are under consideration which could add a |
523 | great deal of alternation.) |
524 | |
525 | Examples from the current formalism: |
526 | |
527 | alternate_language_tags('en') is () |
528 | alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') |
529 | alternate_language_tags('x-klikitat') is ('i-klikitat') |
530 | alternate_language_tags('i-klikitat') is ('x-klikitat') |
531 | |
532 | This function returns undef if given anything other than a formally |
533 | valid language tag. |
534 | |
535 | =cut |
536 | |
537 | my %alt = qw( i x x i I X X I ); |
538 | sub alternate_language_tags { |
539 | ## Changes in the language tagging standards may have to be reflected here. |
540 | my $tag = $_[0]; |
541 | return() unless &is_language_tag($tag); |
542 | |
543 | # might as well preserve case |
544 | |
545 | if($tag =~ /^([XIxi])(-.+)/) { |
546 | # This handles all the alternation that exists CURRENTLY |
547 | return($alt{$1} . $2); |
548 | } |
549 | return(); |
550 | } |
551 | |
552 | ########################################################################### |
553 | |
554 | =back |
555 | |
556 | =head1 ABOUT LOWERCASING |
557 | |
558 | I've considered making all the above functions that output language |
559 | tags return all those tags strictly in lowercase. Having all your |
560 | language tags in lowercase does make some things easier. But you |
561 | might as well just lowercase as you like, or call |
562 | C<encode_language_tag($lang1)> where appropriate. |
563 | |
564 | =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS |
565 | |
566 | In some future version of I18N::LangTags, I plan to include support |
567 | for RFC2482-style language tags -- which are basically just normal |
568 | language tags with their ASCII characters shifted into Plane 14. |
569 | |
570 | =head1 SEE ALSO |
571 | |
572 | * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the |
573 | Identification of Languages". (Obsoletes RFC 1766) |
574 | |
575 | * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on |
576 | Character Sets and Languages". |
577 | |
578 | * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter |
579 | Value and Encoded Word Extensions: Character Sets, Languages, and |
580 | Continuations". |
581 | |
582 | * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, |
583 | "Language Tagging in Unicode Plain Text". |
584 | |
585 | * Locale::Codes, in |
586 | C<http://www.perl.com/CPAN/modules/by-module/Locale/> |
587 | |
588 | * ISO 639, "Code for the representation of names of languages", |
589 | C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html> |
590 | |
591 | * ISO 639-2, "Codes for the representation of names of languages", |
592 | including three-letter codes, |
593 | C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html> |
594 | |
595 | * The IANA list of registered languages (hopefully up-to-date), |
596 | C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/> |
597 | |
598 | =head1 COPYRIGHT |
599 | |
600 | Copyright (c) 1998-2001 Sean M. Burke. All rights reserved. |
601 | |
602 | This library is free software; you can redistribute it and/or |
603 | modify it under the same terms as Perl itself. |
604 | |
605 | The programs and documentation in this dist are distributed in |
606 | the hope that they will be useful, but without any warranty; without |
607 | even the implied warranty of merchantability or fitness for a |
608 | particular purpose. |
609 | |
610 | =head1 AUTHOR |
611 | |
612 | Sean M. Burke C<sburke@cpan.org> |
613 | |
614 | =cut |
615 | |
616 | 1; |
617 | |
618 | __END__ |