Regen headers.
[p5sagit/p5-mst-13.2.git] / lib / I18N / LangTags.pm
CommitLineData
4b053158 1
e7525a17 2# Time-stamp: "2001-05-27 19:53:11 MDT"
4b053158 3# Sean M. Burke <sburke@cpan.org>
4
5require 5.000;
6package I18N::LangTags;
7use strict;
8use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug
9require Exporter;
10# $Debug = 0;
11@ISA = qw(Exporter);
12@EXPORT = qw();
13@EXPORT_OK = qw(is_language_tag same_language_tag
14 extract_language_tags super_languages
15 similarity_language_tag is_dialect_of
16 locale2language_tag alternate_language_tags
17 encode_language_tag
18 );
19
e7525a17 20$VERSION = "0.22";
4b053158 21
22=head1 NAME
23
24I18N::LangTags - functions for dealing with RFC3066-style language tags
25
26=head1 SYNOPSIS
27
28 use I18N::LangTags qw(is_language_tag same_language_tag
29 extract_language_tags super_languages
30 similarity_language_tag is_dialect_of
31 locale2language_tag alternate_language_tags
32 encode_language_tag
33 );
34
35...or whatever of those functions you want to import. Those are
36all the exportable functions -- you're free to import only some,
37or none at all. By default, none are imported.
38
39If you don't import any of these functions, assume a C<&I18N::LangTags::>
40in front of all the function names in the following examples.
41
42=head1 DESCRIPTION
43
44Language tags are a formalism, described in RFC 3066 (obsoleting
451766), for declaring what language form (language and possibly
46dialect) a given chunk of information is in.
47
48This library provides functions for common tasks involving language
49tags as they are needed in a variety of protocols and applications.
50
51Please see the "See Also" references for a thorough explanation
52of how to correctly use language tags.
53
54=over
55
56=cut
57
58###########################################################################
59
60=item * the function is_language_tag($lang1)
61
62Returns true iff $lang1 is a formally valid language tag.
63
64 is_language_tag("fr") is TRUE
65 is_language_tag("x-jicarilla") is FALSE
66 (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
67
68 is_language_tag("sgn-US") is TRUE
69 (That's American Sign Language)
70
71 is_language_tag("i-Klikitat") is TRUE
72 (True without regard to the fact noone has actually
73 registered Klikitat -- it's a formally valid tag)
74
75 is_language_tag("fr-patois") is TRUE
76 (Formally valid -- altho descriptively weak!)
77
78 is_language_tag("Spanish") is FALSE
79 is_language_tag("french-patois") is FALSE
80 (No good -- first subtag has to match
81 /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
82
83 is_language_tag("x-borg-prot2532") is TRUE
84 (Yes, subtags can contain digits, as of RFC3066)
85
86=cut
87
88sub is_language_tag {
89
90 ## Changes in the language tagging standards may have to be reflected here.
91
92 my($tag) = lc($_[0]);
93
94 return 0 if $tag eq "i" or $tag eq "x";
95 # Bad degenerate cases the following
96 # regexp would erroneously let pass
97
98 return $tag =~
99 /^(?: # First subtag
100 [xi] | [a-z]{2,3}
101 )
102 (?: # Subtags thereafter
103 - # separator
104 [a-z0-9]{1,8} # subtag
105 )*
106 $/xs ? 1 : 0;
107}
108
109###########################################################################
110
111=item * the function extract_language_tags($whatever)
112
113Returns a list of whatever looks like formally valid language tags
114in $whatever. Not very smart, so don't get too creative with
115what you want to feed it.
116
117 extract_language_tags("fr, fr-ca, i-mingo")
118 returns: ('fr', 'fr-ca', 'i-mingo')
119
120 extract_language_tags("It's like this: I'm in fr -- French!")
121 returns: ('It', 'in', 'fr')
122 (So don't just feed it any old thing.)
123
124The output is untainted. If you don't know what tainting is,
125don't worry about it.
126
127=cut
128
129sub extract_language_tags {
130
131 ## Changes in the language tagging standards may have to be reflected here.
132
133 my($text) =
134 $_[0] =~ m/(.+)/ # to make for an untainted result
135 ? $1 : ''
136 ;
137
138 return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
139 $text =~
140 m/
141 \b
142 (?: # First subtag
143 [iIxX] | [a-zA-Z]{2,3}
144 )
145 (?: # Subtags thereafter
146 - # separator
147 [a-zA-Z0-9]{1,8} # subtag
148 )*
149 \b
150 /xsg
151 );
152}
153
154###########################################################################
155
156=item * the function same_language_tag($lang1, $lang2)
157
158Returns true iff $lang1 and $lang2 are acceptable variant tags
159representing the same language-form.
160
161 same_language_tag('x-kadara', 'i-kadara') is TRUE
162 (The x/i- alternation doesn't matter)
163 same_language_tag('X-KADARA', 'i-kadara') is TRUE
164 (...and neither does case)
165 same_language_tag('en', 'en-US') is FALSE
166 (all-English is not the SAME as US English)
167 same_language_tag('x-kadara', 'x-kadar') is FALSE
168 (these are totally unrelated tags)
169
170C<same_language_tag> works by just seeing whether
171C<encode_language_tag($lang1)> is the same as
172C<encode_language_tag($lang2)>.
173
174(Yes, I know this function is named a bit oddly. Call it historic
175reasons.)
176
177=cut
178
179sub same_language_tag {
180 my $el1 = &encode_language_tag($_[0]);
181 return 0 unless defined $el1;
182 # this avoids the problem of
183 # encode_language_tag($lang1) eq and encode_language_tag($lang2)
184 # being true if $lang1 and $lang2 are both undef
185
186 return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
187}
188
189###########################################################################
190
191=item * the function similarity_language_tag($lang1, $lang2)
192
193Returns an integer representing the degree of similarity between
194tags $lang1 and $lang2 (the order of which does not matter), where
195similarity is the number of common elements on the left,
196without regard to case and to x/i- alternation.
197
198 similarity_language_tag('fr', 'fr-ca') is 1
199 (one element in common)
200 similarity_language_tag('fr-ca', 'fr-FR') is 1
201 (one element in common)
202
203 similarity_language_tag('fr-CA-joual',
204 'fr-CA-PEI') is 2
205 similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
206 (two elements in common)
207
208 similarity_language_tag('x-kadara', 'i-kadara') is 1
209 (x/i- doesn't matter)
210
211 similarity_language_tag('en', 'x-kadar') is 0
212 similarity_language_tag('x-kadara', 'x-kadar') is 0
213 (unrelated tags -- no similarity)
214
215 similarity_language_tag('i-cree-syllabic',
216 'i-cherokee-syllabic') is 0
217 (no B<leftmost> elements in common!)
218
219=cut
220
221sub similarity_language_tag {
222 my $lang1 = &encode_language_tag($_[0]);
223 my $lang2 = &encode_language_tag($_[1]);
224
225 # NB: (i-sil-...)? (i-sgn-...)?
226
227 return undef if !defined($lang1) and !defined($lang2);
228 return 0 if !defined($lang1) or !defined($lang2);
229
230 my @l1_subtags = split('-', $lang1);
231 my @l2_subtags = split('-', $lang2);
232 my $similarity = 0;
233
234 while(@l1_subtags and @l2_subtags) {
235 if(shift(@l1_subtags) eq shift(@l2_subtags)) {
236 ++$similarity;
237 } else {
238 last;
239 }
240 }
241 return $similarity;
242}
243
244###########################################################################
245
246=item * the function is_dialect_of($lang1, $lang2)
247
248Returns true iff language tag $lang1 represents a subdialect of
249language tag $lang2.
250
251B<Get the order right! It doesn't work the other way around!>
252
253 is_dialect_of('en-US', 'en') is TRUE
254 (American English IS a dialect of all-English)
255
256 is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
257 is_dialect_of('fr-CA-joual', 'fr') is TRUE
258 (Joual is a dialect of (a dialect of) French)
259
260 is_dialect_of('en', 'en-US') is FALSE
261 (all-English is a NOT dialect of American English)
262
263 is_dialect_of('fr', 'en-CA') is FALSE
264
265 is_dialect_of('en', 'en' ) is TRUE
266 is_dialect_of('en-US', 'en-US') is TRUE
267 (B<Note:> these are degenerate cases)
268
269 is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
270 (the x/i thing doesn't matter, nor does case)
271
272=cut
273
274sub is_dialect_of {
275
276 my $lang1 = &encode_language_tag($_[0]);
277 my $lang2 = &encode_language_tag($_[1]);
278
279 return undef if !defined($lang1) and !defined($lang2);
280 return 0 if !defined($lang1) or !defined($lang2);
281
282 return 1 if $lang1 eq $lang2;
283 return 0 if length($lang1) < length($lang2);
284
285 $lang1 .= '-';
286 $lang2 .= '-';
287 return
288 (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
289}
290
291###########################################################################
292
293=item * the function super_languages($lang1)
294
295Returns a list of language tags that are superordinate tags to $lang1
296-- it gets this by removing subtags from the end of $lang1 until
297nothing (or just "i" or "x") is left.
298
299 super_languages("fr-CA-joual") is ("fr-CA", "fr")
300
301 super_languages("en-AU") is ("en")
302
303 super_languages("en") is empty-list, ()
304
305 super_languages("i-cherokee") is empty-list, ()
306 ...not ("i"), which would be illegal as well as pointless.
307
308If $lang1 is not a valid language tag, returns empty-list in
309a list context, undef in a scalar context.
310
311A notable and rather unavoidable problem with this method:
312"x-mingo-tom" has an "x" because the whole tag isn't an
313IANA-registered tag -- but super_languages('x-mingo-tom') is
314('x-mingo') -- which isn't really right, since 'i-mingo' is
315registered. But this module has no way of knowing that. (But note
316that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
317
318More importantly, you assume I<at your peril> that superordinates of
319$lang1 are mutually intelligible with $lang1. Consider this
320carefully.
321
322=cut
323
324sub super_languages {
325 my $lang1 = $_[0];
326 return() unless defined($lang1) && &is_language_tag($lang1);
327 my @l1_subtags = split('-', $lang1);
328
329 ## Changes in the language tagging standards may have to be reflected here.
330
331 # NB: (i-sil-...)?
332
333 my @supers = ();
334 foreach my $bit (@l1_subtags) {
335 push @supers,
336 scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
337 }
338 pop @supers if @supers;
339 shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
340 return reverse @supers;
341}
342
343###########################################################################
344
345=item * the function locale2language_tag($locale_identifier)
346
347This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
348and maps it to a language tag. If it's not mappable (as with,
349notably, "C" and "POSIX"), this returns empty-list in a list context,
350or undef in a scalar context.
351
352 locale2language_tag("en") is "en"
353
354 locale2language_tag("en_US") is "en-US"
355
356 locale2language_tag("en_US.ISO8859-1") is "en-US"
357
358 locale2language_tag("C") is undef or ()
359
360 locale2language_tag("POSIX") is undef or ()
361
362 locale2language_tag("POSIX") is undef or ()
363
364I'm not totally sure that locale names map satisfactorily to language
365tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
366
367The output is untainted. If you don't know what tainting is,
368don't worry about it.
369
370=cut
371
372sub locale2language_tag {
373 my $lang =
374 $_[0] =~ m/(.+)/ # to make for an untainted result
375 ? $1 : ''
376 ;
377
378 return $lang if &is_language_tag($lang); # like "en"
379
380 $lang =~ tr<_><->; # "en_US" -> en-US
381 $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US
382
383 return $lang if &is_language_tag($lang);
384
385 return;
386}
387
388###########################################################################
389
390=item * the function encode_language_tag($lang1)
391
392This function, if given a language tag, returns an encoding of it such
393that:
394
395* tags representing different languages never get the same encoding.
396
397* tags representing the same language always get the same encoding.
398
399* an encoding of a formally valid language tag always is a string
400value that is defined, has length, and is true if considered as a
401boolean.
402
403Note that the encoding itself is B<not> a formally valid language tag.
404Note also that you cannot, currently, go from an encoding back to a
405language tag that it's an encoding of.
406
407Note also that you B<must> consider the encoded value as atomic; i.e.,
408you should not consider it as anything but an opaque, unanalysable
409string value. (The internals of the encoding method may change in
410future versions, as the language tagging standard changes over time.)
411
412C<encode_language_tag> returns undef if given anything other than a
413formally valid language tag.
414
415The reason C<encode_language_tag> exists is because different language
416tags may represent the same language; this is normally treatable with
417C<same_language_tag>, but consider this situation:
418
419You have a data file that expresses greetings in different languages.
420Its format is "[language tag]=[how to say 'Hello']", like:
421
422 en-US=Hiho
423 fr=Bonjour
424 i-mingo=Hau'
425
426And suppose you write a program that reads that file and then runs as
427a daemon, answering client requests that specify a language tag and
428then expect the string that says how to greet in that language. So an
429interaction looks like:
430
431 greeting-client asks: fr
432 greeting-server answers: Bonjour
433
434So far so good. But suppose the way you're implementing this is:
435
436 my %greetings;
437 die unless open(IN, "<in.dat");
438 while(<IN>) {
439 chomp;
440 next unless /^([^=]+)=(.+)/s;
441 my($lang, $expr) = ($1, $2);
442 $greetings{$lang} = $expr;
443 }
444 close(IN);
445
446at which point %greetings has the contents:
447
448 "en-US" => "Hiho"
449 "fr" => "Bonjour"
450 "i-mingo" => "Hau'"
451
452And suppose then that you answer client requests for language $wanted
453by just looking up $greetings{$wanted}.
454
455If the client asks for "fr", that will look up successfully in
456%greetings, to the value "Bonjour". And if the client asks for
457"i-mingo", that will look up successfully in %greetings, to the value
458"Hau'".
459
460But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
461lookup in %greetings fails. That's the Wrong Thing.
462
463You could instead do lookups on $wanted with:
464
465 use I18N::LangTags qw(same_language_tag);
466 my $repsonse = '';
467 foreach my $l2 (keys %greetings) {
468 if(same_language_tag($wanted, $l2)) {
469 $response = $greetings{$l2};
470 last;
471 }
472 }
473
474But that's rather inefficient. A better way to do it is to start your
475program with:
476
477 use I18N::LangTags qw(encode_language_tag);
478 my %greetings;
479 die unless open(IN, "<in.dat");
480 while(<IN>) {
481 chomp;
482 next unless /^([^=]+)=(.+)/s;
483 my($lang, $expr) = ($1, $2);
484 $greetings{
485 encode_language_tag($lang)
486 } = $expr;
487 }
488 close(IN);
489
490and then just answer client requests for language $wanted by just
491looking up
492
493 $greetings{encode_language_tag($wanted)}
494
495And that does the Right Thing.
496
497=cut
498
499sub encode_language_tag {
500 # Only similarity_language_tag() is allowed to analyse encodings!
501
502 ## Changes in the language tagging standards may have to be reflected here.
503
504 my($tag) = uc($_[0]); # smash case
505 return undef unless &is_language_tag($tag);
506 # If it's not a language tag, its encoding is undef
507
508 $tag =~ s/^[xiXI]-//s;
509 # Just lop off any leading "x/i-"
510 # Or I suppose I could do s/^[xiXI]-/_/s or something.
511
512 return "~$tag";
513}
514
515#--------------------------------------------------------------------------
516
517=item * the function alternate_language_tags($lang1)
518
519This function, if given a language tag, returns all language tags that
520are alternate forms of this language tag. (There is little
521alternation in the C<current> language tagging formalism, but
522extensions to the formalism are under consideration which could add a
523great deal of alternation.)
524
525Examples from the current formalism:
526
527 alternate_language_tags('en') is ()
528 alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
529 alternate_language_tags('x-klikitat') is ('i-klikitat')
530 alternate_language_tags('i-klikitat') is ('x-klikitat')
531
532This function returns undef if given anything other than a formally
533valid language tag.
534
535=cut
536
537my %alt = qw( i x x i I X X I );
538sub alternate_language_tags {
539 ## Changes in the language tagging standards may have to be reflected here.
540 my $tag = $_[0];
541 return() unless &is_language_tag($tag);
542
543 # might as well preserve case
544
545 if($tag =~ /^([XIxi])(-.+)/) {
546 # This handles all the alternation that exists CURRENTLY
547 return($alt{$1} . $2);
548 }
549 return();
550}
551
552###########################################################################
553
554=back
555
556=head1 ABOUT LOWERCASING
557
558I've considered making all the above functions that output language
559tags return all those tags strictly in lowercase. Having all your
560language tags in lowercase does make some things easier. But you
561might as well just lowercase as you like, or call
562C<encode_language_tag($lang1)> where appropriate.
563
564=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
565
566In some future version of I18N::LangTags, I plan to include support
567for RFC2482-style language tags -- which are basically just normal
568language tags with their ASCII characters shifted into Plane 14.
569
570=head1 SEE ALSO
571
e7525a17 572* L<I18N::LangTags::List|I18N::LangTags::List>
573
4b053158 574* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
575Identification of Languages". (Obsoletes RFC 1766)
576
577* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
578Character Sets and Languages".
579
580* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
581Value and Encoded Word Extensions: Character Sets, Languages, and
582Continuations".
583
584* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
585"Language Tagging in Unicode Plain Text".
586
587* Locale::Codes, in
588C<http://www.perl.com/CPAN/modules/by-module/Locale/>
589
590* ISO 639, "Code for the representation of names of languages",
591C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>
592
593* ISO 639-2, "Codes for the representation of names of languages",
594including three-letter codes,
595C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>
596
597* The IANA list of registered languages (hopefully up-to-date),
598C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>
599
600=head1 COPYRIGHT
601
602Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.
603
604This library is free software; you can redistribute it and/or
605modify it under the same terms as Perl itself.
606
607The programs and documentation in this dist are distributed in
608the hope that they will be useful, but without any warranty; without
609even the implied warranty of merchantability or fitness for a
610particular purpose.
611
612=head1 AUTHOR
613
614Sean M. Burke C<sburke@cpan.org>
615
616=cut
617
6181;
619
620__END__