lib/I18N/LangTags.pm

   1
   2 # Time-stamp: "2001-05-27 19:53:11 MDT"
   3 # Sean M. Burke <sburke@cpan.org>
   4
   5 require 5.000;
   6 package I18N::LangTags;
   7 use strict;
   8 use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug
   9 require Exporter;
  10 # $Debug = 0;
  11 @ISA = qw(Exporter);
  12 @EXPORT = qw();
  13 @EXPORT_OK = qw(is_language_tag same_language_tag
  14                 extract_language_tags super_languages
  15                 similarity_language_tag is_dialect_of
  16                 locale2language_tag alternate_language_tags
  17                 encode_language_tag
  18                );
  19
  20 $VERSION = "0.22";
  21
  22 =head1 NAME
  23
  24 I18N::LangTags - functions for dealing with RFC3066-style language tags
  25
  26 =head1 SYNOPSIS
  27
  28     use I18N::LangTags qw(is_language_tag same_language_tag
  29                           extract_language_tags super_languages
  30                           similarity_language_tag is_dialect_of
  31                           locale2language_tag alternate_language_tags
  32                           encode_language_tag
  33                          );
  34
  35 ...or whatever of those functions you want to import.  Those are
  36 all the exportable functions -- you're free to import only some,
  37 or none at all.  By default, none are imported.
  38
  39 If you don't import any of these functions, assume a C<&I18N::LangTags::>
  40 in front of all the function names in the following examples.
  41
  42 =head1 DESCRIPTION
  43
  44 Language tags are a formalism, described in RFC 3066 (obsoleting
  45 1766), for declaring what language form (language and possibly
  46 dialect) a given chunk of information is in.
  47
  48 This library provides functions for common tasks involving language
  49 tags as they are needed in a variety of protocols and applications.
  50
  51 Please see the "See Also" references for a thorough explanation
  52 of how to correctly use language tags.
  53
  54 =over
  55
  56 =cut
  57
  58 ###########################################################################
  59
  60 =item * the function is_language_tag($lang1)
  61
  62 Returns true iff $lang1 is a formally valid language tag.
  63
  64    is_language_tag("fr")            is TRUE
  65    is_language_tag("x-jicarilla")   is FALSE
  66        (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
  67
  68    is_language_tag("sgn-US")    is TRUE
  69        (That's American Sign Language)
  70
  71    is_language_tag("i-Klikitat")    is TRUE
  72        (True without regard to the fact noone has actually
  73         registered Klikitat -- it's a formally valid tag)
  74
  75    is_language_tag("fr-patois")     is TRUE
  76        (Formally valid -- altho descriptively weak!)
  77
  78    is_language_tag("Spanish")       is FALSE
  79    is_language_tag("french-patois") is FALSE
  80        (No good -- first subtag has to match
  81         /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
  82
  83    is_language_tag("x-borg-prot2532") is TRUE
  84        (Yes, subtags can contain digits, as of RFC3066)
  85
  86 =cut
  87
  88 sub is_language_tag {
  89
  90   ## Changes in the language tagging standards may have to be reflected here.
  91
  92   my($tag) = lc($_[0]);
  93
  94   return 0 if $tag eq "i" or $tag eq "x";
  95   # Bad degenerate cases the following
  96   #  regexp would erroneously let pass
  97
  98   return $tag =~
  99     /^(?:  # First subtag
 100          [xi] | [a-z]{2,3}
 101       )
 102       (?:  # Subtags thereafter
 103          -           # separator
 104          [a-z0-9]{1,8}  # subtag
 105       )*
 106     $/xs ? 1 : 0;
 107 }
 108
 109 ###########################################################################
 110
 111 =item * the function extract_language_tags($whatever)
 112
 113 Returns a list of whatever looks like formally valid language tags
 114 in $whatever.  Not very smart, so don't get too creative with
 115 what you want to feed it.
 116
 117   extract_language_tags("fr, fr-ca, i-mingo")
 118     returns:   ('fr', 'fr-ca', 'i-mingo')
 119
 120   extract_language_tags("It's like this: I'm in fr -- French!")
 121     returns:   ('It', 'in', 'fr')
 122   (So don't just feed it any old thing.)
 123
 124 The output is untainted.  If you don't know what tainting is,
 125 don't worry about it.
 126
 127 =cut
 128
 129 sub extract_language_tags {
 130
 131   ## Changes in the language tagging standards may have to be reflected here.
 132
 133   my($text) =
 134     $_[0] =~ m/(.+)/  # to make for an untainted result
 135     ? $1 : ''
 136   ;
 137
 138   return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
 139     $text =~
 140     m/
 141       \b
 142       (?:  # First subtag
 143          [iIxX] | [a-zA-Z]{2,3}
 144       )
 145       (?:  # Subtags thereafter
 146          -           # separator
 147          [a-zA-Z0-9]{1,8}  # subtag
 148       )*
 149       \b
 150     /xsg
 151   );
 152 }
 153
 154 ###########################################################################
 155
 156 =item * the function same_language_tag($lang1, $lang2)
 157
 158 Returns true iff $lang1 and $lang2 are acceptable variant tags
 159 representing the same language-form.
 160
 161    same_language_tag('x-kadara', 'i-kadara')  is TRUE
 162       (The x/i- alternation doesn't matter)
 163    same_language_tag('X-KADARA', 'i-kadara')  is TRUE
 164       (...and neither does case)
 165    same_language_tag('en',       'en-US')     is FALSE
 166       (all-English is not the SAME as US English)
 167    same_language_tag('x-kadara', 'x-kadar')   is FALSE
 168       (these are totally unrelated tags)
 169
 170 C<same_language_tag> works by just seeing whether
 171 C<encode_language_tag($lang1)> is the same as
 172 C<encode_language_tag($lang2)>.
 173
 174 (Yes, I know this function is named a bit oddly.  Call it historic
 175 reasons.)
 176
 177 =cut
 178
 179 sub same_language_tag {
 180   my $el1 = &encode_language_tag($_[0]);
 181   return 0 unless defined $el1;
 182    # this avoids the problem of
 183    # encode_language_tag($lang1) eq and encode_language_tag($lang2)
 184    # being true if $lang1 and $lang2 are both undef
 185
 186   return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
 187 }
 188
 189 ###########################################################################
 190
 191 =item * the function similarity_language_tag($lang1, $lang2)
 192
 193 Returns an integer representing the degree of similarity between
 194 tags $lang1 and $lang2 (the order of which does not matter), where
 195 similarity is the number of common elements on the left,
 196 without regard to case and to x/i- alternation.
 197
 198    similarity_language_tag('fr', 'fr-ca')           is 1
 199       (one element in common)
 200    similarity_language_tag('fr-ca', 'fr-FR')        is 1
 201       (one element in common)
 202
 203    similarity_language_tag('fr-CA-joual',
 204                            'fr-CA-PEI')             is 2
 205    similarity_language_tag('fr-CA-joual', 'fr-CA')  is 2
 206       (two elements in common)
 207
 208    similarity_language_tag('x-kadara', 'i-kadara')  is 1
 209       (x/i- doesn't matter)
 210
 211    similarity_language_tag('en',       'x-kadar')   is 0
 212    similarity_language_tag('x-kadara', 'x-kadar')   is 0
 213       (unrelated tags -- no similarity)
 214
 215    similarity_language_tag('i-cree-syllabic',
 216                            'i-cherokee-syllabic')   is 0
 217       (no B<leftmost> elements in common!)
 218
 219 =cut
 220
 221 sub similarity_language_tag {
 222   my $lang1 = &encode_language_tag($_[0]);
 223   my $lang2 = &encode_language_tag($_[1]);
 224
 225   # NB: (i-sil-...)?  (i-sgn-...)?
 226
 227   return undef if !defined($lang1) and !defined($lang2);
 228   return 0 if !defined($lang1) or !defined($lang2);
 229
 230   my @l1_subtags = split('-', $lang1);
 231   my @l2_subtags = split('-', $lang2);
 232   my $similarity = 0;
 233
 234   while(@l1_subtags and @l2_subtags) {
 235     if(shift(@l1_subtags) eq shift(@l2_subtags)) {
 236       ++$similarity;
 237     } else {
 238       last;
 239     }
 240   }
 241   return $similarity;
 242 }
 243
 244 ###########################################################################
 245
 246 =item * the function is_dialect_of($lang1, $lang2)
 247
 248 Returns true iff language tag $lang1 represents a subdialect of
 249 language tag $lang2.
 250
 251 B<Get the order right!  It doesn't work the other way around!>
 252
 253    is_dialect_of('en-US', 'en')            is TRUE
 254      (American English IS a dialect of all-English)
 255
 256    is_dialect_of('fr-CA-joual', 'fr-CA')   is TRUE
 257    is_dialect_of('fr-CA-joual', 'fr')      is TRUE
 258      (Joual is a dialect of (a dialect of) French)
 259
 260    is_dialect_of('en', 'en-US')            is FALSE
 261      (all-English is a NOT dialect of American English)
 262
 263    is_dialect_of('fr', 'en-CA')            is FALSE
 264
 265    is_dialect_of('en',    'en'   )            is TRUE
 266    is_dialect_of('en-US', 'en-US')            is TRUE
 267      (B<Note:> these are degenerate cases)
 268
 269    is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
 270      (the x/i thing doesn't matter, nor does case)
 271
 272 =cut
 273
 274 sub is_dialect_of {
 275
 276   my $lang1 = &encode_language_tag($_[0]);
 277   my $lang2 = &encode_language_tag($_[1]);
 278
 279   return undef if !defined($lang1) and !defined($lang2);
 280   return 0 if !defined($lang1) or !defined($lang2);
 281
 282   return 1 if $lang1 eq $lang2;
 283   return 0 if length($lang1) < length($lang2);
 284
 285   $lang1 .= '-';
 286   $lang2 .= '-';
 287   return
 288     (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
 289 }
 290
 291 ###########################################################################
 292
 293 =item * the function super_languages($lang1)
 294
 295 Returns a list of language tags that are superordinate tags to $lang1
 296 -- it gets this by removing subtags from the end of $lang1 until
 297 nothing (or just "i" or "x") is left.
 298
 299    super_languages("fr-CA-joual")  is  ("fr-CA", "fr")
 300
 301    super_languages("en-AU")  is  ("en")
 302
 303    super_languages("en")  is  empty-list, ()
 304
 305    super_languages("i-cherokee")  is  empty-list, ()
 306     ...not ("i"), which would be illegal as well as pointless.
 307
 308 If $lang1 is not a valid language tag, returns empty-list in
 309 a list context, undef in a scalar context.
 310
 311 A notable and rather unavoidable problem with this method:
 312 "x-mingo-tom" has an "x" because the whole tag isn't an
 313 IANA-registered tag -- but super_languages('x-mingo-tom') is
 314 ('x-mingo') -- which isn't really right, since 'i-mingo' is
 315 registered.  But this module has no way of knowing that.  (But note
 316 that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
 317
 318 More importantly, you assume I<at your peril> that superordinates of
 319 $lang1 are mutually intelligible with $lang1.  Consider this
 320 carefully.
 321
 322 =cut
 323
 324 sub super_languages {
 325   my $lang1 = $_[0];
 326   return() unless defined($lang1) && &is_language_tag($lang1);
 327   my @l1_subtags = split('-', $lang1);
 328
 329   ## Changes in the language tagging standards may have to be reflected here.
 330
 331   # NB: (i-sil-...)?
 332
 333   my @supers = ();
 334   foreach my $bit (@l1_subtags) {
 335     push @supers,
 336       scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
 337   }
 338   pop @supers if @supers;
 339   shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
 340   return reverse @supers;
 341 }
 342
 343 ###########################################################################
 344
 345 =item * the function locale2language_tag($locale_identifier)
 346
 347 This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
 348 and maps it to a language tag.  If it's not mappable (as with,
 349 notably, "C" and "POSIX"), this returns empty-list in a list context,
 350 or undef in a scalar context.
 351
 352    locale2language_tag("en") is "en"
 353
 354    locale2language_tag("en_US") is "en-US"
 355
 356    locale2language_tag("en_US.ISO8859-1") is "en-US"
 357
 358    locale2language_tag("C") is undef or ()
 359
 360    locale2language_tag("POSIX") is undef or ()
 361
 362    locale2language_tag("POSIX") is undef or ()
 363
 364 I'm not totally sure that locale names map satisfactorily to language
 365 tags.  Think REAL hard about how you use this.  YOU HAVE BEEN WARNED.
 366
 367 The output is untainted.  If you don't know what tainting is,
 368 don't worry about it.
 369
 370 =cut
 371
 372 sub locale2language_tag {
 373   my $lang =
 374     $_[0] =~ m/(.+)/  # to make for an untainted result
 375     ? $1 : ''
 376   ;
 377
 378   return $lang if &is_language_tag($lang); # like "en"
 379
 380   $lang =~ tr<_><->;  # "en_US" -> en-US
 381   $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s;  # "en_US.ISO8859-1" -> en-US
 382
 383   return $lang if &is_language_tag($lang);
 384
 385   return;
 386 }
 387
 388 ###########################################################################
 389
 390 =item * the function encode_language_tag($lang1)
 391
 392 This function, if given a language tag, returns an encoding of it such
 393 that:
 394
 395 * tags representing different languages never get the same encoding.
 396
 397 * tags representing the same language always get the same encoding.
 398
 399 * an encoding of a formally valid language tag always is a string
 400 value that is defined, has length, and is true if considered as a
 401 boolean.
 402
 403 Note that the encoding itself is B<not> a formally valid language tag.
 404 Note also that you cannot, currently, go from an encoding back to a
 405 language tag that it's an encoding of.
 406
 407 Note also that you B<must> consider the encoded value as atomic; i.e.,
 408 you should not consider it as anything but an opaque, unanalysable
 409 string value.  (The internals of the encoding method may change in
 410 future versions, as the language tagging standard changes over time.)
 411
 412 C<encode_language_tag> returns undef if given anything other than a
 413 formally valid language tag.
 414
 415 The reason C<encode_language_tag> exists is because different language
 416 tags may represent the same language; this is normally treatable with
 417 C<same_language_tag>, but consider this situation:
 418
 419 You have a data file that expresses greetings in different languages.
 420 Its format is "[language tag]=[how to say 'Hello']", like:
 421
 422           en-US=Hiho
 423           fr=Bonjour
 424           i-mingo=Hau'
 425
 426 And suppose you write a program that reads that file and then runs as
 427 a daemon, answering client requests that specify a language tag and
 428 then expect the string that says how to greet in that language.  So an
 429 interaction looks like:
 430
 431           greeting-client asks:    fr
 432           greeting-server answers: Bonjour
 433
 434 So far so good.  But suppose the way you're implementing this is:
 435
 436           my %greetings;
 437           die unless open(IN, "<in.dat");
 438           while(<IN>) {
 439             chomp;
 440             next unless /^([^=]+)=(.+)/s;
 441             my($lang, $expr) = ($1, $2);
 442             $greetings{$lang} = $expr;
 443           }
 444           close(IN);
 445
 446 at which point %greetings has the contents:
 447
 448           "en-US"   => "Hiho"
 449           "fr"      => "Bonjour"
 450           "i-mingo" => "Hau'"
 451
 452 And suppose then that you answer client requests for language $wanted
 453 by just looking up $greetings{$wanted}.
 454
 455 If the client asks for "fr", that will look up successfully in
 456 %greetings, to the value "Bonjour".  And if the client asks for
 457 "i-mingo", that will look up successfully in %greetings, to the value
 458 "Hau'".
 459
 460 But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
 461 lookup in %greetings fails.  That's the Wrong Thing.
 462
 463 You could instead do lookups on $wanted with:
 464
 465           use I18N::LangTags qw(same_language_tag);
 466           my $repsonse = '';
 467           foreach my $l2 (keys %greetings) {
 468             if(same_language_tag($wanted, $l2)) {
 469               $response = $greetings{$l2};
 470               last;
 471             }
 472           }
 473
 474 But that's rather inefficient.  A better way to do it is to start your
 475 program with:
 476
 477           use I18N::LangTags qw(encode_language_tag);
 478           my %greetings;
 479           die unless open(IN, "<in.dat");
 480           while(<IN>) {
 481             chomp;
 482             next unless /^([^=]+)=(.+)/s;
 483             my($lang, $expr) = ($1, $2);
 484             $greetings{
 485                         encode_language_tag($lang)
 486                       } = $expr;
 487           }
 488           close(IN);
 489
 490 and then just answer client requests for language $wanted by just
 491 looking up
 492
 493           $greetings{encode_language_tag($wanted)}
 494
 495 And that does the Right Thing.
 496
 497 =cut
 498
 499 sub encode_language_tag {
 500   # Only similarity_language_tag() is allowed to analyse encodings!
 501
 502   ## Changes in the language tagging standards may have to be reflected here.
 503
 504   my($tag) = uc($_[0]); # smash case
 505   return undef unless &is_language_tag($tag);
 506    # If it's not a language tag, its encoding is undef
 507
 508   $tag =~ s/^[xiXI]-//s;
 509    # Just lop off any leading "x/i-"
 510    # Or I suppose I could do s/^[xiXI]-/_/s or something.
 511
 512   return "~$tag";
 513 }
 514
 515 #--------------------------------------------------------------------------
 516
 517 =item * the function alternate_language_tags($lang1)
 518
 519 This function, if given a language tag, returns all language tags that
 520 are alternate forms of this language tag.  (There is little
 521 alternation in the C<current> language tagging formalism, but
 522 extensions to the formalism are under consideration which could add a
 523 great deal of alternation.)
 524
 525 Examples from the current formalism:
 526
 527           alternate_language_tags('en')           is   ()
 528           alternate_language_tags('x-mingo-tom')  is   ('i-mingo-tom')
 529           alternate_language_tags('x-klikitat')   is   ('i-klikitat')
 530           alternate_language_tags('i-klikitat')   is   ('x-klikitat')
 531
 532 This function returns undef if given anything other than a formally
 533 valid language tag.
 534
 535 =cut
 536
 537 my %alt = qw( i x   x i   I X   X I );
 538 sub alternate_language_tags {
 539   ## Changes in the language tagging standards may have to be reflected here.
 540   my $tag = $_[0];
 541   return() unless &is_language_tag($tag);
 542
 543  # might as well preserve case
 544
 545   if($tag =~ /^([XIxi])(-.+)/) {
 546     # This handles all the alternation that exists CURRENTLY
 547     return($alt{$1} . $2);
 548   }
 549   return();
 550 }
 551
 552 ###########################################################################
 553
 554 =back
 555
 556 =head1 ABOUT LOWERCASING
 557
 558 I've considered making all the above functions that output language
 559 tags return all those tags strictly in lowercase.  Having all your
 560 language tags in lowercase does make some things easier.  But you
 561 might as well just lowercase as you like, or call
 562 C<encode_language_tag($lang1)> where appropriate.
 563
 564 =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
 565
 566 In some future version of I18N::LangTags, I plan to include support
 567 for RFC2482-style language tags -- which are basically just normal
 568 language tags with their ASCII characters shifted into Plane 14.
 569
 570 =head1 SEE ALSO
 571
 572 * L<I18N::LangTags::List|I18N::LangTags::List>
 573
 574 * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
 575 Identification of Languages".  (Obsoletes RFC 1766)
 576
 577 * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
 578 Character Sets and Languages".
 579
 580 * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
 581 Value and Encoded Word Extensions: Character Sets, Languages, and
 582 Continuations".
 583
 584 * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
 585 "Language Tagging in Unicode Plain Text".
 586
 587 * Locale::Codes, in
 588 C<http://www.perl.com/CPAN/modules/by-module/Locale/>
 589
 590 * ISO 639, "Code for the representation of names of languages",
 591 C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>
 592
 593 * ISO 639-2, "Codes for the representation of names of languages",
 594 including three-letter codes,
 595 C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>
 596
 597 * The IANA list of registered languages (hopefully up-to-date),
 598 C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>
 599
 600 =head1 COPYRIGHT
 601
 602 Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.
 603
 604 This library is free software; you can redistribute it and/or
 605 modify it under the same terms as Perl itself.
 606
 607 The programs and documentation in this dist are distributed in
 608 the hope that they will be useful, but without any warranty; without
 609 even the implied warranty of merchantability or fitness for a
 610 particular purpose.
 611
 612 =head1 AUTHOR
 613
 614 Sean M. Burke C<sburke@cpan.org>
 615
 616 =cut
 617
 618 1;
 619
 620 __END__