From: Jarkko Hietaniemi Date: Wed, 20 Jun 2001 18:02:18 +0000 (+0000) Subject: Update to I18N::LangTags 0.24, from Sean Burke. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=21aeefd57a5747f2eae308d425d77bf17cddfc0c;p=p5sagit%2Fp5-mst-13.2.git Update to I18N::LangTags 0.24, from Sean Burke. p4raw-id: //depot/perl@10759 --- diff --git a/MANIFEST b/MANIFEST index 0e54da7..0ca611b 100644 --- a/MANIFEST +++ b/MANIFEST @@ -974,7 +974,9 @@ lib/Getopt/Std.t See if Getopt::Std and Getopt::Long work lib/I18N/Collate.pm Routines to do strxfrm-based collation lib/I18N/Collate.t See if I18N::Collate works lib/I18N/LangTags.pm I18N::LangTags -lib/I18N/LangTags/List.pod list of tags for human languages +lib/I18N/LangTags/ChangeLog I18N::LangTags +lib/I18N/LangTags/List.pm List of tags for human languages +lib/I18N/LangTags/README I18N::LangTags lib/I18N/LangTags/test.pl See if I18N::LangTags works lib/IPC/Open2.pm Open a two-ended pipe lib/IPC/Open2.t See if IPC::Open2 works diff --git a/lib/I18N/LangTags.pm b/lib/I18N/LangTags.pm index f5db282..58f2464 100644 --- a/lib/I18N/LangTags.pm +++ b/lib/I18N/LangTags.pm @@ -1,23 +1,23 @@ -# Time-stamp: "2001-05-27 19:53:11 MDT" +# Time-stamp: "2001-06-20 01:28:32 MDT" # Sean M. 
Burke require 5.000; package I18N::LangTags; use strict; -use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug +use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); require Exporter; -# $Debug = 0; @ISA = qw(Exporter); @EXPORT = qw(); @EXPORT_OK = qw(is_language_tag same_language_tag extract_language_tags super_languages similarity_language_tag is_dialect_of locale2language_tag alternate_language_tags - encode_language_tag + encode_language_tag panic_languages ); +%EXPORT_TAGS = ('ALL' => \@EXPORT_OK); -$VERSION = "0.22"; +$VERSION = "0.24"; =head1 NAME @@ -29,12 +29,17 @@ I18N::LangTags - functions for dealing with RFC3066-style language tags extract_language_tags super_languages similarity_language_tag is_dialect_of locale2language_tag alternate_language_tags - encode_language_tag + encode_language_tag panic_languages ); ...or whatever of those functions you want to import. Those are all the exportable functions -- you're free to import only some, -or none at all. By default, none are imported. +or none at all. By default, none are imported. If you say: + + use I18N::LangTags qw(:ALL) + +...then all are exported. (This saves you from having to use +something less obvious like C.) If you don't import any of these functions, assume a C<&I18N::LangTags::> in front of all the function names in the following examples. @@ -92,7 +97,7 @@ sub is_language_tag { my($tag) = lc($_[0]); return 0 if $tag eq "i" or $tag eq "x"; - # Bad degenerate cases the following + # Bad degenerate cases that the following # regexp would erroneously let pass return $tag =~ @@ -166,6 +171,8 @@ representing the same language-form. (all-English is not the SAME as US English) same_language_tag('x-kadara', 'x-kadar') is FALSE (these are totally unrelated tags) + same_language_tag('no-bok', 'nb') is TRUE + (no-bok is a legacy tag for nb (Norwegian Bokmal)) C works by just seeing whether C is the same as @@ -221,7 +228,9 @@ without regard to case and to x/i- alternation. 
sub similarity_language_tag { my $lang1 = &encode_language_tag($_[0]); my $lang2 = &encode_language_tag($_[1]); - + # And encode_language_tag takes care of the whole + # no-nyn==nn, i-hakka==zh-hakka, etc, things + # NB: (i-sil-...)? (i-sgn-...)? return undef if !defined($lang1) and !defined($lang2); @@ -245,7 +254,7 @@ sub similarity_language_tag { =item * the function is_dialect_of($lang1, $lang2) -Returns true iff language tag $lang1 represents a subdialect of +Returns true iff language tag $lang1 represents a subform of language tag $lang2. B @@ -262,13 +271,18 @@ B is_dialect_of('fr', 'en-CA') is FALSE - is_dialect_of('en', 'en' ) is TRUE - is_dialect_of('en-US', 'en-US') is TRUE + is_dialect_of('en', 'en' ) is TRUE + is_dialect_of('en-US', 'en-US') is TRUE (B these are degenerate cases) is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE (the x/i thing doesn't matter, nor does case) + is_dialect_of('nn', 'no') is TRUE + (because 'nn' (New Norse) is aliased to 'no-nyn', + as a special legacy case, and 'no-nyn' is a + subform of 'no' (Norwegian)) + =cut sub is_dialect_of { @@ -324,6 +338,13 @@ carefully. sub super_languages { my $lang1 = $_[0]; return() unless defined($lang1) && &is_language_tag($lang1); + + # a hack for those annoying new (2001) tags: + $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards + $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards + $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way + # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark + my @l1_subtags = split('-', $lang1); ## Changes in the language tagging standards may have to be reflected here. @@ -501,15 +522,29 @@ sub encode_language_tag { ## Changes in the language tagging standards may have to be reflected here. 
- my($tag) = uc($_[0]); # smash case + my($tag) = $_[0] || return undef; return undef unless &is_language_tag($tag); - # If it's not a language tag, its encoding is undef + + # For the moment, these legacy variances are few enough that + # we can just handle them here with regexps. + $tag =~ s/^iw\b/he/i; # Hebrew + $tag =~ s/^in\b/id/i; # Indonesian + $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger + $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo + $tag =~ s/^ji\b/yi/i; # Yiddish + # + # These go FROM the simplex to complex form, to get + # similarity-comparison right. And that's okay, since + # similarity_language_tag is the only thing that + # analyzes our output. + $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka + $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal + $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk $tag =~ s/^[xiXI]-//s; # Just lop off any leading "x/i-" - # Or I suppose I could do s/^[xiXI]-/_/s or something. - return "~$tag"; + return "~" . uc($tag); } #-------------------------------------------------------------------------- @@ -517,39 +552,184 @@ sub encode_language_tag { =item * the function alternate_language_tags($lang1) This function, if given a language tag, returns all language tags that -are alternate forms of this language tag. (There is little -alternation in the C language tagging formalism, but -extensions to the formalism are under consideration which could add a -great deal of alternation.) - -Examples from the current formalism: - - alternate_language_tags('en') is () - alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') - alternate_language_tags('x-klikitat') is ('i-klikitat') - alternate_language_tags('i-klikitat') is ('x-klikitat') - -This function returns undef if given anything other than a formally +are alternate forms of this language tag. (I.e., tags which refer to +the same language.) 
This is meant to handle legacy tags caused by +the minor changes in language tag standards over the years; and +the x-/i- alternation is also dealt with. + +Note that this function does I<not> try to equate new (and never-used, +and unusable) +ISO639-2 three-letter tags to old (and still in use) ISO639-1 +two-letter equivalents -- like "ara" -> "ar" -- because +"ara" has I<never> been in use as an Internet language tag, +and RFC 3066 stipulates that it never should be, since a shorter +tag ("ar") exists. + +Examples: + + alternate_language_tags('no-bok') is ('nb') + alternate_language_tags('nb') is ('no-bok') + alternate_language_tags('he') is ('iw') + alternate_language_tags('iw') is ('he') + alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') + alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') + alternate_language_tags('en') is () + alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') + alternate_language_tags('x-klikitat') is ('i-klikitat') + alternate_language_tags('i-klikitat') is ('x-klikitat') + +This function returns empty-list if given anything other than a formally valid language tag. =cut my %alt = qw( i x x i I X X I ); sub alternate_language_tags { - ## Changes in the language tagging standards may have to be reflected here. my $tag = $_[0]; return() unless &is_language_tag($tag); - # might as well preserve case + my @em; # push 'em real goood! + + # For the moment, these legacy variances are few enough that + # we can just handle them here with regexps. 
+ + if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; + } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; + + } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; + } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; + + } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; + } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; + + } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; + } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; + + } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; + } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; + + } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; + } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; + + } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; + } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; + + } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; + } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; + } + + push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; + return @em; +} + +########################################################################### + +{ + # Init %Panic... + + my @panic = ( # MUST all be lowercase! + # Only large ("national") languages make it in this list. + # If you, as a user, are so bizarre that the /only/ language + # you claim to accept is Galician, then no, we won't do you + # the favor of providing Catalan as a panic-fallback for + # you. Because if I start trying to add "little languages" in + # here, I'll just go crazy. + + # In a pinch, consider Scandinavian languages (minus + # Icelandic?) to be mutually intelligible. + ([qw(no nn nb se da fo)]) x 2, + + # I think this is about the extent of tolerable intelligibility + # among large modern Romance languages. 
+ 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French + 'ca' => [qw(es pt it fr)], + 'es' => [qw(ca it fr pt)], + 'it' => [qw(es fr ca pt)], + 'fr' => [qw(es it ca pt)], + + # Also assume that speakers of the main Indian languages prefer + # to read/hear Hindi over English + [qw( + as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur + )] => 'hi', + # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, + # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, + # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. + 'hi' => [qw(bn pa as or)], + # I welcome finer data for the other Indian languages. + # E.g., what should Oriya's list be, besides just Hindi? + + # And the panic languages for English is, of course, nil! + + # My guesses at Slavic intelligibility: + ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian + 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat + 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak + + 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian + + 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish + + #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai + + ); + my($k,$v); + while(@panic) { + ($k,$v) = splice(@panic,0,2); + foreach my $k (ref($k) ? @$k : $k) { + foreach my $v (ref($v) ? @$v : $v) { + push @{$Panic{$k} ||= []}, $v unless $k eq $v; + } + } + } +} + +=item * the function @langs = panic_languages(@accept_languages) + +This function takes a list of 0 or more language +tags that constitute a given user's Accept-Language list, and +returns a list of tags for I (non-super) +languages that are probably acceptable to the user, to be +used I. + +For example, if a user accepts only 'ca' (Catalan) and +'es' (Spanish), and the documents/interfaces you have +available are just in German, Italian, and Chinese, then +the user will most likely want the Italian one (and not +the Chinese or German one!), instead of getting +nothing. So C returns +a list containing 'it' (Italian). 
+ +English ('en') is I in the return list, but +whether it's at the very end or not depends +on the input languages. This function works by consulting +an internal table that stipulates what common +languages are "close" to each other. + +A useful construct you might consider using is: + + @fallbacks = super_languages(@accept_languages); + push @fallbacks, panic_languages( + @accept_languages, @fallbacks, + ); + +=cut - if($tag =~ /^([XIxi])(-.+)/) { - # This handles all the alternation that exists CURRENTLY - return($alt{$1} . $2); +sub panic_languages { + # When in panic or in doubt, run in circles, scream, and shout! + my(@out, %seen); + foreach my $t (@_) { + next unless $t; + next if $seen{$t}++; # so we don't return it or hit it again + # push @out, super_languages($t); # nah, keep that separate + push @out, @{ $Panic{lc $t} || next }; } - return(); + return grep !$seen{$_}++, @out, 'en'; } ########################################################################### +1; +__END__ =back @@ -615,6 +795,3 @@ Sean M. Burke C =cut -1; - -__END__ diff --git a/lib/I18N/LangTags/ChangeLog b/lib/I18N/LangTags/ChangeLog new file mode 100644 index 0000000..55b84ac --- /dev/null +++ b/lib/I18N/LangTags/ChangeLog @@ -0,0 +1,90 @@ +Revision history for Perl module I18N::LangTags. + Time-stamp: "2001-06-20 01:52:29 MDT" + +2001-06-20 Sean M. Burke sburke@cpan.org + + * Release 0.24 + + * I18N::LangTags -- some elaborate hacks to make us + recognize legacy aliases like no-nyn == nn. + Added panic_languages(). + Added :ALL export tag. + Minor docs fixes, and spiffing up test.pl. + + * I18N::LangTags::List -- minor corrections; added + a few aliases. + +2001-05-29 Sean M. Burke sburke@cpan.org + + * Release 0.23 + + * I18N::LangTags::List -- minor corrections. And is now + a module, not just documentation. + +2001-05-27 Sean M. 
Burke sburke@cpan.org + + * Release 0.22 + + * Now bundling I18N::LangTags::List, a reference for lang tags, + replacing generate_language_table.plx and language_codes.txt + +2001-05-25 Sean M. Burke sburke@cpan.org + + * Release 0.21 + + * extract_language_tags and locale2language_tag now + return untainted output. Useful if you feed tainted + things, like $ENV{'LANG'}. + +2001-03-13 Sean M. Burke sburke@cpan.org + + * Release 0.20 + + * Added support for RFC 3066 tags: allowing three-letter primary + tags ("nav"), and allowing digits in subtags ("x-borg-prot3252"). + + * Changed all references from RFC 1766 to RFC 3066. + + * Now bundling fulltext of RFC 3066 in the dist. + + * Now bundling generate_language_table.plx and language_codes.txt + + * Added some nice tests to test.pl + + * Inverting order of listings in this ChangeLog file. + +2000-05-13 Sean M. Burke sburke@cpan.org + + * Release 0.13 + + * Just noting my new email address. + +1999-03-06 Sean M. Burke sburke@netadventure.net + + * Release 0.11 + + * Added functions + similarity_language_tag, is_dialect_of, + locale2language_tag, alternate_language_tags, and + encode_language_tag + +1998-12-14 Sean M. Burke sburke@netadventure.net + + * Release 0.09 + + * Added function super_languages() + +1998-10-31 Sean M. Burke sburke@netadventure.net + + * Release 0.08 + + * Just changes in the docs and bundle -- no change + in functionality. + +1998-04-02 Sean M. Burke sburke@netadventure.net + + * Release 0.07 + + * First public release. + +[END OF CHANGELOG] diff --git a/lib/I18N/LangTags/List.pm b/lib/I18N/LangTags/List.pm new file mode 100644 index 0000000..dec2a3f --- /dev/null +++ b/lib/I18N/LangTags/List.pm @@ -0,0 +1,1620 @@ + +require 5; +package I18N::LangTags::List; +# Time-stamp: "2001-05-27 19:55:19 MDT" +use strict; +use vars qw(%Name $Debug $VERSION); +$VERSION = '0.24'; +# POD at the end. 
+ +#---------------------------------------------------------------------- +{ +# read the table out of our own POD! + my $seeking = 1; + my $count = 0; + my($tag,$name); + while() { + if($seeking) { + $seeking = 0 if m/=for woohah/; + } else { + next unless ($tag, $name) = + m/\{([-0-9a-zA-Z]+)\}(?:\s*:)?\s*([^\[\]]+)/; + $name =~ s/\s*[;\.]*\s*$//g; + next unless $name; + ++$count; + print "<$tag> <$name>\n" if $Debug; + $Name{$tag} = $name; + } + } + die "No tags read??" unless $count; +} +#---------------------------------------------------------------------- + +sub name { + my $tag = lc($_[0] || return); + $tag =~ s/^\s+//s; + $tag =~ s/\s+$//s; + + my $alt; + if($tag =~ m/^x-(.+)/) { + $alt = "i-$1"; + } elsif($tag =~ m/^i-(.+)/) { + $alt = "x-$1"; + } else { + $alt = ''; + } + + my $subform = ''; + my $name = ''; + print "Input: {$tag}\n" if $Debug; + while(length $tag) { + last if $name = $Name{$tag}; + last if $name = $Name{$alt}; + if($tag =~ s/(-[a-z0-9]+)$//s) { + print "Shaving off: $1 leaving $tag\n" if $Debug; + $subform = "$1$subform"; + # and loop around again + + $alt =~ s/(-[a-z0-9]+)$//s && $Debug && print " alt -> $alt\n"; + } else { + # we're trying to pull a subform off a primary tag. TILT! + print "Aborting on: {$name}{$subform}\n" if $Debug; + last; + } + } + print "Output: {$name}{$subform}\n" if $Debug; + + return unless $name; # Failure + return $name unless $subform; # Exact match + $subform =~ s/^-//s; + $subform =~ s/-$//s; + return "$name (Subform \"$subform\")"; +} + +1; + +__DATA__ + +=head1 NAME + +I18N::LangTags::List -- tags and names for human languages + +=head1 SYNOPSIS + + use I18N::LangTags::List; + print "Parlez-vous... ", join(', ', + I18N::LangTags::List::name('elx') || 'unknown_language', + I18N::LangTags::List::name('ar-Kw') || 'unknown_language', + I18N::LangTags::List::name('en') || 'unknown_language', + I18N::LangTags::List::name('en-CA') || 'unknown_language', + ), "?\n"; + +prints: + + Parlez-vous... 
Elamite, Kuwait Arabic, English, Canadian English? + +=head1 DESCRIPTION + +This module provides a function +C<I18N::LangTags::List::name( I<langtag> ) > that takes +a language tag (see L<I18N::LangTags|I18N::LangTags>) +and returns the best attempt at an English name for it, or +undef if it can't make sense of the tag. + +The function I18N::LangTags::List::name(...) is not exported. + +The map of tags-to-names that it uses is accessible as +%I18N::LangTags::List::Name, and it's the same as the list +that follows in this documentation, which should be useful +to you even if you don't use this module. + +=head1 ABOUT LANGUAGE TAGS + +Internet language tags, as defined in RFC 3066, are a formalism +for denoting human languages. The two-letter ISO 639-1 language +codes are well known (as "en" for English), as are their forms +when qualified by a country code ("en-US"). Less well-known are the +arbitrary-length non-ISO codes (like "i-mingo"), and the +recently (in 2001) introduced three-letter ISO-639-2 codes. + +Remember these important facts: + +=over + +=item * + +Language tags are not locale IDs. A locale ID is written with a "_" +instead of a "-", (almost?) always matches C<m/^\w\w_\w\w\b/>, and +I<means> something different than a language tag. A language tag +denotes a language. A locale ID denotes a language I<as used in> +a particular place, in combination with non-linguistic +location-specific information such as what currency is used +there. Locales I<also> often denote character set information, +as in "en_US.ISO8859-1". + +=item * + +Language tags are not for computer languages. + +=item * + +"Dialect" is not a useful term, since there is no objective +criterion for establishing when two languages are +dialects of each other, or are separate languages. + +=item * + +Language tags are not case-sensitive. en-US, en-us, En-Us, etc., +are all the same tag, and denote the same language. + +=item * + +Not every language tag really refers to a single language. 
Some +language tags refer to conditions: i-default (system-message text +in English plus maybe other languages), und (undetermined +language). Others (notably lots of the three-letter codes) are +bibliographic tags that classify whole groups of languages, as +with cus "Cushitic (Other)" (i.e., a +language that has been classed as Cushitic, but which has no more +specific code) or the even less linguistically coherent +sai for "South American Indian (Other)". While useful in +bibliography, B<SUCH TAGS ARE NOT FOR GENERAL USE>. For further guidance, email me. + +=item * + +Language tags are not country codes. In fact, they are often +distinct codes, as with language tag ja for Japanese, and +ISO 3166 country code C<.jp> for Japan. + +=back + +=head1 LIST OF LANGUAGES + +The first part of each item is the language tag, between +{...}. It +is followed by an English name for the language or language-group. +Language tags that I judge to be not for general use, are bracketed. + +This list is in alphabetical order by English name of the language. + +=for reminder + The name in the =item line MUST NOT have E<...>'s in it!! + +=for woohah START + +=over + +=item {ab} : Abkhazian + +eq Abkhaz + +=item {ace} : Achinese + +=item {ach} : Acoli + +=item {ada} : Adangme + +=item {aa} : Afar + +=item {afh} : Afrihili + +(Artificial) + +=item {af} : Afrikaans + +=item [{afa} : Afro-Asiatic (Other)] + +=item {aka} : Akan + +=item {akk} : Akkadian + +(Historical) + +=item {sq} : Albanian + +=item {ale} : Aleut + +=item [{alg} : Algonquian languages] + +NOT Algonquin! + +=item [{tut} : Altaic (Other)] + +=item {am} : Amharic + +NOT Aramaic! + +=item {i-ami} : Ami + +eq Amis. eq 'Amis. eq Pangca. + +=item [{apa} : Apache languages] + +=item {ar} : Arabic + +Many forms are mutually un-intelligible in spoken media. 
+Notable forms: +{ar-ae} UAE Arabic; +{ar-bh} Bahrain Arabic; +{ar-dz} Algerian Arabic; +{ar-eg} Egyptian Arabic; +{ar-iq} Iraqi Arabic; +{ar-jo} Jordanian Arabic; +{ar-kw} Kuwait Arabic; +{ar-lb} Lebanese Arabic; +{ar-ly} Libyan Arabic; +{ar-ma} Moroccan Arabic; +{ar-om} Omani Arabic; +{ar-qa} Qatari Arabic; +{ar-sa} Sauda Arabic; +{ar-sy} Syrian Arabic; +{ar-tn} Tunisian Arabic; +{ar-ye} Yemen Arabic. + +=item {arc} : Aramaic + +NOT Amharic! NOT Samaritan Aramaic! + +=item {arp} : Arapaho + +=item {arn} : Araucanian + +=item {arw} : Arawak + +=item {hy} : Armenian + +=item [{art} : Artificial (Other)] + +=item {as} : Assamese + +=item [{ath} : Athapascan languages] + +eq Athabaskan. eq Athapaskan. eq Athabascan. + +=item [{aus} : Australian languages] + +=item [{map} : Austronesian (Other)] + +=item {ava} : Avaric + +=item {ae} : Avestan + +eq Zend + +=item {awa} : Awadhi + +=item {ay} : Aymara + +=item {az} : Azerbaijani + +eq Azeri + +=item {ban} : Balinese + +=item [{bat} : Baltic (Other)] + +=item {bal} : Baluchi + +=item {bam} : Bambara + +=item [{bai} : Bamileke languages] + +=item {bad} : Banda + +=item [{bnt} : Bantu (Other)] + +=item {bas} : Basa + +=item {ba} : Bashkir + +=item {eu} : Basque + +=item {btk} : Batak (Indonesia) + +=item {bej} : Beja + +=item {be} : Belarusian + +eq Belarussian. eq Byelarussian. +eq Belorussian. eq Byelorussian. +eq White Russian. eq White Ruthenian. +NOT Ruthenian! + +=item {bem} : Bemba + +=item {bn} : Bengali + +eq Bangla. + +=item [{ber} : Berber (Other)] + +=item {bho} : Bhojpuri + +=item {bh} : Bihari + +=item {bik} : Bikol + +=item {bin} : Bini + +=item {bi} : Bislama + +eq Bichelamar. + +=item {bs} : Bosnian + +=item {bra} : Braj + +=item {br} : Breton + +=item {bug} : Buginese + +=item {bg} : Bulgarian + +=item {i-bnn} : Bunun + +=item {bua} : Buriat + +=item {my} : Burmese + +=item {cad} : Caddo + +=item {car} : Carib + +=item {ca} : Catalan + +eq CatalEn. eq Catalonian. 
+ +=item [{cau} : Caucasian (Other)] + +=item {ceb} : Cebuano + +=item [{cel} : Celtic (Other)] + +Notable forms: +{cel-gaulish} Gaulish (Historical) + +=item [{cai} : Central American Indian (Other)] + +=item {chg} : Chagatai + +(Historical?) + +=item [{cmc} : Chamic languages] + +=item {ch} : Chamorro + +=item {ce} : Chechen + +=item {chr} : Cherokee + +eq Tsalagi + +=item {chy} : Cheyenne + +=item {chb} : Chibcha + +(Historical) NOT Chibchan (which is a language family). + +=item {ny} : Chichewa + +eq Nyanja. eq Chinyanja. + +=item {zh} : Chinese + +Many forms are mutually un-intelligible in spoken media. +Notable subforms: +{zh-cn} PRC Chinese; +{zh-hk} Hong Kong Chinese; +{zh-mo} Macau Chinese; +{zh-sg} Singapore Chinese; +{zh-tw} Taiwan Chinese; +{zh-guoyu} Mandarin [Putonghua/Guoyu]; +{zh-hakka} Hakka [formerly i-hakka]; +{zh-min} Hokkien; +{zh-min-nan} Southern Hokkien; +{zh-wuu} Shanghaiese; +{zh-xiang} Hunanese; +{zh-gan} Gan; +{zh-yue} Cantonese. + +=for etc +{i-hakka} Hakka (old tag) + +=item {chn} : Chinook Jargon + +eq Chinook Wawa. + +=item {chp} : Chipewyan + +=item {cho} : Choctaw + +=item {cu} : Church Slavic + +eq Old Church Slavonic. + +=item {chk} : Chuukese + +eq Trukese. eq Chuuk. eq Truk. eq Ruk. + +=item {cv} : Chuvash + +=item {cop} : Coptic + +=item {kw} : Cornish + +=item {co} : Corsican + +eq Corse. + +=item {cre} : Cree + +NOT Creek! + +=item {mus} : Creek + +NOT Cree! + +=item [{cpe} : English-based Creoles and pidgins (Other)] + +=item [{cpf} : French-based Creoles and pidgins (Other)] + +=item [{cpp} : Portuguese-based Creoles and pidgins (Other)] + +=item [{crp} : Creoles and pidgins (Other)] + +=item {hr} : Croatian + +eq Croat. + +=item [{cus} : Cushitic (Other)] + +=item {cs} : Czech + +=item {dak} : Dakota + +eq Nakota. eq Latoka. 
+ +=item {da} : Danish + +=item {day} : Dayak + +=item {i-default} : Default (Fallthru) Language + +Defined in RFC 2277, this is for tagging text +(which must include English text, and might/should include text +in other appropriate languages) that is emitted in a context +where language-negotiation wasn't possible -- in SMTP mail failure +messages, for example. + +=item {del} : Delaware + +=item {din} : Dinka + +=item {div} : Divehi + +=item {doi} : Dogri + +NOT Dogrib! + +=item {dgr} : Dogrib + +NOT Dogri! + +=item [{dra} : Dravidian (Other)] + +=item {dua} : Duala + +=item {nl} : Dutch + +eq Netherlander. Notable forms: +{nl-nl} Netherlands Dutch; +{nl-be} Belgian Dutch. + +=item {dum} : Middle Dutch (ca.1050-1350) + +(Historical) + +=item {dyu} : Dyula + +=item {dz} : Dzongkha + +=item {efi} : Efik + +=item {egy} : Ancient Egyptian + +(Historical) + +=item {eka} : Ekajuk + +=item {elx} : Elamite + +(Historical) + +=item {en} : English + +Notable forms: +{en-au} Australian English; +{en-bz} Belize English; +{en-ca} Canadian English; +{en-gb} UK English; +{en-ie} Irish English; +{en-jm} Jamaican English; +{en-nz} New Zealand English; +{en-ph} Philippine English; +{en-tt} Trinidad English; +{en-us} US English; +{en-za} South African English; +{en-zw} Zimbabwe English. + +=item {enm} : Old English (1100-1500) + +(Historical) + +=item {ang} : Old English (ca.450-1100) + +eq Anglo-Saxon. (Historical) + +=item {eo} : Esperanto + +(Artificial) + +=item {et} : Estonian + +=item {ewe} : Ewe + +=item {ewo} : Ewondo + +=item {fan} : Fang + +=item {fat} : Fanti + +=item {fo} : Faroese + +=item {fj} : Fijian + +=item {fi} : Finnish + +=item [{fiu} : Finno-Ugrian (Other)] + +eq Finno-Ugric. NOT Ugaritic! + +=item {fon} : Fon + +=item {fr} : French + +Notable forms: +{fr-fr} France French; +{fr-be} Belgian French; +{fr-ca} Canadian French; +{fr-ch} Swiss French; +{fr-lu} Luxembourg French; +{fr-mc} Monaco French. 
+ +=item {frm} : Middle French (ca.1400-1600) + +(Historical) + +=item {fro} : Old French (842-ca.1400) + +(Historical) + +=item {fy} : Frisian + +=item {fur} : Friulian + +=item {ful} : Fulah + +=item {gaa} : Ga + +=item {gd} : Scots Gaelic + +NOT Scots! + +=item {gl} : Gallegan + +eq Galician + +=item {lug} : Ganda + +=item {gay} : Gayo + +=item {gba} : Gbaya + +=item {gez} : Geez + +eq Ge'ez + +=item {ka} : Georgian + +=item {de} : German + +Notable forms: +{de-at} Austrian German; +{de-be} Belgian German; +{de-ch} Swiss German; +{de-de} Germany German; +{de-li} Liechtenstein German; +{de-lu} Luxembourg German. + +=item {gmh} : Middle High German (ca.1050-1500) + +(Historical) + +=item {goh} : Old High German (ca.750-1050) + +(Historical) + +=item [{gem} : Germanic (Other)] + +=item {gil} : Gilbertese + +=item {gon} : Gondi + +=item {gor} : Gorontalo + +=item {got} : Gothic + +(Historical) + +=item {grb} : Grebo + +=item {grc} : Ancient Greek (to 1453) + +(Historical) + +=item {el} : Modern Greek (1453-) + +=item {gn} : Guarani + +GuaranE + +=item {gu} : Gujarati + +=item {gwi} : Gwich'in + +eq Gwichin + +=item {hai} : Haida + +=item {ha} : Hausa + +=item {haw} : Hawaiian + +Hawai'ian + +=item {he} : Hebrew + +(Formerly "iw".) + +=for etc +{iw} Hebrew (old tag) + +=item {hz} : Herero + +=item {hil} : Hiligaynon + +=item {him} : Himachali + +=item {hi} : Hindi + +=item {ho} : Hiri Motu + +=item {hit} : Hittite + +(Historical) + +=item {hmn} : Hmong + +=item {hu} : Hungarian + +=item {hup} : Hupa + +=item {iba} : Iban + +=item {is} : Icelandic + +=item {ibo} : Igbo + +=item {ijo} : Ijo + +=item {ilo} : Iloko + +=item [{inc} : Indic (Other)] + +=item [{ine} : Indo-European (Other)] + +=item {id} : Indonesian + +(Formerly "in".) + +=for etc +{in} Indonesian (old tag) + +=item {ia} : Interlingua (International Auxiliary Language Association) + +(Artificial) NOT Interlingue! + +=item {ie} : Interlingue + +(Artificial) NOT Interlingua! 
+ +=item {iu} : Inuktitut + +A subform of "Eskimo". + +=item {ik} : Inupiaq + +A subform of "Eskimo". + +=item [{ira} : Iranian (Other)] + +=item {ga} : Irish + +=item {mga} : Middle Irish (900-1200) + +(Historical) + +=item {sga} : Old Irish (to 900) + +(Historical) + +=item [{iro} : Iroquoian languages] + +=item {it} : Italian + +Notable forms: +{it-it} Italy Italian; +{it-ch} Swiss Italian. + +=item {ja} : Japanese + +(NOT "jp"!) + +=item {jw} : Javanese + +=item {jrb} : Judeo-Arabic + +=item {jpr} : Judeo-Persian + +=item {kab} : Kabyle + +=item {kac} : Kachin + +=item {kl} : Kalaallisut + +eq Greenlandic "Eskimo" + +=item {kam} : Kamba + +=item {kn} : Kannada + +eq Kanarese. NOT Canadian! + +=item {kau} : Kanuri + +=item {kaa} : Kara-Kalpak + +=item {kar} : Karen + +=item {ks} : Kashmiri + +=item {kaw} : Kawi + +=item {kk} : Kazakh + +=item {kha} : Khasi + +=item {km} : Khmer + +eq Cambodian. eq Kampuchean. + +=item [{khi} : Khoisan (Other)] + +=item {kho} : Khotanese + +=item {ki} : Kikuyu + +eq Gikuyu. + +=item {kmb} : Kimbundu + +=item {rw} : Kinyarwanda + +=item {ky} : Kirghiz + +=item {i-klingon} : Klingon + +=item {kv} : Komi + +=item {kon} : Kongo + +=item {kok} : Konkani + +=item {ko} : Korean + +=item {kos} : Kosraean + +=item {kpe} : Kpelle + +=item {kro} : Kru + +=item {kj} : Kuanyama + +=item {kum} : Kumyk + +=item {ku} : Kurdish + +=item {kru} : Kurukh + +=item {kut} : Kutenai + +=item {lad} : Ladino + +eq Judeo-Spanish. NOT Ladin (a minority language in Italy). + +=item {lah} : Lahnda + +NOT Lamba! + +=item {lam} : Lamba + +NOT Lahnda! + +=item {lo} : Lao + +eq Laotian. + +=item {la} : Latin + +(Historical) NOT Ladin! NOT Ladino! + +=item {lv} : Latvian + +eq Lettish. + +=item {lb} : Letzeburgesch + +eq Luxemburgian, eq Luxemburger. (Formerly i-lux.) + +=for etc +{i-lux} Letzeburgesch (old tag) + +=item {lez} : Lezghian + +=item {ln} : Lingala + +=item {lt} : Lithuanian + +=item {nds} : Low German + +eq Low Saxon. eq Low German. eq Low Saxon. 
+ +=item {loz} : Lozi + +=item {lub} : Luba-Katanga + +=item {lua} : Luba-Lulua + +=item {lui} : Luiseno + +eq LuiseEo. + +=item {lun} : Lunda + +=item {luo} : Luo (Kenya and Tanzania) + +=item {lus} : Lushai + +=item {mk} : Macedonian + +eq the modern Slavic language spoken in what was Yugoslavia. +NOT the form of Greek spoken in Greek Macedonia! + +=item {mad} : Madurese + +=item {mag} : Magahi + +=item {mai} : Maithili + +=item {mak} : Makasar + +=item {mg} : Malagasy + +=item {ms} : Malay + +NOT Malayalam! + +=item {ml} : Malayalam + +NOT Malay! + +=item {mt} : Maltese + +=item {mnc} : Manchu + +=item {mdr} : Mandar + +NOT Mandarin! + +=item {man} : Mandingo + +=item {mni} : Manipuri + +eq Meithei. + +=item [{mno} : Manobo languages] + +=item {gv} : Manx + +=item {mi} : Maori + +NOT Mari! + +=item {mr} : Marathi + +=item {chm} : Mari + +NOT Maori! + +=item {mh} : Marshall + +eq Marshallese. + +=item {mwr} : Marwari + +=item {mas} : Masai + +=item [{myn} : Mayan languages] + +=item {men} : Mende + +=item {mic} : Micmac + +=item {min} : Minangkabau + +=item {i-mingo} : Mingo + +eq the Irquoian language West Virginia Seneca. NOT New York Seneca! + +=item [{mis} : Miscellaneous languages] + +Don't use this. + +=item {moh} : Mohawk + +=item {mo} : Moldavian + +eq Moldovan. + +=item [{mkh} : Mon-Khmer (Other)] + +=item {lol} : Mongo + +=item {mn} : Mongolian + +eq Mongol. + +=item {mos} : Mossi + +=item [{mul} : Multiple languages] + +Not for normal use. + +=item [{mun} : Munda languages] + +=item {nah} : Nahuatl + +=item {na} : Nauru + +=item {nv} : Navajo + +eq Navaho. (Formerly i-navajo.) + +=for etc +{i-navajo} Navajo (old tag) + +=item {nd} : North Ndebele + +=item {nr} : South Ndebele + +=item {ng} : Ndonga + +=item {ne} : Nepali + +eq Nepalese. Notable forms: +{ne-np} Nepal Nepali; +{ne-in} India Nepali. 
+ +=item {new} : Newari + +=item {nia} : Nias + +=item [{nic} : Niger-Kordofanian (Other)] + +=item [{ssa} : Nilo-Saharan (Other)] + +=item {niu} : Niuean + +=item {non} : Old Norse + +(Historical) + +=item [{nai} : North American Indian] + +Do not use this. + +=item {se} : Northern Sami + +eq Lappish. eq Lapp. eq (Northern) Saami. + +=item {no} : Norwegian + +Note the two following forms: + +=item {nb} : Norwegian Bokmal + +eq BokmE<aring>l. (A form of Norwegian.) (Formerly no-bok.) + +=for etc +{no-bok} Norwegian Bokmal (old tag) + +=item {nn} : Norwegian Nynorsk + +(A form of Norwegian.) (Formerly no-nyn.) + +=for etc +{no-nyn} Norwegian Nynorsk (old tag) + +=item [{nub} : Nubian languages] + +=item {nym} : Nyamwezi + +=item {nyn} : Nyankole + +=item {nyo} : Nyoro + +=item {nzi} : Nzima + +=item {oc} : Occitan (post 1500) + +eq ProvenE<ccedil>al, eq Provencal + +=item {oji} : Ojibwa + +eq Ojibwe. + +=item {or} : Oriya + +=item {om} : Oromo + +=item {osa} : Osage + +=item {os} : Ossetian; Ossetic + +=item [{oto} : Otomian languages] + +Group of languages collectively called "OtomE<iacute>". + +=item {pal} : Pahlavi + +eq Pahlevi + +=item {i-pwn} : Paiwan + +eq Pariwan + +=item {pau} : Palauan + +=item {pi} : Pali + +(Historical?) + +=item {pam} : Pampanga + +=item {pag} : Pangasinan + +=item {pa} : Panjabi + +eq Punjabi + +=item {pap} : Papiamento + +eq Papiamentu. + +=item [{paa} : Papuan (Other)] + +=item {fa} : Persian + +eq Farsi. eq Iranian. + +=item {peo} : Old Persian (ca.600-400 B.C.) + +=item [{phi} : Philippine (Other)] + +=item {phn} : Phoenician + +(Historical) + +=item {pon} : Pohnpeian + +NOT Pompeiian! + +=item {pl} : Polish + +=item {pt} : Portuguese + +eq Portugese. Notable forms: +{pt-pt} Portugal Portuguese; +{pt-br} Brazilian Portuguese. + +=item [{pra} : Prakrit languages] + +=item {pro} : Old Provencal (to 1500) + +eq Old ProvenE<ccedil>al. (Historical.) + +=item {ps} : Pushto + +eq Pashto. eq Pushtu. + +=item {qu} : Quechua + +eq Quecha. 
+ +=item {rm} : Raeto-Romance + +eq Romansh. + +=item {raj} : Rajasthani + +=item {rap} : Rapanui + +=item {rar} : Rarotongan + +=item [{qaa - qtz} : Reserved for local use.] + +=item [{roa} : Romance (Other)] + +NOT Romanian! NOT Romany! NOT Romansh! + +=item {ro} : Romanian + +eq Rumanian. NOT Romany! + +=item {rom} : Romany + +eq Rom. NOT Romanian! + +=item {rn} : Rundi + +=item {ru} : Russian + +NOT White Russian! NOT Rusyn! + +=item [{sal} : Salishan languages] + +Large language group. + +=item {sam} : Samaritan Aramaic + +NOT Aramaic! + +=item [{smi} : Sami languages (Other)] + +=item {sm} : Samoan + +=item {sad} : Sandawe + +=item {sg} : Sango + +=item {sa} : Sanskrit + +(Historical) + +=item {sat} : Santali + +=item {sc} : Sardinian + +eq Sard. + +=item {sas} : Sasak + +=item {sco} : Scots + +NOT Scots Gaelic! + +=item {sel} : Selkup + +=item [{sem} : Semitic (Other)] + +=item {sr} : Serbian + +eq Serb. NOT Sorbian. + +=item {srr} : Serer + +=item {shn} : Shan + +=item {sn} : Shona + +=item {sid} : Sidamo + +=item {sgn-...} : Sign Languages + +Always use with a subtag. Notable forms: +{sgn-gb} British Sign Language (BSL); +{sgn-ie} Irish Sign Language (ESL); +{sgn-ni} Nicaraguan Sign Language (ISN); +{sgn-us} American Sign Language (ASL). + +=item {bla} : Siksika + +eq Blackfoot. eq Pikanii. + +=item {sd} : Sindhi + +=item {si} : Sinhalese + +eq Sinhala. + +=item [{sit} : Sino-Tibetan (Other)] + +=item [{sio} : Siouan languages] + +=item {den} : Slave (Athapascan) + +("Slavey" is a subform.) + +=item [{sla} : Slavic (Other)] + +=item {sk} : Slovak + +eq Slovakian. + +=item {sl} : Slovenian + +eq Slovene. + +=item {sog} : Sogdian + +=item {so} : Somali + +=item {son} : Songhai + +=item {snk} : Soninke + +=item {wen} : Sorbian languages + +eq Wendish. eq Sorb. eq Lusatian. eq Wend. NOT Venda! NOT Serbian! + +=item {nso} : Northern Sotho + +=item {st} : Southern Sotho + +eq Sutu. eq Sesotho. 
+ +=item [{sai} : South American Indian (Other)] + +=item {es} : Spanish + +Notable forms: +{es-ar} Argentine Spanish; +{es-bo} Bolivian Spanish; +{es-cl} Chilean Spanish; +{es-co} Colombian Spanish; +{es-do} Dominican Spanish; +{es-ec} Ecuadorian Spanish; +{es-es} Spain Spanish; +{es-gt} Guatemalan Spanish; +{es-hn} Honduran Spanish; +{es-mx} Mexican Spanish; +{es-pa} Panamanian Spanish; +{es-pe} Peruvian Spanish; +{es-pr} Puerto Rican Spanish; +{es-py} Paraguay Spanish; +{es-sv} Salvadoran Spanish; +{es-us} US Spanish; +{es-uy} Uruguayan Spanish; +{es-ve} Venezuelan Spanish. + +=item {suk} : Sukuma + +=item {sux} : Sumerian + +(Historical) + +=item {su} : Sundanese + +=item {sus} : Susu + +=item {sw} : Swahili + +eq Kiswahili + +=item {ss} : Swati + +=item {sv} : Swedish + +Notable forms: +{sv-se} Sweden Swedish; +{sv-fi} Finland Swedish. + +=item {syr} : Syriac + +=item {tl} : Tagalog + +=item {ty} : Tahitian + +=item [{tai} : Tai (Other)] + +NOT Thai! + +=item {tg} : Tajik + +=item {tmh} : Tamashek + +=item {ta} : Tamil + +=item {i-tao} : Tao + +eq Yami. + +=item {tt} : Tatar + +=item {i-tay} : Tayal + +eq Atayal. eq Atayan. + +=item {te} : Telugu + +=item {ter} : Tereno + +=item {tet} : Tetum + +=item {th} : Thai + +NOT Tai! + +=item {bo} : Tibetan + +=item {tig} : Tigre + +=item {ti} : Tigrinya + +=item {tem} : Timne + +eq Themne. eq Timene. + +=item {tiv} : Tiv + +=item {tli} : Tlingit + +=item {tpi} : Tok Pisin + +=item {tkl} : Tokelau + +=item {tog} : Tonga (Nyasa) + +NOT Tsonga! + +=item {to} : Tonga (Tonga Islands) + +(Pronounced "Tong-a", not "Tong-ga") + +NOT Tsonga! + +=item {tsi} : Tsimshian + +eq Sm'algyax + +=item {ts} : Tsonga + +NOT Tonga! + +=item {i-tsu} : Tsou + +=item {tn} : Tswana + +Same as Setswana. + +=item {tum} : Tumbuka + +=item {tr} : Turkish + +(Typically in Roman script) + +=item {ota} : Ottoman Turkish (1500-1928) + +(Typically in Arabic script) (Historical) + +=item {tk} : Turkmen + +eq Turkmeni. 
+ +=item {tvl} : Tuvalu + +=item {tyv} : Tuvinian + +eq Tuvan. eq Tuvin. + +=item {tw} : Twi + +=item {uga} : Ugaritic + +NOT Ugric! + +=item {ug} : Uighur + +=item {uk} : Ukrainian + +=item {umb} : Umbundu + +=item {und} : Undetermined + +Not a tag for normal use. + +=item {ur} : Urdu + +=item {uz} : Uzbek + +eq E<Ouml>zbek + +=item {vai} : Vai + +=item {ven} : Venda + +NOT Wendish! NOT Wend! NOT Avestan! + +=item {vi} : Vietnamese + +eq Viet. + +=item {vo} : Volapuk + +eq VolapE<uuml>k. (Artificial) + +=item {vot} : Votic + +eq Votian. eq Vod. + +=item [{wak} : Wakashan languages] + +=item {wal} : Walamo + +eq Wolaytta. + +=item {war} : Waray + +Presumably the Philippine language Waray-Waray (SamareE<ntilde>o), +not the smaller Philippine language Waray Sorsogon, nor the extinct +Australian language Waray. + +=item {was} : Washo + +eq Washoe + +=item {cy} : Welsh + +=item {wo} : Wolof + +=item {x-...} : Unregistered (Semi-Private Use) + +"x-" is a prefix for language tags that are not registered with ISO +or IANA. Example, x-double-dutch + +=item {xh} : Xhosa + +=item {sah} : Yakut + +=item {yao} : Yao + +(The Yao in Malawi?) + +=item {yap} : Yapese + +eq Yap + +=item {yi} : Yiddish + +Formerly "ji". Sometimes in Roman script, sometimes in Hebrew script. + +=for etc +{ji} Yiddish (old tag) + +=item {yo} : Yoruba + +=item [{ypk} : Yupik languages] + +Several "Eskimo" languages. + +=item {znd} : Zande + +=item [{zap} : Zapotec] + +(A group of languages.) + +=item {zen} : Zenaga + +NOT Zend. + +=item {za} : Zhuang + +=item {zu} : Zulu + +=item {zun} : Zuni + +eq ZuE<ntilde>i + +=back + +=for woohah END + +=head1 SEE ALSO + +L<I18N::LangTags> and its "See Also" section. + +=head1 COPYRIGHT AND DISCLAIMER + +Copyright (c) 2001 Sean M. Burke. All rights reserved. + +You can redistribute and/or +modify this document under the same terms as Perl itself. 
+ +This document is provided in the hope that it will be +useful, but without any warranty; +without even the implied warranty of accuracy, authoritativeness, +completeness, merchantability, or fitness for a particular purpose. + +Email any corrections or questions to me. + +=head1 AUTHOR + +Sean M. Burke, sburkeE<64>cpan.org + +=cut + + +# To generate a list of just the two and three-letter codes: + +#!/usr/local/bin/perl -w + +require 5; # Time-stamp: "2001-03-13 21:53:39 MST" + # Sean M. Burke, sburke@cpan.org + # This program is for generating the language_codes.txt file +use strict; +use LWP::Simple; +use HTML::TreeBuilder 3.10; +my $root = HTML::TreeBuilder->new(); +my $url = 'http://lcweb.loc.gov/standards/iso639-2/bibcodes.html'; +$root->parse(get($url) || die "Can't get $url"); +$root->eof(); + +my @codes; + +foreach my $tr ($root->find_by_tag_name('tr')) { + my @f = map $_->as_text(), $tr->content_list(); + #print map("<$_> ", @f), "\n"; + next unless @f == 5; + pop @f; # nix the French name + next if $f[-1] eq 'Language Name (English)'; # it's a header line + my $xx = splice(@f, 2,1); # pull out the two-letter code + $f[-1] =~ s/^\s+//; + $f[-1] =~ s/\s+$//; + if($xx =~ m/[a-zA-Z]/) { # there's a two-letter code for it + push @codes, [ lc($f[-1]), "$xx\t$f[-1]\n" ]; + } else { # print the three-letter codes. 
+ if($f[0] eq $f[1]) { + push @codes, [ lc($f[-1]), "$f[1]\t$f[2]\n" ]; + } else { # shouldn't happen + push @codes, [ lc($f[-1]), "@f !!!!!!!!!!\n" ]; + } + } +} + +print map $_->[1], sort {; $a->[0] cmp $b->[0] } @codes; +print "[ based on $url\n at ", scalar(localtime), "]\n", + "[Note: doesn't include IANA-registered codes.]\n"; +exit; +__END__ + diff --git a/lib/I18N/LangTags/List.pod b/lib/I18N/LangTags/List.pod deleted file mode 100644 index 9bb5e07..0000000 --- a/lib/I18N/LangTags/List.pod +++ /dev/null @@ -1,1446 +0,0 @@ -=head1 NAME - -I18n::LangTags::List -- list of tags for human languages - -=head1 SYNOPSIS - - Time-stamp: "2001-05-27 19:55:19 MDT" - [This is not a module; it is documentation] - -=head1 ABOUT LANGUAGE TAGS - -Internet language tags, as defined in RFC 3066, are a formalism -for denoting human languages. The two-letter ISO 639-1 language -codes are well known (as "en" for English), as are their forms -when qualified by a country code ("en-US"). Less well-known are the -arbitrary-length non-ISO codes (like "i-mingo"), and the -recently (in 2001) introduced three-letter ISO-639-2 codes. - -Remember this important facts: - -=over - -=item * - -Language tags are not locale IDs. A locale ID is written with a "_" -instead of a "-", (almost?) always matches C, and -I something different than a language tag. A language tag -denotes a language. A locale ID denotes a language I -a particular place, in combination with non-linguistic -location-specific information such as what currency in used -there. Locales I often denote character set information, -as in "en_US.ISO8859-1". - -=item * - -Language tags are not for computer languages. - -=item * - -"Dialect" is not a useful term, since there is no objective -criterion for establishing when two languages are -dialects of eachother, or are separate languages. - -=item * - -Language tags are not case-sensitive. en-US, en-us, En-Us, etc., -are all the same tag, and denote the same language. 
- -=item * - -Not every language tag really refers to a single language. Some -language tags refer to conditions: i-default (system-message text -in English plus maybe other languages), und (undetermined -language). Others (notably lots of the three-letter codes) are -bibliographic tags that classify whole groups of languages, as -with cus "Cushitic (Other)" (i.e., a -language that has been classed as Cushtic, but which has no more -specific code) or the even less linguistically coherent -sai for "South American Indian (Other)". While useful in -bibliography, B. For further guidance, email me. - -=item * - -Language tags are not country codes. In fact, they are often -distinct codes, as with language tag ja for Japanese, and -ISO 3166 country code C<.jp> for Japan. - -=back - -=head1 LIST OF LANGUAGES - -The first part of each item is the language tag, between -{...} and in italic characters. It -is followed by an English name for the language or language-group. -Language tags that I judge to be not for general use, are bracketed. - -This list is in alphabetical order by English name of the language. - -=over - -=item I<{ab}> : Abkhazian - -eq Abkhaz - -=item I<{ace}> : Achinese - -=item I<{ach}> : Acoli - -=item I<{ada}> : Adangme - -=item I<{aa}> : Afar - -=item I<{afh}> : Afrihili - -(Artificial) - -=item I<{af}> : Afrikaans - -=item [I<{afa}> : Afro-Asiatic (Other)] - -=item I<{aka}> : Akan - -=item I<{akk}> : Akkadian - -(Historical) - -=item I<{sq}> : Albanian - -=item I<{ale}> : Aleut - -=item [I<{alg}> : Algonquian languages] - -NOT Algonquin! - -=item [I<{tut}> : Altaic (Other)] - -=item I<{am}> : Amharic - -NOT Aramaic! - -=item I<{i-ami}> : Ami - -eq Amis. eq 'Amis. eq Pangca. - -=item [I<{apa}> : Apache languages] - -=item I<{ar}> : Arabic - -Many forms are mutually un-intelligible in spoken media. -Notable forms: -ar-ae -ar-bh -ar-dz -ar-eg -ar-iq -ar-jo -ar-kw -ar-lb -ar-ly -ar-ma -ar-om -ar-qa -ar-sa -ar-sy -ar-tn -ar-ye. 
- -=item I<{arc}> : Aramaic - -NOT Amharic! NOT Samaritan Aramaic! - -=item I<{arp}> : Arapaho - -=item I<{arn}> : Araucanian - -=item I<{arw}> : Arawak - -=item I<{hy}> : Armenian - -=item [I<{art}> : Artificial (Other)] - -=item I<{as}> : Assamese - -=item [I<{ath}> : Athapascan languages] - -eq Athabaskan. eq Athapaskan. eq Athabascan. - -=item [I<{aus}> : Australian languages] - -=item [I<{map}> : Austronesian (Other)] - -=item I<{ava}> : Avaric - -=item I<{ae}> : Avestan - -eq Zend - -=item I<{awa}> : Awadhi - -=item I<{ay}> : Aymara - -=item I<{az}> : Azerbaijani - -eq Azeri - -=item I<{ban}> : Balinese - -=item [I<{bat}> : Baltic (Other)] - -=item I<{bal}> : Baluchi - -=item I<{bam}> : Bambara - -=item [I<{bai}> : Bamileke languages] - -=item I<{bad}> : Banda - -=item [I<{bnt}> : Bantu (Other)] - -=item I<{bas}> : Basa - -=item I<{ba}> : Bashkir - -=item I<{eu}> : Basque - -=item I<{btk}> : Batak (Indonesia) - -=item I<{bej}> : Beja - -=item I<{be}> : Belarusian - -eq Belarussian. eq Byelarussian. -eq Belorussian. eq Byelorussian. -eq White Russian. eq White Ruthenian. -NOT Ruthenian! - -=item I<{bem}> : Bemba - -=item I<{bn}> : Bengali - -=item [I<{ber}> : Berber (Other)] - -=item I<{bho}> : Bhojpuri - -=item I<{bh}> : Bihari - -=item I<{bik}> : Bikol - -=item I<{bin}> : Bini - -=item I<{bi}> : Bislama - -=item I<{bs}> : Bosnian - -=item I<{bra}> : Braj - -=item I<{br}> : Breton - -=item I<{bug}> : Buginese - -=item I<{bg}> : Bulgarian - -=item I<{i-bnn}> : Bunun - -=item I<{bua}> : Buriat - -=item I<{my}> : Burmese - -=item I<{cad}> : Caddo - -=item I<{car}> : Carib - -=item I<{ca}> : Catalan - -eq CatalEn. eq Catalonian. - -=item [I<{cau}> : Caucasian (Other)] - -=item I<{ceb}> : Cebuano - -=item [I<{cel}> : Celtic (Other)] - -Notable forms: cel-gaulish. - -=item [I<{cai}> : Central American Indian (Other)] - -=item I<{chg}> : Chagatai - -(Historical?) 
- -=item [I<{cmc}> : Chamic languages] - -=item I<{ch}> : Chamorro - -=item I<{ce}> : Chechen - -=item I<{chr}> : Cherokee - -eq Tsalagi - -=item I<{chy}> : Cheyenne - -=item I<{chb}> : Chibcha - -(Historical) NOT Chibchan (which is a language family). - -=item I<{ny}> : Chichewa - -eq Nyanja. eq Chinyanja. - -=item I<{zh}> : Chinese - -Many forms are mutually un-intelligible in spoken media. -Notable subforms: -zh-cn (PRC Chinese), -zh-hk (Hong Kong Chinese), -zh-mo (Macau Chinese), -zh-sg (Singapore Chinese), -zh-tw (Taiwan Chinese), -zh-guoyu (Putonghua/Guoyu/Mandarin), -zh-hakka (Hakka; formerly i-hakka), -zh-min (Hokkien), -zh-min-nan (Southern Hokkien), -zh-wuu (Shanghaiese), -zh-xiang (Hunanese), -zh-yue (Cantonese). - -=item I<{chn}> : Chinook Jargon - -eq Chinook Wawa. - -=item I<{chp}> : Chipewyan - -=item I<{cho}> : Choctaw - -=item I<{cu}> : Church Slavic - -eq Old Church Slavonic. - -=item I<{chk}> : Chuukese - -eq Trukese. eq Chuuk. eq Truk. eq Ruk. - -=item I<{cv}> : Chuvash - -=item I<{cop}> : Coptic - -=item I<{kw}> : Cornish - -=item I<{co}> : Corsican - -eq Corse. - -=item I<{cre}> : Cree - -NOT Creek! - -=item I<{mus}> : Creek - -NOT Cree! - -=item [I<{cpe}> : English-based Creoles and pidgins (Other)] - -=item [I<{cpf}> : French-based Creoles and pidgins (Other)] - -=item [I<{cpp}> : Portuguese-based Creoles and pidgins (Other)] - -=item [I<{crp}> : Creoles and pidgins (Other)] - -=item I<{hr}> : Croatian - -eq Croat. - -=item [I<{cus}> : Cushitic (Other)] - -=item I<{cs}> : Czech - -=item I<{dak}> : Dakota - -eq Nakota. eq Latoka. - -=item I<{da}> : Danish - -=item I<{day}> : Dayak - -=item I<{i-default}> : Default (Fallthru) Language - -Defined in RFC 2277, this is for tagging text -(which must include English text, and might/should include text -in other appropriate languages) that is emitted in a context -where language-negotiation wasn't possible -- in SMTP mail failure -messages, for example. 
- -=item I<{del}> : Delaware - -=item I<{din}> : Dinka - -=item I<{div}> : Divehi - -=item I<{doi}> : Dogri - -NOT Dogrib! - -=item I<{dgr}> : Dogrib - -NOT Dogri! - -=item [I<{dra}> : Dravidian (Other)] - -=item I<{dua}> : Duala - -=item I<{nl}> : Dutch - -eq Netherlander. Notable forms: nl-nl, nl-be. - -=item I<{dum}> : Middle Dutch (ca.1050-1350) - -(Historical) - -=item I<{dyu}> : Dyula - -=item I<{dz}> : Dzongkha - -=item I<{efi}> : Efik - -=item I<{egy}> : Ancient Egyptian - -(Historical) - -=item I<{eka}> : Ekajuk - -=item I<{elx}> : Elamite - -(Historical) - -=item I<{en}> : English - -Notable forms: -en-au -en-bz -en-ca -en-gb -en-ie -en-jm -en-nz -en-ph -en-tt -en-us -en-za -en-zw. - -=item I<{enm}> : Old English (1100-1500) - -(Historical) - -=item I<{ang}> : Old English (ca.450-1100) - -eq Anglo-Saxon. (Historical) - -=item I<{eo}> : Esperanto - -(Artificial) - -=item I<{et}> : Estonian - -=item I<{ewe}> : Ewe - -=item I<{ewo}> : Ewondo - -=item I<{fan}> : Fang - -=item I<{fat}> : Fanti - -=item I<{fo}> : Faroese - -=item I<{fj}> : Fijian - -=item I<{fi}> : Finnish - -=item [I<{fiu}> : Finno-Ugrian (Other)] - -eq Finno-Ugric. NOT Ugaritic! - -=item I<{fon}> : Fon - -=item I<{fr}> : French - -Notable forms: -fr-fr -fr-be -fr-ca -fr-ch -fr-lu -fr-mc. - -=item I<{frm}> : Middle French (ca.1400-1600) - -(Historical) - -=item I<{fro}> : Old French (842-ca.1400) - -(Historical) - -=item I<{fy}> : Frisian - -=item I<{fur}> : Friulian - -=item I<{ful}> : Fulah - -=item I<{gaa}> : Ga - -=item I<{gd}> : Scots Gaelic - -NOT Scots! - -=item I<{gl}> : Gallegan - -eq Galician - -=item I<{lug}> : Ganda - -=item I<{gay}> : Gayo - -=item I<{gba}> : Gbaya - -=item I<{gez}> : Geez - -eq Ge'ez - -=item I<{ka}> : Georgian - -=item I<{de}> : German - -Notable forms: de-at -de-be -de-ch -de-de -de-li -de-lu. 
- -=item I<{gmh}> : Middle High German (ca.1050-1500) - -(Historical) - -=item I<{goh}> : Old High German (ca.750-1050) - -(Historical) - -=item [I<{gem}> : Germanic (Other)] - -=item I<{gil}> : Gilbertese - -=item I<{gon}> : Gondi - -=item I<{gor}> : Gorontalo - -=item I<{got}> : Gothic - -(Historical) - -=item I<{grb}> : Grebo - -=item I<{grc}> : Ancient Greek (to 1453) - -(Historical) - -=item I<{el}> : Modern Greek (1453-) - -=item I<{gn}> : Guarani - -GuaranE - -=item I<{gu}> : Gujarati - -=item I<{gwi}> : Gwich'in - -eq Gwichin - -=item I<{hai}> : Haida - -=item I<{ha}> : Hausa - -=item I<{haw}> : Hawaiian - -Hawai'ian - -=item I<{he}> : Hebrew - -(Formerly "iw".) - -=item I<{hz}> : Herero - -=item I<{hil}> : Hiligaynon - -=item I<{him}> : Himachali - -=item I<{hi}> : Hindi - -=item I<{ho}> : Hiri Motu - -=item I<{hit}> : Hittite - -(Historical) - -=item I<{hmn}> : Hmong - -=item I<{hu}> : Hungarian - -=item I<{hup}> : Hupa - -=item I<{iba}> : Iban - -=item I<{is}> : Icelandic - -=item I<{ibo}> : Igbo - -=item I<{ijo}> : Ijo - -=item I<{ilo}> : Iloko - -=item [I<{inc}> : Indic (Other)] - -=item [I<{ine}> : Indo-European (Other)] - -=item I<{id}> : Indonesian - -(Formerly "in".) - -=item I<{ia}> : Interlingua (International Auxiliary Language Association) - -(Artificial) NOT Interlingue! - -=item I<{ie}> : Interlingue - -(Artificial) NOT Interlingua! - -=item I<{iu}> : Inuktitut - -A subform of "Eskimo". - -=item I<{ik}> : Inupiaq - -A subform of "Eskimo". - -=item [I<{ira}> : Iranian (Other)] - -=item I<{ga}> : Irish - -=item I<{mga}> : Middle Irish (900-1200) - -(Historical) - -=item I<{sga}> : Old Irish (to 900) - -(Historical) - -=item [I<{iro}> : Iroquoian languages] - -=item I<{it}> : Italian - -Notable forms: it-it, it-ch - -=item I<{ja}> : Japanese - -(NOT "jp"!) 
- -=item I<{jw}> : Javanese - -=item I<{jrb}> : Judeo-Arabic - -=item I<{jpr}> : Judeo-Persian - -=item I<{kab}> : Kabyle - -=item I<{kac}> : Kachin - -=item I<{kl}> : Kalaallisut - -eq Greenlandic "Eskimo" - -=item I<{kam}> : Kamba - -=item I<{kn}> : Kannada - -NOT Canadian! - -=item I<{kau}> : Kanuri - -=item I<{kaa}> : Kara-Kalpak - -=item I<{kar}> : Karen - -=item I<{ks}> : Kashmiri - -=item I<{kaw}> : Kawi - -=item I<{kk}> : Kazakh - -=item I<{kha}> : Khasi - -=item I<{km}> : Khmer - -eq Cambodian. eq Kampuchean. - -=item [I<{khi}> : Khoisan (Other)] - -=item I<{kho}> : Khotanese - -=item I<{ki}> : Kikuyu - -eq Gikuyu. - -=item I<{kmb}> : Kimbundu - -=item I<{rw}> : Kinyarwanda - -=item I<{ky}> : Kirghiz - -=item I<{i-klingon}> : Klingon - -=item I<{kv}> : Komi - -=item I<{kon}> : Kongo - -=item I<{kok}> : Konkani - -=item I<{ko}> : Korean - -=item I<{kos}> : Kosraean - -=item I<{kpe}> : Kpelle - -=item I<{kro}> : Kru - -=item I<{kj}> : Kuanyama - -=item I<{kum}> : Kumyk - -=item I<{ku}> : Kurdish - -=item I<{kru}> : Kurukh - -=item I<{kut}> : Kutenai - -=item I<{lad}> : Ladino - -eq Judeo-Spanish. NOT Ladin (a minority language in Italy). - -=item I<{lah}> : Lahnda - -NOT Lamba! - -=item I<{lam}> : Lamba - -NOT Lahnda! - -=item I<{lo}> : Lao - -=item I<{la}> : Latin - -(Historical) NOT Ladin! NOT Ladino! - -=item I<{lv}> : Latvian - -eq Lettish. - -=item I<{lb}> : Letzeburgesch - -eq Luxemburgian, eq Luxemburger. (Formerly i-lux.) - -=item I<{lez}> : Lezghian - -=item I<{ln}> : Lingala - -=item I<{lt}> : Lithuanian - -=item I<{nds}> : Low German - -eq Low Saxon. eq Low German. eq Low Saxon. - -=item I<{loz}> : Lozi - -=item I<{lub}> : Luba-Katanga - -=item I<{lua}> : Luba-Lulua - -=item I<{lui}> : Luiseno - -eq LuiseEo. - -=item I<{lun}> : Lunda - -=item I<{luo}> : Luo (Kenya and Tanzania) - -=item I<{lus}> : Lushai - -=item I<{mk}> : Macedonian - -eq the modern Slavic language spoken in what was Yugoslavia. -NOT the form of Greek spoken in Greek Macedonia! 
- -=item I<{mad}> : Madurese - -=item I<{mag}> : Magahi - -=item I<{mai}> : Maithili - -=item I<{mak}> : Makasar - -=item I<{mg}> : Malagasy - -=item I<{ms}> : Malay - -NOT Malayalam! - -=item I<{ml}> : Malayalam - -NOT Malay! - -=item I<{mt}> : Maltese - -=item I<{mnc}> : Manchu - -=item I<{mdr}> : Mandar - -NOT Mandarin! - -=item I<{man}> : Mandingo - -=item I<{mni}> : Manipuri - -=item [I<{mno}> : Manobo languages] - -=item I<{gv}> : Manx - -=item I<{mi}> : Maori - -NOT Mari! - -=item I<{mr}> : Marathi - -=item I<{chm}> : Mari - -NOT Maori! - -=item I<{mh}> : Marshall - -eq Marshallese. - -=item I<{mwr}> : Marwari - -=item I<{mas}> : Masai - -=item [I<{myn}> : Mayan languages] - -=item I<{men}> : Mende - -=item I<{mic}> : Micmac - -=item I<{min}> : Minangkabau - -=item I<{i-mingo}> : Mingo - -eq the Irquoian language West Virginia Seneca. NOT New York Seneca! - -=item [I<{mis}> : Miscellaneous languages] - -Don't use this. - -=item I<{moh}> : Mohawk - -=item I<{mo}> : Moldavian - -eq Moldovan. - -=item [I<{mkh}> : Mon-Khmer (Other)] - -=item I<{lol}> : Mongo - -=item I<{mn}> : Mongolian - -eq Mongol. - -=item I<{mos}> : Mossi - -=item [I<{mul}> : Multiple languages] - -Not for normal use. - -=item [I<{mun}> : Munda languages] - -=item I<{nah}> : Nahuatl - -=item I<{na}> : Nauru - -=item I<{nv}> : Navajo - -eq Navaho. (Formerly i-navajo.) - -=item I<{nd}> : North Ndebele - -=item I<{nr}> : South Ndebele - -=item I<{ng}> : Ndonga - -=item I<{ne}> : Nepali - -eq Nepalese. Notable forms: ne-np ne-in. - -=item I<{new}> : Newari - -=item I<{nia}> : Nias - -=item [I<{nic}> : Niger-Kordofanian (Other)] - -=item [I<{ssa}> : Nilo-Saharan (Other)] - -=item I<{niu}> : Niuean - -=item I<{non}> : Old Norse - -(Historical) - -=item [I<{nai}> : North American Indian] - -Do not use this. - -=item I<{se}> : Northern Sami - -eq Lappish. eq Lapp. eq (Northern) Saami. 
- -=item I<{no}> : Norwegian - -Note the two following forms: - -=item I<{nb}> : Norwegian BokmEl - -(A form of Norwegian.) (Formerly no-bok.) - -=item I<{nn}> : Norwegian Nynorsk - -(A form of Norwegian.) (Formerly no-nyn.) - -=item [I<{nub}> : Nubian languages] - -=item I<{nym}> : Nyamwezi - -=item I<{nyn}> : Nyankole - -=item I<{nyo}> : Nyoro - -=item I<{nzi}> : Nzima - -=item I<{oc}> : Occitan (post 1500) - -eq ProvenEal, eq Provencal - -=item I<{oji}> : Ojibwa - -eq Ojibwe. - -=item I<{or}> : Oriya - -=item I<{om}> : Oromo - -=item I<{osa}> : Osage - -=item I<{os}> : Ossetian; Ossetic - -=item [I<{oto}> : Otomian languages] - -Group of languages collectively called "OtomE". - -=item I<{pal}> : Pahlavi - -eq Pahlevi - -=item I<{i-pwn}> : Paiwan - -eq Pariwan - -=item I<{pau}> : Palauan - -=item I<{pi}> : Pali - -(Historical?) - -=item I<{pam}> : Pampanga - -=item I<{pag}> : Pangasinan - -=item I<{pa}> : Panjabi - -eq Punjabi - -=item I<{pap}> : Papiamento - -eq Papiamentu. - -=item [I<{paa}> : Papuan (Other)] - -=item I<{fa}> : Persian - -eq Farsi. - -=item I<{peo}> : Old Persian (ca.600-400 B.C.) - -=item [I<{phi}> : Philippine (Other)] - -=item I<{phn}> : Phoenician - -(Historical) - -=item I<{pon}> : Pohnpeian - -=item I<{pl}> : Polish - -=item I<{pt}> : Portuguese - -eq Portugese. Notable forms: pt-pt pt-br. - -=item [I<{pra}> : Prakrit languages] - -=item I<{pro}> : Old ProvenEal (to 1500) - -eq Old Provencal. (Historical.) - -=item I<{ps}> : Pushto - -eq Pashto. eq Pushtu. - -=item I<{qu}> : Quechua - -eq Quecha. - -=item I<{rm}> : Raeto-Romance - -eq Romansh. - -=item I<{raj}> : Rajasthani - -=item I<{rap}> : Rapanui - -=item I<{rar}> : Rarotongan - -=item [I<{qaa}>-I : Reserved for local use.] - -=item [I<{roa}> : Romance (Other)] - -NOT Romanian! NOT Romany! NOT Romansh! - -=item I<{ro}> : Romanian - -eq Rumanian. NOT Romany! - -=item I<{rom}> : Romany - -eq Rom. NOT Romanian! - -=item I<{rn}> : Rundi - -=item I<{ru}> : Russian - -NOT White Russian! 
NOT Rusyn! - -=item [I<{sal}> : Salishan languages] - -Large language group. - -=item I<{sam}> : Samaritan Aramaic - -NOT Aramaic! - -=item [I<{smi}> : Sami languages (Other)] - -=item I<{sm}> : Samoan - -=item I<{sad}> : Sandawe - -=item I<{sg}> : Sango - -=item I<{sa}> : Sanskrit - -(Historical) - -=item I<{sat}> : Santali - -=item I<{sc}> : Sardinian - -eq Sard. - -=item I<{sas}> : Sasak - -=item I<{sco}> : Scots - -NOT Scots Gaelic! - -=item I<{sel}> : Selkup - -=item [I<{sem}> : Semitic (Other)] - -=item I<{sr}> : Serbian - -eq Serb. NOT Sorbian. - -=item I<{srr}> : Serer - -=item I<{shn}> : Shan - -=item I<{sn}> : Shona - -=item I<{sid}> : Sidamo - -=item I<{sgn-...}> : Sign Languages - -Always use with a subtag. Notable forms: sgn-gb sgn-ie sgn-ni sgn-us. - -=item I<{bla}> : Siksika - -eq Blackfoot. eq Pikanii. - -=item I<{sd}> : Sindhi - -=item I<{si}> : Sinhalese - -eq Sinhala. - -=item [I<{sit}> : Sino-Tibetan (Other)] - -=item [I<{sio}> : Siouan languages] - -=item I<{den}> : Slave (Athapascan) - -("Slavey" is a subform.) - -=item [I<{sla}> : Slavic (Other)] - -=item I<{sk}> : Slovak - -eq Slovakian. - -=item I<{sl}> : Slovenian - -eq Slovene. - -=item I<{sog}> : Sogdian - -=item I<{so}> : Somali - -=item I<{son}> : Songhai - -=item I<{snk}> : Soninke - -=item I<{wen}> : Sorbian languages - -eq Wendish. eq Sorb. eq Lusatian. eq Wend. NOT Venda! NOT Serbian! - -=item I<{nso}> : Northern Sotho - -=item I<{st}> : Southern Sotho - -eq Sutu. eq Sesotho. - -=item [I<{sai}> : South American Indian (Other)] - -=item I<{es}> : Spanish - -Notable forms: -es-ar es-bo es-cl es-co es-do es-ec es-es es-gt -es-hn es-mx es-pa es-pe es-pr es-py es-sv es-us -es-uy es-ve - -=item I<{suk}> : Sukuma - -=item I<{sux}> : Sumerian - -(Historical) - -=item I<{su}> : Sundanese - -=item I<{sus}> : Susu - -=item I<{sw}> : Swahili - -eq Kiswahili - -=item I<{ss}> : Swati - -=item I<{sv}> : Swedish - -Notable forms: sv-se sv-fi. 
- -=item I<{syr}> : Syriac - -=item I<{tl}> : Tagalog - -=item I<{ty}> : Tahitian - -=item [I<{tai}> : Tai (Other)] - -NOT Thai! - -=item I<{tg}> : Tajik - -=item I<{tmh}> : Tamashek - -=item I<{ta}> : Tamil - -=item I<{i-tao}> : Tao - -eq Yami. - -=item I<{tt}> : Tatar - -=item I<{i-tay}> : Tayal - -eq Atayal. eq Atayan. - -=item I<{te}> : Telugu - -=item I<{ter}> : Tereno - -=item I<{tet}> : Tetum - -=item I<{th}> : Thai - -NOT Tai! - -=item I<{bo}> : Tibetan - -=item I<{tig}> : Tigre - -=item I<{ti}> : Tigrinya - -=item I<{tem}> : Timne - -eq Themne. eq Timene. - -=item I<{tiv}> : Tiv - -=item I<{tli}> : Tlingit - -=item I<{tpi}> : Tok Pisin - -=item I<{tkl}> : Tokelau - -=item I<{tog}> : Tonga (Nyasa) - -NOT Tsonga! - -=item I<{to}> : Tonga (Tonga Islands) - -(Pronounced "Tong-a", not "Tong-ga") - -NOT Tsonga! - -=item I<{tsi}> : Tsimshian - -eq Sm'algyax - -=item I<{ts}> : Tsonga - -NOT Tonga! - -=item I<{i-tsu}> : Tsou - -=item I<{tn}> : Tswana - -Same as Setswana. - -=item I<{tum}> : Tumbuka - -=item I<{tr}> : Turkish - -(Typically in Roman script) - -=item I<{ota}> : Ottoman Turkish (1500-1928) - -(Typically in Arabic script) (Historical) - -=item I<{tk}> : Turkmen - -eq Turkmeni. - -=item I<{tvl}> : Tuvalu - -=item I<{tyv}> : Tuvinian - -eq Tuvan. eq Tuvin. - -=item I<{tw}> : Twi - -=item I<{uga}> : Ugaritic - -NOT Ugric! - -=item I<{ug}> : Uighur - -=item I<{uk}> : Ukrainian - -=item I<{umb}> : Umbundu - -=item I<{und}> : Undetermined - -Not a tag for normal use. - -=item I<{ur}> : Urdu - -=item I<{uz}> : Uzbek - -eq Ezbek - -=item I<{vai}> : Vai - -=item I<{ven}> : Venda - -NOT Wendish! NOT Wend! NOT Avestan! - -=item I<{vi}> : Vietnamese - -eq Viet. - -=item I<{vo}> : VolapEk - -eq Volapuk. (Artificial) - -=item I<{vot}> : Votic - -eq Votian. eq Vod. - -=item [I<{wak}> : Wakashan languages] - -=item I<{wal}> : Walamo - -eq Wolaytta. 
- -=item I<{war}> : Waray - -Presumably the Philippine language Waray-Waray (SamareEo), -not the smaller Philippine language Waray Sorsogon, nor the extinct -Australian language Waray. - -=item I<{was}> : Washo - -eq Washoe - -=item I<{cy}> : Welsh - -=item I<{wo}> : Wolof - -=item I<{x-...}> : Unregistered (Private Use) - -"x-" is a prefix for language tags that are not registered with ISO -or IANA. Example, x-double-dutch - -=item I<{xh}> : Xhosa - -=item I<{sah}> : Yakut - -=item I<{yao}> : Yao - -(The Yao in Malawi?) - -=item I<{yap}> : Yapese - -eq Yap - -=item I<{yi}> : Yiddish - -Formerly "ji". Sometimes in Roman script, sometimes in Hebrew script. - -=item I<{yo}> : Yoruba - -=item [I<{ypk}> : Yupik languages] - -Several "Eskimo" languages. - -=item I<{znd}> : Zande - -=item [I<{zap}> : Zapotec] - -(A group of languages.) - -=item I<{zen}> : Zenaga - -NOT Zend. - -=item I<{za}> : Zhuang - -=item I<{zu}> : Zulu - -=item I<{zun}> : Zuni - -eq ZuEi - -=back - -=head1 SEE ALSO - -L - -=head1 COPYRIGHT AND DISCLAIMER - -Copyright (c) 2001 Sean M. Burke. All rights reserved. - -You can redistribute and/or -modify this document under the same terms as Perl itself. - -This document is provided in the the hope that it will be -useful, but without any warranty; -without even the implied warranty of accuracy, authoritativeness, -completeness, merchantability, or fitness for a particular purpose. - -Email any corrections or questions to me. - -=head1 AUTHOR - -Sean M. Burke, sburkeE<64>cpan.org - -=cut - - -# To generate a list of just the two and three-letter codes: - -#!/usr/local/bin/perl -w - -require 5; # Time-stamp: "2001-03-13 21:53:39 MST" - # Sean M. 
Burke, sburke@cpan.org - # This program is for generating the language_codes.txt file -use strict; -use LWP::Simple; -use HTML::TreeBuilder 3.10; -my $root = HTML::TreeBuilder->new(); -my $url = 'http://lcweb.loc.gov/standards/iso639-2/bibcodes.html'; -$root->parse(get($url) || die "Can't get $url"); -$root->eof(); - -my @codes; - -foreach my $tr ($root->find_by_tag_name('tr')) { - my @f = map $_->as_text(), $tr->content_list(); - #print map("<$_> ", @f), "\n"; - next unless @f == 5; - pop @f; # nix the French name - next if $f[-1] eq 'Language Name (English)'; # it's a header line - my $xx = splice(@f, 2,1); # pull out the two-letter code - $f[-1] =~ s/^\s+//; - $f[-1] =~ s/\s+$//; - if($xx =~ m/[a-zA-Z]/) { # there's a two-letter code for it - push @codes, [ lc($f[-1]), "$xx\t$f[-1]\n" ]; - } else { # print the three-letter codes. - if($f[0] eq $f[1]) { - push @codes, [ lc($f[-1]), "$f[1]\t$f[2]\n" ]; - } else { # shouldn't happen - push @codes, [ lc($f[-1]), "@f !!!!!!!!!!\n" ]; - } - } -} - -print map $_->[1], sort {; $a->[0] cmp $b->[0] } @codes; -print "[ based on $url\n at ", scalar(localtime), "]\n", - "[Note: doesn't include IANA-registered codes.]\n"; -exit; -__END__ - diff --git a/lib/I18N/LangTags/README b/lib/I18N/LangTags/README new file mode 100644 index 0000000..2ac6053 --- /dev/null +++ b/lib/I18N/LangTags/README @@ -0,0 +1,78 @@ +README for I18N::LangTags + Time-stamp: "2001-05-29 21:52:15 MDT" + + I18N::LangTags + +I18N::LangTags - functions for dealing with RFC3066-style language +tags + +Language tags are a formalism, described in RFC 3066 (obsoleting +1766), for declaring what language form (language and possibly +dialect) a given chunk of information is in. + +This library provides functions for common tasks involving language +tags (notably the extraction of them, comparing them, and testing the +formal validity of them) as is needed in a variety of protocols and +applications. + + +I18N::LangTags::List -- tags and names for human languages. 
This +module goes from known language tag names ("fr-CA") to their English +names ("Canadian French"). Its documentation also lists the several +hundred known tags and some common subforms. You may find this useful +as a reference. + + +See the POD for more information. + + +INSTALLATION + +You install I18N::LangTags and I18N::LangTags::List, as you would +install any perl module library, by running these commands: + + perl Makefile.PL + make + make test + make install + +If you want to install a private copy of I18N::LangTags in your home +directory, then you should try to produce the initial Makefile with +something like this command: + + perl Makefile.PL LIB=~/perl + +See perldoc perlmodinstall for more information on installing modules. + + +DOCUMENTATION + +POD-format documentation is included in LangTags.pm. POD is readable +with the 'perldoc' utility. See ChangeLog for recent changes. + + +SUPPORT + +Questions, bug reports, useful code bits, and suggestions for +I18N::LangTags should just be sent to me at sburke@cpan.org + + +AVAILABILITY + +The latest version of I18N::LangTags is available from the +Comprehensive Perl Archive Network (CPAN). Visit +<http://www.perl.com/CPAN/> to find a CPAN site near you. + + +COPYRIGHT + +Copyright 1998-2001, Sean M. Burke <sburke@cpan.org>, all rights +reserved. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. diff --git a/lib/I18N/LangTags/test.pl b/lib/I18N/LangTags/test.pl index 06c178e..e9e96e8 100644 --- a/lib/I18N/LangTags/test.pl +++ b/lib/I18N/LangTags/test.pl @@ -1,21 +1,14 @@ -BEGIN { - chdir 't' if -d 't'; - @INC = '../lib'; -} +# Before `make install' is performed this script should be runnable with +# `make test'.
After `make install' it should work as `perl test.pl' ######################### We start with some black magic to print on failure. require 5; - + # Time-stamp: "2001-06-20 01:43:31 MDT" use strict; use Test; -BEGIN { plan tests => 23 }; +BEGIN { plan tests => 46 }; BEGIN { ok 1 } -use I18N::LangTags qw(is_language_tag same_language_tag - extract_language_tags super_languages - similarity_language_tag is_dialect_of - locale2language_tag alternate_language_tags - encode_language_tag - ); +use I18N::LangTags (':ALL'); ok !is_language_tag(''); ok is_language_tag('fr'); @@ -41,5 +34,44 @@ ok 1 == similarity_language_tag('en-ca', 'en-us'); ok 2 == similarity_language_tag('en-us-southern', 'en-us-western'); ok 2 == similarity_language_tag('en-us-southern', 'en-us'); -# print "So there!\n"; +ok grep $_ eq 'hi', panic_languages('kok'); +ok grep $_ eq 'en', panic_languages('x-woozle-wuzzle'); +ok ! grep $_ eq 'mr', panic_languages('it'); +ok grep $_ eq 'es', panic_languages('it'); +ok grep $_ eq 'it', panic_languages('es'); + + +print "# Now the ::List tests...\n"; +use I18N::LangTags::List; +foreach my $lt (qw( + en + en-us + en-kr + el + elx + i-mingo + i-mingo-tom + x-mingo-tom + it + it-it + it-IT + it-FR + yi + ji + cre-syllabic + cre-syllabic-western + cre-western + cre-latin +)) { + my $name = I18N::LangTags::List::name($lt); + if($name) { + ok(1); + print "# $lt -> $name\n"; + } else { + ok(0); + print "# Failed lookup on $lt\n"; + } +} + +print "# So there!\n";