[p5sagit/p5-mst-13.2.git] / lib / I18N / LangTags.pm


# Time-stamp: "2001-05-25 07:36:55 MDT"
# Sean M. Burke <sburke@cpan.org>

require 5.000;
package I18N::LangTags;
use strict;
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug
require Exporter;
# $Debug = 0;
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(is_language_tag same_language_tag
                extract_language_tags super_languages
                similarity_language_tag is_dialect_of
                locale2language_tag alternate_language_tags
                encode_language_tag
               );

$VERSION = "0.21";

=head1 NAME

I18N::LangTags - functions for dealing with RFC3066-style language tags

=head1 SYNOPSIS

    use I18N::LangTags qw(is_language_tag same_language_tag
                          extract_language_tags super_languages
                          similarity_language_tag is_dialect_of
                          locale2language_tag alternate_language_tags
                          encode_language_tag
                         );

...or whatever of those functions you want to import.  Those are
all the exportable functions -- you're free to import only some,
or none at all.  By default, none are imported.

If you don't import any of these functions, assume a C<&I18N::LangTags::>
in front of all the function names in the following examples.

=head1 DESCRIPTION

Language tags are a formalism, described in RFC 3066 (obsoleting
1766), for declaring what language form (language and possibly
dialect) a given chunk of information is in.

This library provides functions for common tasks involving language
tags as they are needed in a variety of protocols and applications.

Please see the "See Also" references for a thorough explanation
of how to correctly use language tags.

=over

=cut

###########################################################################

=item * the function is_language_tag($lang1)

Returns true iff $lang1 is a formally valid language tag.

   is_language_tag("fr")            is TRUE
   is_language_tag("x-jicarilla")   is FALSE
       (Subtags can be 8 chars long at most -- 'jicarilla' is 9)

   is_language_tag("sgn-US")    is TRUE
       (That's American Sign Language)

   is_language_tag("i-Klikitat")    is TRUE
       (True without regard to the fact noone has actually
        registered Klikitat -- it's a formally valid tag)

   is_language_tag("fr-patois")     is TRUE
       (Formally valid -- altho descriptively weak!)

   is_language_tag("Spanish")       is FALSE
   is_language_tag("french-patois") is FALSE
       (No good -- first subtag has to match
        /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)

   is_language_tag("x-borg-prot2532") is TRUE
       (Yes, subtags can contain digits, as of RFC3066)

=cut

sub is_language_tag {

  ## Changes in the language tagging standards may have to be reflected here.

  my($tag) = lc($_[0]);

  return 0 if $tag eq "i" or $tag eq "x";
  # Bad degenerate cases the following
  #  regexp would erroneously let pass

  return $tag =~ 
    /^(?:  # First subtag
         [xi] | [a-z]{2,3}
      )
      (?:  # Subtags thereafter
         -           # separator
         [a-z0-9]{1,8}  # subtag  
      )*
    $/xs ? 1 : 0;
}

###########################################################################

=item * the function extract_language_tags($whatever)

Returns a list of whatever looks like formally valid language tags
in $whatever.  Not very smart, so don't get too creative with
what you want to feed it.

  extract_language_tags("fr, fr-ca, i-mingo")
    returns:   ('fr', 'fr-ca', 'i-mingo')

  extract_language_tags("It's like this: I'm in fr -- French!")
    returns:   ('It', 'in', 'fr')
  (So don't just feed it any old thing.)

The output is untainted.  If you don't know what tainting is,
don't worry about it.

=cut

sub extract_language_tags {

  ## Changes in the language tagging standards may have to be reflected here.

  my($text) =
    $_[0] =~ m/(.+)/  # to make for an untainted result
    ? $1 : ''
  ;
  
  return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
    $text =~ 
    m/
      \b
      (?:  # First subtag
         [iIxX] | [a-zA-Z]{2,3}
      )
      (?:  # Subtags thereafter
         -           # separator
         [a-zA-Z0-9]{1,8}  # subtag  
      )*
      \b
    /xsg
  );
}

###########################################################################

=item * the function same_language_tag($lang1, $lang2)

Returns true iff $lang1 and $lang2 are acceptable variant tags
representing the same language-form.

   same_language_tag('x-kadara', 'i-kadara')  is TRUE
      (The x/i- alternation doesn't matter)
   same_language_tag('X-KADARA', 'i-kadara')  is TRUE
      (...and neither does case)
   same_language_tag('en',       'en-US')     is FALSE
      (all-English is not the SAME as US English)
   same_language_tag('x-kadara', 'x-kadar')   is FALSE
      (these are totally unrelated tags)

C<same_language_tag> works by just seeing whether
C<encode_language_tag($lang1)> is the same as
C<encode_language_tag($lang2)>.

(Yes, I know this function is named a bit oddly.  Call it historic
reasons.)

=cut

sub same_language_tag {
  my $el1 = &encode_language_tag($_[0]);
  return 0 unless defined $el1;
   # this avoids the problem of
   # encode_language_tag($lang1) eq and encode_language_tag($lang2)
   # being true if $lang1 and $lang2 are both undef

  return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
}

###########################################################################

=item * the function similarity_language_tag($lang1, $lang2)

Returns an integer representing the degree of similarity between
tags $lang1 and $lang2 (the order of which does not matter), where
similarity is the number of common elements on the left,
without regard to case and to x/i- alternation.

   similarity_language_tag('fr', 'fr-ca')           is 1
      (one element in common)
   similarity_language_tag('fr-ca', 'fr-FR')        is 1
      (one element in common)

   similarity_language_tag('fr-CA-joual',
                           'fr-CA-PEI')             is 2
   similarity_language_tag('fr-CA-joual', 'fr-CA')  is 2
      (two elements in common)

   similarity_language_tag('x-kadara', 'i-kadara')  is 1
      (x/i- doesn't matter)

   similarity_language_tag('en',       'x-kadar')   is 0
   similarity_language_tag('x-kadara', 'x-kadar')   is 0
      (unrelated tags -- no similarity)

   similarity_language_tag('i-cree-syllabic',
                           'i-cherokee-syllabic')   is 0
      (no B<leftmost> elements in common!)

=cut

sub similarity_language_tag {
  my $lang1 = &encode_language_tag($_[0]);
  my $lang2 = &encode_language_tag($_[1]);

  # NB: (i-sil-...)?  (i-sgn-...)?

  return undef if !defined($lang1) and !defined($lang2);
  return 0 if !defined($lang1) or !defined($lang2);

  my @l1_subtags = split('-', $lang1);
  my @l2_subtags = split('-', $lang2);
  my $similarity = 0;

  while(@l1_subtags and @l2_subtags) {
    if(shift(@l1_subtags) eq shift(@l2_subtags)) {
      ++$similarity;
    } else {
      last;
    } 
  }
  return $similarity;
}

###########################################################################

=item * the function is_dialect_of($lang1, $lang2)

Returns true iff language tag $lang1 represents a subdialect of
language tag $lang2.

B<Get the order right!  It doesn't work the other way around!>

   is_dialect_of('en-US', 'en')            is TRUE
     (American English IS a dialect of all-English)

   is_dialect_of('fr-CA-joual', 'fr-CA')   is TRUE
   is_dialect_of('fr-CA-joual', 'fr')      is TRUE
     (Joual is a dialect of (a dialect of) French)

   is_dialect_of('en', 'en-US')            is FALSE
     (all-English is a NOT dialect of American English)

   is_dialect_of('fr', 'en-CA')            is FALSE

   is_dialect_of('en',    'en'   )            is TRUE
   is_dialect_of('en-US', 'en-US')            is TRUE
     (B<Note:> these are degenerate cases)

   is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
     (the x/i thing doesn't matter, nor does case)

=cut

sub is_dialect_of {

  my $lang1 = &encode_language_tag($_[0]);
  my $lang2 = &encode_language_tag($_[1]);

  return undef if !defined($lang1) and !defined($lang2);
  return 0 if !defined($lang1) or !defined($lang2);

  return 1 if $lang1 eq $lang2;
  return 0 if length($lang1) < length($lang2);

  $lang1 .= '-';
  $lang2 .= '-';
  return
    (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
}

###########################################################################

=item * the function super_languages($lang1)

Returns a list of language tags that are superordinate tags to $lang1
-- it gets this by removing subtags from the end of $lang1 until
nothing (or just "i" or "x") is left.

   super_languages("fr-CA-joual")  is  ("fr-CA", "fr")

   super_languages("en-AU")  is  ("en")

   super_languages("en")  is  empty-list, ()

   super_languages("i-cherokee")  is  empty-list, ()
    ...not ("i"), which would be illegal as well as pointless.

If $lang1 is not a valid language tag, returns empty-list in
a list context, undef in a scalar context.

A notable and rather unavoidable problem with this method:
"x-mingo-tom" has an "x" because the whole tag isn't an
IANA-registered tag -- but super_languages('x-mingo-tom') is
('x-mingo') -- which isn't really right, since 'i-mingo' is
registered.  But this module has no way of knowing that.  (But note
that same_language_tag('x-mingo', 'i-mingo') is TRUE.)

More importantly, you assume I<at your peril> that superordinates of
$lang1 are mutually intelligible with $lang1.  Consider this
carefully.

=cut 

sub super_languages {
  my $lang1 = $_[0];
  return() unless defined($lang1) && &is_language_tag($lang1);
  my @l1_subtags = split('-', $lang1);

  ## Changes in the language tagging standards may have to be reflected here.

  # NB: (i-sil-...)?

  my @supers = ();
  foreach my $bit (@l1_subtags) {
    push @supers, 
      scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
  }
  pop @supers if @supers;
  shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
  return reverse @supers;
}

###########################################################################

=item * the function locale2language_tag($locale_identifier)

This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
and maps it to a language tag.  If it's not mappable (as with,
notably, "C" and "POSIX"), this returns empty-list in a list context,
or undef in a scalar context.

   locale2language_tag("en") is "en"

   locale2language_tag("en_US") is "en-US"

   locale2language_tag("en_US.ISO8859-1") is "en-US"

   locale2language_tag("C") is undef or ()

   locale2language_tag("POSIX") is undef or ()

   locale2language_tag("POSIX") is undef or ()

I'm not totally sure that locale names map satisfactorily to language
tags.  Think REAL hard about how you use this.  YOU HAVE BEEN WARNED.

The output is untainted.  If you don't know what tainting is,
don't worry about it.

=cut 

sub locale2language_tag {
  my $lang =
    $_[0] =~ m/(.+)/  # to make for an untainted result
    ? $1 : ''
  ;

  return $lang if &is_language_tag($lang); # like "en"

  $lang =~ tr<_><->;  # "en_US" -> en-US
  $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s;  # "en_US.ISO8859-1" -> en-US

  return $lang if &is_language_tag($lang);

  return;
}

###########################################################################

=item * the function encode_language_tag($lang1)

This function, if given a language tag, returns an encoding of it such
that:

* tags representing different languages never get the same encoding.

* tags representing the same language always get the same encoding.

* an encoding of a formally valid language tag always is a string
value that is defined, has length, and is true if considered as a
boolean.

Note that the encoding itself is B<not> a formally valid language tag.
Note also that you cannot, currently, go from an encoding back to a
language tag that it's an encoding of.

Note also that you B<must> consider the encoded value as atomic; i.e.,
you should not consider it as anything but an opaque, unanalysable
string value.  (The internals of the encoding method may change in
future versions, as the language tagging standard changes over time.)

C<encode_language_tag> returns undef if given anything other than a
formally valid language tag.

The reason C<encode_language_tag> exists is because different language
tags may represent the same language; this is normally treatable with
C<same_language_tag>, but consider this situation:

You have a data file that expresses greetings in different languages.
Its format is "[language tag]=[how to say 'Hello']", like:

          en-US=Hiho
          fr=Bonjour
          i-mingo=Hau'

And suppose you write a program that reads that file and then runs as
a daemon, answering client requests that specify a language tag and
then expect the string that says how to greet in that language.  So an
interaction looks like:

          greeting-client asks:    fr
          greeting-server answers: Bonjour

So far so good.  But suppose the way you're implementing this is:

          my %greetings;
          die unless open(IN, "<in.dat");
          while(<IN>) {
            chomp;
            next unless /^([^=]+)=(.+)/s;
            my($lang, $expr) = ($1, $2);
            $greetings{$lang} = $expr;
          }
          close(IN);

at which point %greetings has the contents:

          "en-US"   => "Hiho"
          "fr"      => "Bonjour"
          "i-mingo" => "Hau'"

And suppose then that you answer client requests for language $wanted
by just looking up $greetings{$wanted}.

If the client asks for "fr", that will look up successfully in
%greetings, to the value "Bonjour".  And if the client asks for
"i-mingo", that will look up successfully in %greetings, to the value
"Hau'".

But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
lookup in %greetings fails.  That's the Wrong Thing.

You could instead do lookups on $wanted with:

          use I18N::LangTags qw(same_language_tag);
          my $repsonse = '';
          foreach my $l2 (keys %greetings) {
            if(same_language_tag($wanted, $l2)) {
              $response = $greetings{$l2};
              last;
            }
          }

But that's rather inefficient.  A better way to do it is to start your
program with:

          use I18N::LangTags qw(encode_language_tag);
          my %greetings;
          die unless open(IN, "<in.dat");
          while(<IN>) {
            chomp;
            next unless /^([^=]+)=(.+)/s;
            my($lang, $expr) = ($1, $2);
            $greetings{
                        encode_language_tag($lang)
                      } = $expr;
          }
          close(IN);

and then just answer client requests for language $wanted by just
looking up

          $greetings{encode_language_tag($wanted)}

And that does the Right Thing.

=cut

sub encode_language_tag {
  # Only similarity_language_tag() is allowed to analyse encodings!

  ## Changes in the language tagging standards may have to be reflected here.

  my($tag) = uc($_[0]); # smash case
  return undef unless &is_language_tag($tag);
   # If it's not a language tag, its encoding is undef

  $tag =~ s/^[xiXI]-//s;
   # Just lop off any leading "x/i-"
   # Or I suppose I could do s/^[xiXI]-/_/s or something.

  return "~$tag";
}

#--------------------------------------------------------------------------

=item * the function alternate_language_tags($lang1)

This function, if given a language tag, returns all language tags that
are alternate forms of this language tag.  (There is little
alternation in the C<current> language tagging formalism, but
extensions to the formalism are under consideration which could add a
great deal of alternation.)

Examples from the current formalism:

          alternate_language_tags('en')           is   ()
          alternate_language_tags('x-mingo-tom')  is   ('i-mingo-tom')
          alternate_language_tags('x-klikitat')   is   ('i-klikitat')
          alternate_language_tags('i-klikitat')   is   ('x-klikitat')

This function returns undef if given anything other than a formally
valid language tag.

=cut

my %alt = qw( i x   x i   I X   X I );
sub alternate_language_tags {
  ## Changes in the language tagging standards may have to be reflected here.
  my $tag = $_[0];
  return() unless &is_language_tag($tag);

 # might as well preserve case

  if($tag =~ /^([XIxi])(-.+)/) {
    # This handles all the alternation that exists CURRENTLY
    return($alt{$1} . $2);
  }
  return();
}

###########################################################################

=back

=head1 ABOUT LOWERCASING

I've considered making all the above functions that output language
tags return all those tags strictly in lowercase.  Having all your
language tags in lowercase does make some things easier.  But you
might as well just lowercase as you like, or call
C<encode_language_tag($lang1)> where appropriate.

=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS

In some future version of I18N::LangTags, I plan to include support
for RFC2482-style language tags -- which are basically just normal
language tags with their ASCII characters shifted into Plane 14.

=head1 SEE ALSO

* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
Identification of Languages".  (Obsoletes RFC 1766)

* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
Character Sets and Languages".

* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
Value and Encoded Word Extensions: Character Sets, Languages, and
Continuations".

* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, 
"Language Tagging in Unicode Plain Text".

* Locale::Codes, in
C<http://www.perl.com/CPAN/modules/by-module/Locale/>

* ISO 639, "Code for the representation of names of languages",
C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>

* ISO 639-2, "Codes for the representation of names of languages",
including three-letter codes,
C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>

* The IANA list of registered languages (hopefully up-to-date),
C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>

=head1 COPYRIGHT

Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

The programs and documentation in this dist are distributed in
the hope that they will be useful, but without any warranty; without
even the implied warranty of merchantability or fitness for a
particular purpose.

=head1 AUTHOR

Sean M. Burke C<sburke@cpan.org>

=cut

1;

__END__
Commit	Line	Data
4b053158	1
	2	# Time-stamp: "2001-05-25 07:36:55 MDT"
	3	# Sean M. Burke <sburke@cpan.org>
	4
	5	require 5.000;
	6	package I18N::LangTags;
	7	use strict;
	8	use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug
	9	require Exporter;
	10	# $Debug = 0;
	11	@ISA = qw(Exporter);
	12	@EXPORT = qw();
	13	@EXPORT_OK = qw(is_language_tag same_language_tag
	14	extract_language_tags super_languages
	15	similarity_language_tag is_dialect_of
	16	locale2language_tag alternate_language_tags
	17	encode_language_tag
	18	);
	19
	20	$VERSION = "0.21";
	21
	22	=head1 NAME
	23
	24	I18N::LangTags - functions for dealing with RFC3066-style language tags
	25
	26	=head1 SYNOPSIS
	27
	28	use I18N::LangTags qw(is_language_tag same_language_tag
	29	extract_language_tags super_languages
	30	similarity_language_tag is_dialect_of
	31	locale2language_tag alternate_language_tags
	32	encode_language_tag
	33	);
	34
	35	...or whatever of those functions you want to import. Those are
	36	all the exportable functions -- you're free to import only some,
	37	or none at all. By default, none are imported.
	38
	39	If you don't import any of these functions, assume a C<&I18N::LangTags::>
	40	in front of all the function names in the following examples.
	41
	42	=head1 DESCRIPTION
	43
	44	Language tags are a formalism, described in RFC 3066 (obsoleting
	45	1766), for declaring what language form (language and possibly
	46	dialect) a given chunk of information is in.
	47
	48	This library provides functions for common tasks involving language
	49	tags as they are needed in a variety of protocols and applications.
	50
	51	Please see the "See Also" references for a thorough explanation
	52	of how to correctly use language tags.
	53
	54	=over
	55
	56	=cut
	57
	58	###########################################################################
	59
	60	=item * the function is_language_tag($lang1)
	61
	62	Returns true iff $lang1 is a formally valid language tag.
	63
	64	is_language_tag("fr") is TRUE
65	is_language_tag("x-jicarilla") is FALSE
66	(Subtags can be 8 chars long at most -- 'jicarilla' is 9)
67
68	is_language_tag("sgn-US") is TRUE
69	(That's American Sign Language)
70
71	is_language_tag("i-Klikitat") is TRUE
72	(True without regard to the fact noone has actually
73	registered Klikitat -- it's a formally valid tag)
74
75	is_language_tag("fr-patois") is TRUE
76	(Formally valid -- altho descriptively weak!)
77
78	is_language_tag("Spanish") is FALSE
79	is_language_tag("french-patois") is FALSE
80	(No good -- first subtag has to match
81	/^([xXiI]\|[a-zA-Z]{2,3})$/ -- see RFC3066)
82
83	is_language_tag("x-borg-prot2532") is TRUE
84	(Yes, subtags can contain digits, as of RFC3066)
85
86	=cut
87
88	sub is_language_tag {
89
90	## Changes in the language tagging standards may have to be reflected here.
91
92	my($tag) = lc($_[0]);
93
94	return 0 if $tag eq "i" or $tag eq "x";
95	# Bad degenerate cases the following
96	# regexp would erroneously let pass
97
98	return $tag =~
99	/^(?: # First subtag
100	[xi] \| [a-z]{2,3}
101	)
102	(?: # Subtags thereafter
103	- # separator
104	[a-z0-9]{1,8} # subtag
105	)*
106	$/xs ? 1 : 0;
107	}
108
109	###########################################################################
110
111	=item * the function extract_language_tags($whatever)
112
113	Returns a list of whatever looks like formally valid language tags
114	in $whatever. Not very smart, so don't get too creative with
115	what you want to feed it.
116
117	extract_language_tags("fr, fr-ca, i-mingo")
118	returns: ('fr', 'fr-ca', 'i-mingo')
119
120	extract_language_tags("It's like this: I'm in fr -- French!")
121	returns: ('It', 'in', 'fr')
122	(So don't just feed it any old thing.)
123
124	The output is untainted. If you don't know what tainting is,
125	don't worry about it.
126
127	=cut
128
129	sub extract_language_tags {
130
131	## Changes in the language tagging standards may have to be reflected here.
132
133	my($text) =
134	$_[0] =~ m/(.+)/ # to make for an untainted result
135	? $1 : ''
136	;
137
138	return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
139	$text =~
140	m/
141	\b
142	(?: # First subtag
143	[iIxX] \| [a-zA-Z]{2,3}
144	)
145	(?: # Subtags thereafter
146	- # separator
147	[a-zA-Z0-9]{1,8} # subtag
148	)*
149	\b
150	/xsg
151	);
152	}
153
154	###########################################################################
155
156	=item * the function same_language_tag($lang1, $lang2)
157
158	Returns true iff $lang1 and $lang2 are acceptable variant tags
159	representing the same language-form.
160
161	same_language_tag('x-kadara', 'i-kadara') is TRUE
162	(The x/i- alternation doesn't matter)
163	same_language_tag('X-KADARA', 'i-kadara') is TRUE
164	(...and neither does case)
165	same_language_tag('en', 'en-US') is FALSE
166	(all-English is not the SAME as US English)
167	same_language_tag('x-kadara', 'x-kadar') is FALSE
168	(these are totally unrelated tags)
169
170	C<same_language_tag> works by just seeing whether
171	C<encode_language_tag($lang1)> is the same as
172	C<encode_language_tag($lang2)>.
173
174	(Yes, I know this function is named a bit oddly. Call it historic
175	reasons.)
176
177	=cut
178
179	sub same_language_tag {
180	my $el1 = &encode_language_tag($_[0]);
181	return 0 unless defined $el1;
182	# this avoids the problem of
183	# encode_language_tag($lang1) eq and encode_language_tag($lang2)
184	# being true if $lang1 and $lang2 are both undef
185
186	return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
187	}
188
189	###########################################################################
190
191	=item * the function similarity_language_tag($lang1, $lang2)
192
193	Returns an integer representing the degree of similarity between
194	tags $lang1 and $lang2 (the order of which does not matter), where
195	similarity is the number of common elements on the left,
196	without regard to case and to x/i- alternation.
197
198	similarity_language_tag('fr', 'fr-ca') is 1
199	(one element in common)
200	similarity_language_tag('fr-ca', 'fr-FR') is 1
201	(one element in common)
202
203	similarity_language_tag('fr-CA-joual',
204	'fr-CA-PEI') is 2
205	similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
206	(two elements in common)
207
208	similarity_language_tag('x-kadara', 'i-kadara') is 1
209	(x/i- doesn't matter)
210
211	similarity_language_tag('en', 'x-kadar') is 0
212	similarity_language_tag('x-kadara', 'x-kadar') is 0
213	(unrelated tags -- no similarity)
214
215	similarity_language_tag('i-cree-syllabic',
216	'i-cherokee-syllabic') is 0
217	(no B<leftmost> elements in common!)
218
219	=cut
220
221	sub similarity_language_tag {
222	my $lang1 = &encode_language_tag($_[0]);
223	my $lang2 = &encode_language_tag($_[1]);
224
225	# NB: (i-sil-...)? (i-sgn-...)?
226
227	return undef if !defined($lang1) and !defined($lang2);
228	return 0 if !defined($lang1) or !defined($lang2);
229
230	my @l1_subtags = split('-', $lang1);
231	my @l2_subtags = split('-', $lang2);
232	my $similarity = 0;
233
234	while(@l1_subtags and @l2_subtags) {
235	if(shift(@l1_subtags) eq shift(@l2_subtags)) {
236	++$similarity;
237	} else {
238	last;
239	}
240	}
241	return $similarity;
242	}
243
244	###########################################################################
245
246	=item * the function is_dialect_of($lang1, $lang2)
247
248	Returns true iff language tag $lang1 represents a subdialect of
249	language tag $lang2.
250
251	B<Get the order right! It doesn't work the other way around!>
252
253	is_dialect_of('en-US', 'en') is TRUE
254	(American English IS a dialect of all-English)
255
256	is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
257	is_dialect_of('fr-CA-joual', 'fr') is TRUE
258	(Joual is a dialect of (a dialect of) French)
259
260	is_dialect_of('en', 'en-US') is FALSE
261	(all-English is a NOT dialect of American English)
262
263	is_dialect_of('fr', 'en-CA') is FALSE
264
265	is_dialect_of('en', 'en' ) is TRUE
266	is_dialect_of('en-US', 'en-US') is TRUE
267	(B<Note:> these are degenerate cases)
268
269	is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
270	(the x/i thing doesn't matter, nor does case)
271
272	=cut
273
274	sub is_dialect_of {
275
276	my $lang1 = &encode_language_tag($_[0]);
277	my $lang2 = &encode_language_tag($_[1]);
278
279	return undef if !defined($lang1) and !defined($lang2);
280	return 0 if !defined($lang1) or !defined($lang2);
281
282	return 1 if $lang1 eq $lang2;
283	return 0 if length($lang1) < length($lang2);
284
285	$lang1 .= '-';
286	$lang2 .= '-';
287	return
288	(substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
289	}
290
291	###########################################################################
292
293	=item * the function super_languages($lang1)
294
295	Returns a list of language tags that are superordinate tags to $lang1
296	-- it gets this by removing subtags from the end of $lang1 until
297	nothing (or just "i" or "x") is left.
298
299	super_languages("fr-CA-joual") is ("fr-CA", "fr")
300
301	super_languages("en-AU") is ("en")
302
303	super_languages("en") is empty-list, ()
304
305	super_languages("i-cherokee") is empty-list, ()
306	...not ("i"), which would be illegal as well as pointless.
307
308	If $lang1 is not a valid language tag, returns empty-list in
309	a list context, undef in a scalar context.
310
311	A notable and rather unavoidable problem with this method:
312	"x-mingo-tom" has an "x" because the whole tag isn't an
313	IANA-registered tag -- but super_languages('x-mingo-tom') is
314	('x-mingo') -- which isn't really right, since 'i-mingo' is
315	registered. But this module has no way of knowing that. (But note
316	that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
317
318	More importantly, you assume I<at your peril> that superordinates of
319	$lang1 are mutually intelligible with $lang1. Consider this
320	carefully.
321
322	=cut
323
324	sub super_languages {
325	my $lang1 = $_[0];
326	return() unless defined($lang1) && &is_language_tag($lang1);
327	my @l1_subtags = split('-', $lang1);
328
329	## Changes in the language tagging standards may have to be reflected here.
330
331	# NB: (i-sil-...)?
332
333	my @supers = ();
334	foreach my $bit (@l1_subtags) {
335	push @supers,
336	scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
337	}
338	pop @supers if @supers;
339	shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
340	return reverse @supers;
341	}
342
343	###########################################################################
344
345	=item * the function locale2language_tag($locale_identifier)
346
347	This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
348	and maps it to a language tag. If it's not mappable (as with,
349	notably, "C" and "POSIX"), this returns empty-list in a list context,
350	or undef in a scalar context.
351
352	locale2language_tag("en") is "en"
353
354	locale2language_tag("en_US") is "en-US"
355
356	locale2language_tag("en_US.ISO8859-1") is "en-US"
357
358	locale2language_tag("C") is undef or ()
359
360	locale2language_tag("POSIX") is undef or ()
361
362	locale2language_tag("POSIX") is undef or ()
363
364	I'm not totally sure that locale names map satisfactorily to language
365	tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
366
367	The output is untainted. If you don't know what tainting is,
368	don't worry about it.
369
370	=cut
371
372	sub locale2language_tag {
373	my $lang =
374	$_[0] =~ m/(.+)/ # to make for an untainted result
375	? $1 : ''
376	;
377
378	return $lang if &is_language_tag($lang); # like "en"
379
380	$lang =~ tr<_><->; # "en_US" -> en-US
381	$lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US
382
383	return $lang if &is_language_tag($lang);
384
385	return;
386	}
387
388	###########################################################################
389
390	=item * the function encode_language_tag($lang1)
391
392	This function, if given a language tag, returns an encoding of it such
393	that:
394
395	* tags representing different languages never get the same encoding.
396
397	* tags representing the same language always get the same encoding.
398
399	* an encoding of a formally valid language tag always is a string
400	value that is defined, has length, and is true if considered as a
401	boolean.
402
403	Note that the encoding itself is B<not> a formally valid language tag.
404	Note also that you cannot, currently, go from an encoding back to a
405	language tag that it's an encoding of.
406
407	Note also that you B<must> consider the encoded value as atomic; i.e.,
408	you should not consider it as anything but an opaque, unanalysable
409	string value. (The internals of the encoding method may change in
410	future versions, as the language tagging standard changes over time.)
411
412	C<encode_language_tag> returns undef if given anything other than a
413	formally valid language tag.
414
415	The reason C<encode_language_tag> exists is because different language
416	tags may represent the same language; this is normally treatable with
417	C<same_language_tag>, but consider this situation:
418
419	You have a data file that expresses greetings in different languages.
420	Its format is "[language tag]=[how to say 'Hello']", like:
421
422	en-US=Hiho
423	fr=Bonjour
424	i-mingo=Hau'
425
426	And suppose you write a program that reads that file and then runs as
427	a daemon, answering client requests that specify a language tag and
428	then expect the string that says how to greet in that language. So an
429	interaction looks like:
430
431	greeting-client asks: fr
432	greeting-server answers: Bonjour
433
434	So far so good. But suppose the way you're implementing this is:
435
436	my %greetings;
437	die unless open(IN, "<in.dat");
438	while(<IN>) {
439	chomp;
440	next unless /^([^=]+)=(.+)/s;
441	my($lang, $expr) = ($1, $2);
442	$greetings{$lang} = $expr;
443	}
444	close(IN);
445
446	at which point %greetings has the contents:
447
448	"en-US" => "Hiho"
449	"fr" => "Bonjour"
450	"i-mingo" => "Hau'"
451
452	And suppose then that you answer client requests for language $wanted
453	by just looking up $greetings{$wanted}.
454
455	If the client asks for "fr", that will look up successfully in
456	%greetings, to the value "Bonjour". And if the client asks for
457	"i-mingo", that will look up successfully in %greetings, to the value
458	"Hau'".
459
460	But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
461	lookup in %greetings fails. That's the Wrong Thing.
462
463	You could instead do lookups on $wanted with:
464
465	use I18N::LangTags qw(same_language_tag);
466	my $repsonse = '';
467	foreach my $l2 (keys %greetings) {
468	if(same_language_tag($wanted, $l2)) {
469	$response = $greetings{$l2};
470	last;
471	}
472	}
473
474	But that's rather inefficient. A better way to do it is to start your
475	program with:
476
477	use I18N::LangTags qw(encode_language_tag);
478	my %greetings;
479	die unless open(IN, "<in.dat");
480	while(<IN>) {
481	chomp;
482	next unless /^([^=]+)=(.+)/s;
483	my($lang, $expr) = ($1, $2);
484	$greetings{
485	encode_language_tag($lang)
486	} = $expr;
487	}
488	close(IN);
489
490	and then just answer client requests for language $wanted by just
491	looking up
492
493	$greetings{encode_language_tag($wanted)}
494
495	And that does the Right Thing.
496
497	=cut
498
499	sub encode_language_tag {
500	# Only similarity_language_tag() is allowed to analyse encodings!
501
502	## Changes in the language tagging standards may have to be reflected here.
503
504	my($tag) = uc($_[0]); # smash case
505	return undef unless &is_language_tag($tag);
506	# If it's not a language tag, its encoding is undef
507
508	$tag =~ s/^[xiXI]-//s;
509	# Just lop off any leading "x/i-"
510	# Or I suppose I could do s/^[xiXI]-/_/s or something.
511
512	return "~$tag";
513	}
514
515	#--------------------------------------------------------------------------
516
517	=item * the function alternate_language_tags($lang1)
518
519	This function, if given a language tag, returns all language tags that
520	are alternate forms of this language tag. (There is little
521	alternation in the C<current> language tagging formalism, but
522	extensions to the formalism are under consideration which could add a
523	great deal of alternation.)
524
525	Examples from the current formalism:
526
527	alternate_language_tags('en') is ()
528	alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
529	alternate_language_tags('x-klikitat') is ('i-klikitat')
530	alternate_language_tags('i-klikitat') is ('x-klikitat')
531
532	This function returns undef if given anything other than a formally
533	valid language tag.
534
535	=cut
536
537	my %alt = qw( i x x i I X X I );
538	sub alternate_language_tags {
539	## Changes in the language tagging standards may have to be reflected here.
540	my $tag = $_[0];
541	return() unless &is_language_tag($tag);
542
543	# might as well preserve case
544
545	if($tag =~ /^([XIxi])(-.+)/) {
546	# This handles all the alternation that exists CURRENTLY
547	return($alt{$1} . $2);
548	}
549	return();
550	}
551
552	###########################################################################
553
554	=back
555
556	=head1 ABOUT LOWERCASING
557
558	I've considered making all the above functions that output language
559	tags return all those tags strictly in lowercase. Having all your
560	language tags in lowercase does make some things easier. But you
561	might as well just lowercase as you like, or call
562	C<encode_language_tag($lang1)> where appropriate.
563
564	=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
565
566	In some future version of I18N::LangTags, I plan to include support
567	for RFC2482-style language tags -- which are basically just normal
568	language tags with their ASCII characters shifted into Plane 14.
569
570	=head1 SEE ALSO
571
572	* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
573	Identification of Languages". (Obsoletes RFC 1766)
574
575	* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
576	Character Sets and Languages".
577
578	* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
579	Value and Encoded Word Extensions: Character Sets, Languages, and
580	Continuations".
581
582	* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
583	"Language Tagging in Unicode Plain Text".
584
585	* Locale::Codes, in
586	C<http://www.perl.com/CPAN/modules/by-module/Locale/>
587
588	* ISO 639, "Code for the representation of names of languages",
589	C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>
590
591	* ISO 639-2, "Codes for the representation of names of languages",
592	including three-letter codes,
593	C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>
594
595	* The IANA list of registered languages (hopefully up-to-date),
596	C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>
597
598	=head1 COPYRIGHT
599
600	Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.
601
602	This library is free software; you can redistribute it and/or
603	modify it under the same terms as Perl itself.
604
605	The programs and documentation in this dist are distributed in
606	the hope that they will be useful, but without any warranty; without
607	even the implied warranty of merchantability or fitness for a
608	particular purpose.
609
610	=head1 AUTHOR
611
612	Sean M. Burke C<sburke@cpan.org>
613
614	=cut
615
616	1;
617
618	__END__