[p5sagit/p5-mst-13.2.git] / ext / Text / Soundex / Soundex.pm

# -*- perl -*-

# (c) Copyright 1998-2003 by Mark Mielke
#
# Freedom to use these sources for whatever you want, as long as credit
# is given where credit is due, is hereby granted. You may make modifications
# where you see fit but leave this copyright somewhere visible. As well, try
# to initial any changes you make so that if I like the changes I can
# incorporate them into later versions.
#
#      - Mark Mielke <mark@mielke.cc>
#

package Text::Soundex;
require 5.006;

use Exporter ();
use XSLoader ();

use strict;

our $VERSION   = '3.02';
our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
                    $soundex_nocode);
our @EXPORT    = qw(soundex $soundex_nocode);
our @ISA       = qw(Exporter);

our $nocode;

# Previous releases of Text::Soundex made $nocode available as $soundex_nocode.
# For now, this part of the interface is exported and maintained.
# In the feature, $soundex_nocode will be deprecated.
*Text::Soundex::soundex_nocode = \$nocode;

sub soundex_noxs
{
    # Strict implementation of Knuth's soundex algorithm.

    my @results = map {
        my $code = $_;
        $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;

	if (length($code)) {
            my $firstchar = substr($code, 0, 1);
	    $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
                       [0000000000000000111111112222222222222222333344555566]s;
	    ($code = substr($code, 1)) =~ tr/0//d;
	    substr($firstchar . $code . '000', 0, 4);
	} else {
	    $nocode;
	}
    } @_;

    wantarray ? @results : $results[0];
}

sub soundex_nara
{
    # Implementation of NARA's soundex algorithm. If two sounds are
    # identical, and separated by only an H or a W... they should be
    # treated as one. This requires an additional "s///", as well as
    # the "9" character code to represent H and W. ("9" works like "0"
    # except it combines indentical sounds around it into one)

    my @results = map {
	my $code = uc($_);
        $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;

	if (length($code)) {
            my $firstchar = substr($code, 0, 1);
	    $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
                       [0000990000009900111111112222222222222222333344555566]s;
            $code =~ s/(.)9\1/$1/g;
	    ($code = substr($code, 1)) =~ tr/09//d;
	    substr($firstchar . $code . '000', 0, 4);
	} else {
	    $nocode
	}
    } @_;

    wantarray ? @results : $results[0];
}

sub soundex_unicode
{
    require Text::Unidecode unless defined &Text::Unidecode::unidecode;
    soundex(Text::Unidecode::unidecode(@_));
}

sub soundex_nara_unicode
{
    require Text::Unidecode unless defined &Text::Unidecode::unidecode;
    soundex_nara(Text::Unidecode::unidecode(@_));
}

eval { XSLoader::load(__PACKAGE__, $VERSION) };

if (defined(&soundex_xs)) {
    *soundex = \&soundex_xs;
} else {
    *soundex = \&soundex_noxs;
    *soundex_xs = sub {
        require Carp;
        Carp::croak("XS implementation of Text::Soundex::soundex_xs() ".
                    "could not be loaded");
    };
}

1;

__END__

# Implementation of soundex algorithm as described by Knuth in volume
# 3 of The Art of Computer Programming.
#
# Some of this documention was written by Mike Stok.
#
# Knuth's test cases are:
#
# Euler, Ellery -> E460
# Gauss, Ghosh -> G200
# Hilbert, Heilbronn -> H416
# Knuth, Kant -> K530
# Lloyd, Ladd -> L300
# Lukasiewicz, Lissajous -> L222
#

=head1 NAME

Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth

=head1 SYNOPSIS

  use Text::Soundex 'soundex';

  $code = soundex($name);    # Get the soundex code for a name.
  @codes = soundex(@names);  # Get the list of codes for a list of names.

  # Redefine the value that soundex() will return if the input string
  # contains no identifiable sounds within it.
  $Text::Soundex::nocode = 'Z000';

=head1 DESCRIPTION

This module implements the soundex algorithm as described by Donald Knuth
in Volume 3 of B<The Art of Computer Programming>.  The algorithm is
intended to hash words (in particular surnames) into a small space
using a simple model which approximates the sound of the word when
spoken by an English speaker.  Each word is reduced to a four
character string, the first character being an upper case letter and
the remaining three being digits.

The value returned for strings which have no soundex encoding is
defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
however values such as C<'Z000'> are commonly used alternatives.

For backward compatibility with older versions of this module the
C<$Text::Soundex::nocode> is exported into the caller's namespace as
C<$soundex_nocode>.

In scalar context, C<soundex()> returns the soundex code of its first
argument. In list context, a list is returned in which each element is the
soundex code for the corresponding argument passed to C<soundex()>. For
example, the following code assigns @codes the value C<('M200', 'S320')>:

  @codes = soundex qw(Mike Stok);

To use C<Text::Soundex> to generate codes that can be used to search one
of the publically available US Censuses, a variant of the soundex()
subroutine must be used:

    use Text::Soundex 'soundex_nara';
    $code = soundex_nara($name);

The algorithm used by the US Censuses is slightly different than that
defined by Knuth and others. The descrepancy shows up in names such as
"Ashcraft":

    use Text::Soundex qw(soundex soundex_nara);
    print soundex("Ashcraft"), "\n";       # prints: A226
    print soundex_nara("Ashcraft"), "\n";  # prints: A261

=head1 EXAMPLES

Knuth's examples of various names and the soundex codes they map to
are listed below:

  Euler, Ellery -> E460
  Gauss, Ghosh -> G200
  Hilbert, Heilbronn -> H416
  Knuth, Kant -> K530
  Lloyd, Ladd -> L300
  Lukasiewicz, Lissajous -> L222

so:

  $code = soundex 'Knuth';         # $code contains 'K530'
  @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'

=head1 LIMITATIONS

As the soundex algorithm was originally used a B<long> time ago in the US
it considers only the English alphabet and pronunciation. In particular,
non-ASCII characters will be ignored. The recommended method of dealing
with characters that have accents, or other unicode characters, is to use
the Text::Unidecode module available from CPAN. Either use the module
explicitly:

    use Text::Soundex;
    use Text::Unidecode;

    print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n"

Or use the convenient wrapper routine:

    use Text::Soundex 'soundex_unicode';

    print soundex_unicode("Fran\xE7ais"), "\n";    # Prints "F652\n"

Since the soundex algorithm maps a large space (strings of arbitrary
length) onto a small space (single letter plus 3 digits) no inference
can be made about the similarity of two strings which end up with the
same soundex code.  For example, both C<Hilbert> and C<Heilbronn> end
up with a soundex code of C<H416>.

=head1 MAINTAINER

This module is currently maintain by Mark Mielke (C<mark@mielke.cc>).

=head1 HISTORY

Version 3 is a significant update to provide support for versions of
Perl later than Perl 5.004. Specifically, the XS version of the
soundex() subroutine understands strings that are encoded using UTF-8
(unicode strings).

Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>)
to improve the speed of the subroutines. The XS version of the soundex()
subroutine was introduced in 2.00.

Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>)
and was included into the Perl core library set.

Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA
algorithm to be included. The NARA soundex page can be viewed at:
C<http://www.nara.gov/genealogy/soundex/soundex.html>

Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>)
supplied ideas and spotted mistakes for v1.x.

=cut
Commit	Line	Data
b9eae89e	1	# -- perl --
	2
	3	# (c) Copyright 1998-2003 by Mark Mielke
	4	#
	5	# Freedom to use these sources for whatever you want, as long as credit
	6	# is given where credit is due, is hereby granted. You may make modifications
	7	# where you see fit but leave this copyright somewhere visible. As well, try
	8	# to initial any changes you make so that if I like the changes I can
	9	# incorporate them into later versions.
	10	#
	11	# - Mark Mielke <mark@mielke.cc>
	12	#
	13
a0d0e21e	14	package Text::Soundex;
b9eae89e	15	require 5.006;
a0d0e21e	16
b9eae89e	17	use Exporter ();
b9eae89e	18	use XSLoader ();
a0d0e21e	19
b9eae89e	20	use strict;
	21
	22	our $VERSION = '3.02';
	23	our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
	24	$soundex_nocode);
	25	our @EXPORT = qw(soundex $soundex_nocode);
	26	our @ISA = qw(Exporter);
	27
	28	our $nocode;
	29
	30	# Previous releases of Text::Soundex made $nocode available as $soundex_nocode.
	31	# For now, this part of the interface is exported and maintained.
	32	# In the feature, $soundex_nocode will be deprecated.
	33	*Text::Soundex::soundex_nocode = \$nocode;
	34
	35	sub soundex_noxs
	36	{
	37	# Strict implementation of Knuth's soundex algorithm.
	38
	39	my @results = map {
	40	my $code = $_;
	41	$code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
	42
	43	if (length($code)) {
	44	my $firstchar = substr($code, 0, 1);
	45	$code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
	46	[0000000000000000111111112222222222222222333344555566]s;
	47	($code = substr($code, 1)) =~ tr/0//d;
	48	substr($firstchar . $code . '000', 0, 4);
	49	} else {
	50	$nocode;
	51	}
	52	} @_;
	53
	54	wantarray ? @results : $results[0];
	55	}
	56
	57	sub soundex_nara
	58	{
	59	# Implementation of NARA's soundex algorithm. If two sounds are
	60	# identical, and separated by only an H or a W... they should be
	61	# treated as one. This requires an additional "s///", as well as
	62	# the "9" character code to represent H and W. ("9" works like "0"
	63	# except it combines indentical sounds around it into one)
	64
	65	my @results = map {
	66	my $code = uc($_);
	67	$code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
	68
	69	if (length($code)) {
	70	my $firstchar = substr($code, 0, 1);
	71	$code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
	72	[0000990000009900111111112222222222222222333344555566]s;
	73	$code =~ s/(.)9\1/$1/g;
	74	($code = substr($code, 1)) =~ tr/09//d;
	75	substr($firstchar . $code . '000', 0, 4);
	76	} else {
	77	$nocode
	78	}
	79	} @_;
	80
	81	wantarray ? @results : $results[0];
	82	}
	83
84	sub soundex_unicode
85	{
86	require Text::Unidecode unless defined &Text::Unidecode::unidecode;
87	soundex(Text::Unidecode::unidecode(@_));
88	}
89
90	sub soundex_nara_unicode
91	{
92	require Text::Unidecode unless defined &Text::Unidecode::unidecode;
93	soundex_nara(Text::Unidecode::unidecode(@_));
94	}
95
96	eval { XSLoader::load(__PACKAGE__, $VERSION) };
97
98	if (defined(&soundex_xs)) {
99	*soundex = \&soundex_xs;
100	} else {
101	*soundex = \&soundex_noxs;
102	*soundex_xs = sub {
103	require Carp;
104	Carp::croak("XS implementation of Text::Soundex::soundex_xs() ".
105	"could not be loaded");
106	};
107	}
108
109	1;
110
111	__END__
8cd2b3b0	112
a0d0e21e	113	# Implementation of soundex algorithm as described by Knuth in volume
b9eae89e	114	# 3 of The Art of Computer Programming.
a0d0e21e	115	#
b9eae89e	116	# Some of this documention was written by Mike Stok.
a0d0e21e	117	#
a0d0e21e	118	# Knuth's test cases are:
b9eae89e	119	#
a0d0e21e	120	# Euler, Ellery -> E460
	121	# Gauss, Ghosh -> G200
	122	# Hilbert, Heilbronn -> H416
	123	# Knuth, Kant -> K530
	124	# Lloyd, Ladd -> L300
	125	# Lukasiewicz, Lissajous -> L222
	126	#
cb1a09d0	127
	128	=head1 NAME
	129
	130	Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth
	131
	132	=head1 SYNOPSIS
	133
b9eae89e	134	use Text::Soundex 'soundex';
cb1a09d0	135
b9eae89e	136	$code = soundex($name); # Get the soundex code for a name.
b9eae89e	137	@codes = soundex(@names); # Get the list of codes for a list of names.
cb1a09d0	138
b9eae89e	139	# Redefine the value that soundex() will return if the input string
	140	# contains no identifiable sounds within it.
	141	$Text::Soundex::nocode = 'Z000';
cb1a09d0	142
	143	=head1 DESCRIPTION
	144
	145	This module implements the soundex algorithm as described by Donald Knuth
	146	in Volume 3 of B<The Art of Computer Programming>. The algorithm is
b9eae89e	147	intended to hash words (in particular surnames) into a small space
	148	using a simple model which approximates the sound of the word when
	149	spoken by an English speaker. Each word is reduced to a four
	150	character string, the first character being an upper case letter and
	151	the remaining three being digits.
	152
	153	The value returned for strings which have no soundex encoding is
	154	defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
	155	however values such as C<'Z000'> are commonly used alternatives.
cb1a09d0	156
b9eae89e	157	For backward compatibility with older versions of this module the
	158	C<$Text::Soundex::nocode> is exported into the caller's namespace as
	159	C<$soundex_nocode>.
cb1a09d0	160
b9eae89e	161	In scalar context, C<soundex()> returns the soundex code of its first
	162	argument. In list context, a list is returned in which each element is the
	163	soundex code for the corresponding argument passed to C<soundex()>. For
	164	example, the following code assigns @codes the value C<('M200', 'S320')>:
cb1a09d0	165
	166	@codes = soundex qw(Mike Stok);
	167
b9eae89e	168	To use C<Text::Soundex> to generate codes that can be used to search one
	169	of the publically available US Censuses, a variant of the soundex()
	170	subroutine must be used:
	171
	172	use Text::Soundex 'soundex_nara';
	173	$code = soundex_nara($name);
	174
	175	The algorithm used by the US Censuses is slightly different than that
	176	defined by Knuth and others. The descrepancy shows up in names such as
	177	"Ashcraft":
	178
	179	use Text::Soundex qw(soundex soundex_nara);
	180	print soundex("Ashcraft"), "\n"; # prints: A226
	181	print soundex_nara("Ashcraft"), "\n"; # prints: A261
cb1a09d0	182
	183	=head1 EXAMPLES
	184
	185	Knuth's examples of various names and the soundex codes they map to
	186	are listed below:
	187
	188	Euler, Ellery -> E460
	189	Gauss, Ghosh -> G200
	190	Hilbert, Heilbronn -> H416
	191	Knuth, Kant -> K530
	192	Lloyd, Ladd -> L300
	193	Lukasiewicz, Lissajous -> L222
	194
	195	so:
	196
b9eae89e	197	$code = soundex 'Knuth'; # $code contains 'K530'
b9eae89e	198	@list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'
cb1a09d0	199
	200	=head1 LIMITATIONS
	201
	202	As the soundex algorithm was originally used a B<long> time ago in the US
b9eae89e	203	it considers only the English alphabet and pronunciation. In particular,
	204	non-ASCII characters will be ignored. The recommended method of dealing
	205	with characters that have accents, or other unicode characters, is to use
	206	the Text::Unidecode module available from CPAN. Either use the module
	207	explicitly:
	208
	209	use Text::Soundex;
	210	use Text::Unidecode;
	211
	212	print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n"
	213
	214	Or use the convenient wrapper routine:
	215
	216	use Text::Soundex 'soundex_unicode';
	217
	218	print soundex_unicode("Fran\xE7ais"), "\n"; # Prints "F652\n"
	219
	220	Since the soundex algorithm maps a large space (strings of arbitrary
	221	length) onto a small space (single letter plus 3 digits) no inference
	222	can be made about the similarity of two strings which end up with the
	223	same soundex code. For example, both C<Hilbert> and C<Heilbronn> end
	224	up with a soundex code of C<H416>.
	225
	226	=head1 MAINTAINER
	227
	228	This module is currently maintain by Mark Mielke (C<mark@mielke.cc>).
	229
	230	=head1 HISTORY
	231
	232	Version 3 is a significant update to provide support for versions of
	233	Perl later than Perl 5.004. Specifically, the XS version of the
	234	soundex() subroutine understands strings that are encoded using UTF-8
	235	(unicode strings).
	236
	237	Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>)
	238	to improve the speed of the subroutines. The XS version of the soundex()
	239	subroutine was introduced in 2.00.
	240
	241	Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>)
	242	and was included into the Perl core library set.
cb1a09d0	243
b9eae89e	244	Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA
	245	algorithm to be included. The NARA soundex page can be viewed at:
	246	C<http://www.nara.gov/genealogy/soundex/soundex.html>
cb1a09d0	247
b9eae89e	248	Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>)
b9eae89e	249	supplied ideas and spotted mistakes for v1.x.
cb1a09d0	250
b9eae89e	251	=cut