Actually submit previous change.
[p5sagit/p5-mst-13.2.git] / ext / Text / Soundex / Soundex.pm
CommitLineData
b9eae89e 1# -*- perl -*-
2
3# (c) Copyright 1998-2003 by Mark Mielke
4#
5# Freedom to use these sources for whatever you want, as long as credit
6# is given where credit is due, is hereby granted. You may make modifications
7# where you see fit but leave this copyright somewhere visible. As well, try
8# to initial any changes you make so that if I like the changes I can
9# incorporate them into later versions.
10#
11# - Mark Mielke <mark@mielke.cc>
12#
13
a0d0e21e 14package Text::Soundex;
b9eae89e 15require 5.006;
a0d0e21e 16
b9eae89e 17use Exporter ();
18use XSLoader ();
a0d0e21e 19
b9eae89e 20use strict;
21
22our $VERSION = '3.02';
23our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
24 $soundex_nocode);
25our @EXPORT = qw(soundex $soundex_nocode);
26our @ISA = qw(Exporter);
27
28our $nocode;
29
30# Previous releases of Text::Soundex made $nocode available as $soundex_nocode.
31# For now, this part of the interface is exported and maintained.
32# In the feature, $soundex_nocode will be deprecated.
33*Text::Soundex::soundex_nocode = \$nocode;
34
35sub soundex_noxs
36{
37 # Strict implementation of Knuth's soundex algorithm.
38
39 my @results = map {
40 my $code = $_;
41 $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
42
43 if (length($code)) {
44 my $firstchar = substr($code, 0, 1);
45 $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
46 [0000000000000000111111112222222222222222333344555566]s;
47 ($code = substr($code, 1)) =~ tr/0//d;
48 substr($firstchar . $code . '000', 0, 4);
49 } else {
50 $nocode;
51 }
52 } @_;
53
54 wantarray ? @results : $results[0];
55}
56
57sub soundex_nara
58{
59 # Implementation of NARA's soundex algorithm. If two sounds are
60 # identical, and separated by only an H or a W... they should be
61 # treated as one. This requires an additional "s///", as well as
62 # the "9" character code to represent H and W. ("9" works like "0"
63 # except it combines indentical sounds around it into one)
64
65 my @results = map {
66 my $code = uc($_);
67 $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
68
69 if (length($code)) {
70 my $firstchar = substr($code, 0, 1);
71 $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
72 [0000990000009900111111112222222222222222333344555566]s;
73 $code =~ s/(.)9\1/$1/g;
74 ($code = substr($code, 1)) =~ tr/09//d;
75 substr($firstchar . $code . '000', 0, 4);
76 } else {
77 $nocode
78 }
79 } @_;
80
81 wantarray ? @results : $results[0];
82}
83
84sub soundex_unicode
85{
86 require Text::Unidecode unless defined &Text::Unidecode::unidecode;
87 soundex(Text::Unidecode::unidecode(@_));
88}
89
90sub soundex_nara_unicode
91{
92 require Text::Unidecode unless defined &Text::Unidecode::unidecode;
93 soundex_nara(Text::Unidecode::unidecode(@_));
94}
95
96eval { XSLoader::load(__PACKAGE__, $VERSION) };
97
98if (defined(&soundex_xs)) {
99 *soundex = \&soundex_xs;
100} else {
101 *soundex = \&soundex_noxs;
102 *soundex_xs = sub {
103 require Carp;
104 Carp::croak("XS implementation of Text::Soundex::soundex_xs() ".
105 "could not be loaded");
106 };
107}
108
1091;
110
111__END__
8cd2b3b0 112
a0d0e21e 113# Implementation of soundex algorithm as described by Knuth in volume
b9eae89e 114# 3 of The Art of Computer Programming.
a0d0e21e 115#
b9eae89e 116# Some of this documention was written by Mike Stok.
a0d0e21e 117#
118# Knuth's test cases are:
b9eae89e 119#
a0d0e21e 120# Euler, Ellery -> E460
121# Gauss, Ghosh -> G200
122# Hilbert, Heilbronn -> H416
123# Knuth, Kant -> K530
124# Lloyd, Ladd -> L300
125# Lukasiewicz, Lissajous -> L222
126#
cb1a09d0 127
128=head1 NAME
129
130Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth
131
132=head1 SYNOPSIS
133
b9eae89e 134 use Text::Soundex 'soundex';
cb1a09d0 135
b9eae89e 136 $code = soundex($name); # Get the soundex code for a name.
137 @codes = soundex(@names); # Get the list of codes for a list of names.
cb1a09d0 138
b9eae89e 139 # Redefine the value that soundex() will return if the input string
140 # contains no identifiable sounds within it.
141 $Text::Soundex::nocode = 'Z000';
cb1a09d0 142
143=head1 DESCRIPTION
144
145This module implements the soundex algorithm as described by Donald Knuth
146in Volume 3 of B<The Art of Computer Programming>. The algorithm is
b9eae89e 147intended to hash words (in particular surnames) into a small space
148using a simple model which approximates the sound of the word when
149spoken by an English speaker. Each word is reduced to a four
150character string, the first character being an upper case letter and
151the remaining three being digits.
152
153The value returned for strings which have no soundex encoding is
154defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
155however values such as C<'Z000'> are commonly used alternatives.
cb1a09d0 156
b9eae89e 157For backward compatibility with older versions of this module the
158C<$Text::Soundex::nocode> is exported into the caller's namespace as
159C<$soundex_nocode>.
cb1a09d0 160
b9eae89e 161In scalar context, C<soundex()> returns the soundex code of its first
162argument. In list context, a list is returned in which each element is the
163soundex code for the corresponding argument passed to C<soundex()>. For
164example, the following code assigns @codes the value C<('M200', 'S320')>:
cb1a09d0 165
166 @codes = soundex qw(Mike Stok);
167
b9eae89e 168To use C<Text::Soundex> to generate codes that can be used to search one
169of the publically available US Censuses, a variant of the soundex()
170subroutine must be used:
171
172 use Text::Soundex 'soundex_nara';
173 $code = soundex_nara($name);
174
175The algorithm used by the US Censuses is slightly different than that
176defined by Knuth and others. The descrepancy shows up in names such as
177"Ashcraft":
178
179 use Text::Soundex qw(soundex soundex_nara);
180 print soundex("Ashcraft"), "\n"; # prints: A226
181 print soundex_nara("Ashcraft"), "\n"; # prints: A261
cb1a09d0 182
183=head1 EXAMPLES
184
185Knuth's examples of various names and the soundex codes they map to
186are listed below:
187
188 Euler, Ellery -> E460
189 Gauss, Ghosh -> G200
190 Hilbert, Heilbronn -> H416
191 Knuth, Kant -> K530
192 Lloyd, Ladd -> L300
193 Lukasiewicz, Lissajous -> L222
194
195so:
196
b9eae89e 197 $code = soundex 'Knuth'; # $code contains 'K530'
198 @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'
cb1a09d0 199
200=head1 LIMITATIONS
201
202As the soundex algorithm was originally used a B<long> time ago in the US
b9eae89e 203it considers only the English alphabet and pronunciation. In particular,
204non-ASCII characters will be ignored. The recommended method of dealing
205with characters that have accents, or other unicode characters, is to use
206the Text::Unidecode module available from CPAN. Either use the module
207explicitly:
208
209 use Text::Soundex;
210 use Text::Unidecode;
211
212 print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n"
213
214Or use the convenient wrapper routine:
215
216 use Text::Soundex 'soundex_unicode';
217
218 print soundex_unicode("Fran\xE7ais"), "\n"; # Prints "F652\n"
219
220Since the soundex algorithm maps a large space (strings of arbitrary
221length) onto a small space (single letter plus 3 digits) no inference
222can be made about the similarity of two strings which end up with the
223same soundex code. For example, both C<Hilbert> and C<Heilbronn> end
224up with a soundex code of C<H416>.
225
226=head1 MAINTAINER
227
228This module is currently maintain by Mark Mielke (C<mark@mielke.cc>).
229
230=head1 HISTORY
231
232Version 3 is a significant update to provide support for versions of
233Perl later than Perl 5.004. Specifically, the XS version of the
234soundex() subroutine understands strings that are encoded using UTF-8
235(unicode strings).
236
237Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>)
238to improve the speed of the subroutines. The XS version of the soundex()
239subroutine was introduced in 2.00.
240
241Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>)
242and was included into the Perl core library set.
cb1a09d0 243
b9eae89e 244Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA
245algorithm to be included. The NARA soundex page can be viewed at:
246C<http://www.nara.gov/genealogy/soundex/soundex.html>
cb1a09d0 247
b9eae89e 248Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>)
249supplied ideas and spotted mistakes for v1.x.
cb1a09d0 250
b9eae89e 251=cut