Commit | Line | Data |
b9eae89e |
1 | # -*- perl -*- |
2 | |
3 | # (c) Copyright 1998-2003 by Mark Mielke |
4 | # |
5 | # Freedom to use these sources for whatever you want, as long as credit |
6 | # is given where credit is due, is hereby granted. You may make modifications |
7 | # where you see fit but leave this copyright somewhere visible. As well, try |
8 | # to initial any changes you make so that if I like the changes I can |
9 | # incorporate them into later versions. |
10 | # |
11 | # - Mark Mielke <mark@mielke.cc> |
12 | # |
13 | |
a0d0e21e |
14 | package Text::Soundex; |
b9eae89e |
15 | require 5.006; |
a0d0e21e |
16 | |
b9eae89e |
17 | use Exporter (); |
18 | use XSLoader (); |
a0d0e21e |
19 | |
b9eae89e |
20 | use strict; |
21 | |
22 | our $VERSION = '3.02'; |
23 | our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode |
24 | $soundex_nocode); |
25 | our @EXPORT = qw(soundex $soundex_nocode); |
26 | our @ISA = qw(Exporter); |
27 | |
28 | our $nocode; |
29 | |
30 | # Previous releases of Text::Soundex made $nocode available as $soundex_nocode. |
31 | # For now, this part of the interface is exported and maintained. |
32 | # In the feature, $soundex_nocode will be deprecated. |
33 | *Text::Soundex::soundex_nocode = \$nocode; |
34 | |
35 | sub soundex_noxs |
36 | { |
37 | # Strict implementation of Knuth's soundex algorithm. |
38 | |
39 | my @results = map { |
40 | my $code = $_; |
41 | $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd; |
42 | |
43 | if (length($code)) { |
44 | my $firstchar = substr($code, 0, 1); |
45 | $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr] |
46 | [0000000000000000111111112222222222222222333344555566]s; |
47 | ($code = substr($code, 1)) =~ tr/0//d; |
48 | substr($firstchar . $code . '000', 0, 4); |
49 | } else { |
50 | $nocode; |
51 | } |
52 | } @_; |
53 | |
54 | wantarray ? @results : $results[0]; |
55 | } |
56 | |
57 | sub soundex_nara |
58 | { |
59 | # Implementation of NARA's soundex algorithm. If two sounds are |
60 | # identical, and separated by only an H or a W... they should be |
61 | # treated as one. This requires an additional "s///", as well as |
62 | # the "9" character code to represent H and W. ("9" works like "0" |
63 | # except it combines indentical sounds around it into one) |
64 | |
65 | my @results = map { |
66 | my $code = uc($_); |
67 | $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd; |
68 | |
69 | if (length($code)) { |
70 | my $firstchar = substr($code, 0, 1); |
71 | $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr] |
72 | [0000990000009900111111112222222222222222333344555566]s; |
73 | $code =~ s/(.)9\1/$1/g; |
74 | ($code = substr($code, 1)) =~ tr/09//d; |
75 | substr($firstchar . $code . '000', 0, 4); |
76 | } else { |
77 | $nocode |
78 | } |
79 | } @_; |
80 | |
81 | wantarray ? @results : $results[0]; |
82 | } |
83 | |
84 | sub soundex_unicode |
85 | { |
86 | require Text::Unidecode unless defined &Text::Unidecode::unidecode; |
87 | soundex(Text::Unidecode::unidecode(@_)); |
88 | } |
89 | |
90 | sub soundex_nara_unicode |
91 | { |
92 | require Text::Unidecode unless defined &Text::Unidecode::unidecode; |
93 | soundex_nara(Text::Unidecode::unidecode(@_)); |
94 | } |
95 | |
96 | eval { XSLoader::load(__PACKAGE__, $VERSION) }; |
97 | |
98 | if (defined(&soundex_xs)) { |
99 | *soundex = \&soundex_xs; |
100 | } else { |
101 | *soundex = \&soundex_noxs; |
102 | *soundex_xs = sub { |
103 | require Carp; |
104 | Carp::croak("XS implementation of Text::Soundex::soundex_xs() ". |
105 | "could not be loaded"); |
106 | }; |
107 | } |
108 | |
109 | 1; |
110 | |
111 | __END__ |
8cd2b3b0 |
112 | |
a0d0e21e |
113 | # Implementation of soundex algorithm as described by Knuth in volume |
b9eae89e |
114 | # 3 of The Art of Computer Programming. |
a0d0e21e |
115 | # |
b9eae89e |
116 | # Some of this documention was written by Mike Stok. |
a0d0e21e |
117 | # |
118 | # Knuth's test cases are: |
b9eae89e |
119 | # |
a0d0e21e |
120 | # Euler, Ellery -> E460 |
121 | # Gauss, Ghosh -> G200 |
122 | # Hilbert, Heilbronn -> H416 |
123 | # Knuth, Kant -> K530 |
124 | # Lloyd, Ladd -> L300 |
125 | # Lukasiewicz, Lissajous -> L222 |
126 | # |
cb1a09d0 |
127 | |
128 | =head1 NAME |
129 | |
130 | Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth |
131 | |
132 | =head1 SYNOPSIS |
133 | |
b9eae89e |
134 | use Text::Soundex 'soundex'; |
cb1a09d0 |
135 | |
b9eae89e |
136 | $code = soundex($name); # Get the soundex code for a name. |
137 | @codes = soundex(@names); # Get the list of codes for a list of names. |
cb1a09d0 |
138 | |
b9eae89e |
139 | # Redefine the value that soundex() will return if the input string |
140 | # contains no identifiable sounds within it. |
141 | $Text::Soundex::nocode = 'Z000'; |
cb1a09d0 |
142 | |
143 | =head1 DESCRIPTION |
144 | |
145 | This module implements the soundex algorithm as described by Donald Knuth |
146 | in Volume 3 of B<The Art of Computer Programming>. The algorithm is |
b9eae89e |
147 | intended to hash words (in particular surnames) into a small space |
148 | using a simple model which approximates the sound of the word when |
149 | spoken by an English speaker. Each word is reduced to a four |
150 | character string, the first character being an upper case letter and |
151 | the remaining three being digits. |
152 | |
153 | The value returned for strings which have no soundex encoding is |
154 | defined using C<$Text::Soundex::nocode>. The default value is C<undef>, |
155 | however values such as C<'Z000'> are commonly used alternatives. |
cb1a09d0 |
156 | |
b9eae89e |
157 | For backward compatibility with older versions of this module the |
158 | C<$Text::Soundex::nocode> is exported into the caller's namespace as |
159 | C<$soundex_nocode>. |
cb1a09d0 |
160 | |
b9eae89e |
161 | In scalar context, C<soundex()> returns the soundex code of its first |
162 | argument. In list context, a list is returned in which each element is the |
163 | soundex code for the corresponding argument passed to C<soundex()>. For |
164 | example, the following code assigns @codes the value C<('M200', 'S320')>: |
cb1a09d0 |
165 | |
166 | @codes = soundex qw(Mike Stok); |
167 | |
b9eae89e |
168 | To use C<Text::Soundex> to generate codes that can be used to search one |
169 | of the publically available US Censuses, a variant of the soundex() |
170 | subroutine must be used: |
171 | |
172 | use Text::Soundex 'soundex_nara'; |
173 | $code = soundex_nara($name); |
174 | |
175 | The algorithm used by the US Censuses is slightly different than that |
176 | defined by Knuth and others. The descrepancy shows up in names such as |
177 | "Ashcraft": |
178 | |
179 | use Text::Soundex qw(soundex soundex_nara); |
180 | print soundex("Ashcraft"), "\n"; # prints: A226 |
181 | print soundex_nara("Ashcraft"), "\n"; # prints: A261 |
cb1a09d0 |
182 | |
183 | =head1 EXAMPLES |
184 | |
185 | Knuth's examples of various names and the soundex codes they map to |
186 | are listed below: |
187 | |
188 | Euler, Ellery -> E460 |
189 | Gauss, Ghosh -> G200 |
190 | Hilbert, Heilbronn -> H416 |
191 | Knuth, Kant -> K530 |
192 | Lloyd, Ladd -> L300 |
193 | Lukasiewicz, Lissajous -> L222 |
194 | |
195 | so: |
196 | |
b9eae89e |
197 | $code = soundex 'Knuth'; # $code contains 'K530' |
198 | @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' |
cb1a09d0 |
199 | |
200 | =head1 LIMITATIONS |
201 | |
202 | As the soundex algorithm was originally used a B<long> time ago in the US |
b9eae89e |
203 | it considers only the English alphabet and pronunciation. In particular, |
204 | non-ASCII characters will be ignored. The recommended method of dealing |
205 | with characters that have accents, or other unicode characters, is to use |
206 | the Text::Unidecode module available from CPAN. Either use the module |
207 | explicitly: |
208 | |
209 | use Text::Soundex; |
210 | use Text::Unidecode; |
211 | |
212 | print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n" |
213 | |
214 | Or use the convenient wrapper routine: |
215 | |
216 | use Text::Soundex 'soundex_unicode'; |
217 | |
218 | print soundex_unicode("Fran\xE7ais"), "\n"; # Prints "F652\n" |
219 | |
220 | Since the soundex algorithm maps a large space (strings of arbitrary |
221 | length) onto a small space (single letter plus 3 digits) no inference |
222 | can be made about the similarity of two strings which end up with the |
223 | same soundex code. For example, both C<Hilbert> and C<Heilbronn> end |
224 | up with a soundex code of C<H416>. |
225 | |
226 | =head1 MAINTAINER |
227 | |
228 | This module is currently maintain by Mark Mielke (C<mark@mielke.cc>). |
229 | |
230 | =head1 HISTORY |
231 | |
232 | Version 3 is a significant update to provide support for versions of |
233 | Perl later than Perl 5.004. Specifically, the XS version of the |
234 | soundex() subroutine understands strings that are encoded using UTF-8 |
235 | (unicode strings). |
236 | |
237 | Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>) |
238 | to improve the speed of the subroutines. The XS version of the soundex() |
239 | subroutine was introduced in 2.00. |
240 | |
241 | Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>) |
242 | and was included into the Perl core library set. |
cb1a09d0 |
243 | |
b9eae89e |
244 | Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA |
245 | algorithm to be included. The NARA soundex page can be viewed at: |
246 | C<http://www.nara.gov/genealogy/soundex/soundex.html> |
cb1a09d0 |
247 | |
b9eae89e |
248 | Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>) |
249 | supplied ideas and spotted mistakes for v1.x. |
cb1a09d0 |
250 | |
b9eae89e |
251 | =cut |