Commit | Line | Data |
b9eae89e |
1 | # -*- perl -*- |
2 | |
ef0f5379 |
3 | # (c) Copyright 1998-2007 by Mark Mielke |
b9eae89e |
4 | # |
5 | # Freedom to use these sources for whatever you want, as long as credit |
6 | # is given where credit is due, is hereby granted. You may make modifications |
7 | # where you see fit but leave this copyright somewhere visible. As well, try |
8 | # to initial any changes you make so that if I like the changes I can |
9 | # incorporate them into later versions. |
10 | # |
11 | # - Mark Mielke <mark@mielke.cc> |
12 | # |
13 | |
a0d0e21e |
14 | package Text::Soundex; |
b9eae89e |
15 | require 5.006; |
a0d0e21e |
16 | |
b9eae89e |
17 | use Exporter (); |
18 | use XSLoader (); |
a0d0e21e |
19 | |
b9eae89e |
20 | use strict; |
21 | |
ef0f5379 |
22 | our $VERSION = '3.03'; |
b9eae89e |
23 | our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode |
24 | $soundex_nocode); |
ef0f5379 |
25 | our @EXPORT = qw(soundex soundex_nara $soundex_nocode); |
b9eae89e |
26 | our @ISA = qw(Exporter); |
27 | |
28 | our $nocode; |
29 | |
30 | # Previous releases of Text::Soundex made $nocode available as $soundex_nocode. |
31 | # For now, this part of the interface is exported and maintained. |
32 | # In the feature, $soundex_nocode will be deprecated. |
33 | *Text::Soundex::soundex_nocode = \$nocode; |
34 | |
35 | sub soundex_noxs |
36 | { |
ef0f5379 |
37 | # Original Soundex algorithm |
b9eae89e |
38 | |
39 | my @results = map { |
ef0f5379 |
40 | my $code = uc($_); |
b9eae89e |
41 | $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd; |
42 | |
43 | if (length($code)) { |
44 | my $firstchar = substr($code, 0, 1); |
45 | $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr] |
46 | [0000000000000000111111112222222222222222333344555566]s; |
47 | ($code = substr($code, 1)) =~ tr/0//d; |
48 | substr($firstchar . $code . '000', 0, 4); |
49 | } else { |
50 | $nocode; |
51 | } |
52 | } @_; |
53 | |
54 | wantarray ? @results : $results[0]; |
55 | } |
56 | |
57 | sub soundex_nara |
58 | { |
ef0f5379 |
59 | # US census (NARA) algorithm. |
b9eae89e |
60 | |
61 | my @results = map { |
62 | my $code = uc($_); |
63 | $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd; |
64 | |
65 | if (length($code)) { |
66 | my $firstchar = substr($code, 0, 1); |
67 | $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr] |
68 | [0000990000009900111111112222222222222222333344555566]s; |
ef0f5379 |
69 | $code =~ s/(.)9\1/$1/gs; |
b9eae89e |
70 | ($code = substr($code, 1)) =~ tr/09//d; |
71 | substr($firstchar . $code . '000', 0, 4); |
72 | } else { |
73 | $nocode |
74 | } |
75 | } @_; |
76 | |
77 | wantarray ? @results : $results[0]; |
78 | } |
79 | |
80 | sub soundex_unicode |
81 | { |
82 | require Text::Unidecode unless defined &Text::Unidecode::unidecode; |
83 | soundex(Text::Unidecode::unidecode(@_)); |
84 | } |
85 | |
86 | sub soundex_nara_unicode |
87 | { |
88 | require Text::Unidecode unless defined &Text::Unidecode::unidecode; |
89 | soundex_nara(Text::Unidecode::unidecode(@_)); |
90 | } |
91 | |
92 | eval { XSLoader::load(__PACKAGE__, $VERSION) }; |
93 | |
94 | if (defined(&soundex_xs)) { |
95 | *soundex = \&soundex_xs; |
96 | } else { |
97 | *soundex = \&soundex_noxs; |
98 | *soundex_xs = sub { |
99 | require Carp; |
100 | Carp::croak("XS implementation of Text::Soundex::soundex_xs() ". |
101 | "could not be loaded"); |
102 | }; |
103 | } |
104 | |
105 | 1; |
106 | |
107 | __END__ |
8cd2b3b0 |
108 | |
ef0f5379 |
109 | # Implementation of the soundex algorithm. |
a0d0e21e |
110 | # |
b9eae89e |
111 | # Some of this documention was written by Mike Stok. |
a0d0e21e |
112 | # |
ef0f5379 |
113 | # Examples: |
b9eae89e |
114 | # |
a0d0e21e |
115 | # Euler, Ellery -> E460 |
116 | # Gauss, Ghosh -> G200 |
117 | # Hilbert, Heilbronn -> H416 |
118 | # Knuth, Kant -> K530 |
119 | # Lloyd, Ladd -> L300 |
120 | # Lukasiewicz, Lissajous -> L222 |
121 | # |
cb1a09d0 |
122 | |
123 | =head1 NAME |
124 | |
ef0f5379 |
125 | Text::Soundex - Implementation of the soundex algorithm. |
cb1a09d0 |
126 | |
127 | =head1 SYNOPSIS |
128 | |
ef0f5379 |
129 | use Text::Soundex; |
cb1a09d0 |
130 | |
ef0f5379 |
131 | # Original algorithm. |
b9eae89e |
132 | $code = soundex($name); # Get the soundex code for a name. |
133 | @codes = soundex(@names); # Get the list of codes for a list of names. |
cb1a09d0 |
134 | |
ef0f5379 |
135 | # American Soundex variant (NARA) - Used for US census data. |
136 | $code = soundex_nara($name); # Get the soundex code for a name. |
137 | @codes = soundex_nara(@names); # Get the list of codes for a list of names. |
138 | |
b9eae89e |
139 | # Redefine the value that soundex() will return if the input string |
140 | # contains no identifiable sounds within it. |
141 | $Text::Soundex::nocode = 'Z000'; |
cb1a09d0 |
142 | |
143 | =head1 DESCRIPTION |
144 | |
ef0f5379 |
145 | Soundex is a phonetic algorithm for indexing names by sound, as |
146 | pronounced in English. The goal is for names with the same |
147 | pronunciation to be encoded to the same representation so that they |
148 | can be matched despite minor differences in spelling. Soundex is the |
149 | most widely known of all phonetic algorithms and is often used |
150 | (incorrectly) as a synonym for "phonetic algorithm". Improvements to |
151 | Soundex are the basis for many modern phonetic algorithms. (Wikipedia, |
152 | 2007) |
153 | |
154 | This module implements the original soundex algorithm developed by |
155 | Robert Russell and Margaret Odell, patented in 1918 and 1922, as well |
156 | as a variation called "American Soundex" used for US census data, and |
157 | current maintained by the National Archives and Records Administration |
158 | (NARA). |
159 | |
160 | The soundex algorithm may be recognized from Donald Knuth's |
161 | B<The Art of Computer Programming>. The algorithm described by |
162 | Knuth is the NARA algorithm. |
b9eae89e |
163 | |
164 | The value returned for strings which have no soundex encoding is |
165 | defined using C<$Text::Soundex::nocode>. The default value is C<undef>, |
166 | however values such as C<'Z000'> are commonly used alternatives. |
cb1a09d0 |
167 | |
b9eae89e |
168 | For backward compatibility with older versions of this module the |
169 | C<$Text::Soundex::nocode> is exported into the caller's namespace as |
170 | C<$soundex_nocode>. |
cb1a09d0 |
171 | |
b9eae89e |
172 | In scalar context, C<soundex()> returns the soundex code of its first |
173 | argument. In list context, a list is returned in which each element is the |
174 | soundex code for the corresponding argument passed to C<soundex()>. For |
175 | example, the following code assigns @codes the value C<('M200', 'S320')>: |
cb1a09d0 |
176 | |
ef0f5379 |
177 | @codes = soundex qw(Mike Stok); |
cb1a09d0 |
178 | |
b9eae89e |
179 | To use C<Text::Soundex> to generate codes that can be used to search one |
ef0f5379 |
180 | of the publically available US Censuses, a variant of the soundex |
181 | algorithm must be used: |
b9eae89e |
182 | |
ef0f5379 |
183 | use Text::Soundex; |
b9eae89e |
184 | $code = soundex_nara($name); |
185 | |
ef0f5379 |
186 | An example of where these algorithm differ follows: |
b9eae89e |
187 | |
ef0f5379 |
188 | use Text::Soundex; |
b9eae89e |
189 | print soundex("Ashcraft"), "\n"; # prints: A226 |
190 | print soundex_nara("Ashcraft"), "\n"; # prints: A261 |
cb1a09d0 |
191 | |
192 | =head1 EXAMPLES |
193 | |
ef0f5379 |
194 | Donald Knuth's examples of names and the soundex codes they map to |
cb1a09d0 |
195 | are listed below: |
196 | |
197 | Euler, Ellery -> E460 |
198 | Gauss, Ghosh -> G200 |
199 | Hilbert, Heilbronn -> H416 |
200 | Knuth, Kant -> K530 |
201 | Lloyd, Ladd -> L300 |
202 | Lukasiewicz, Lissajous -> L222 |
203 | |
204 | so: |
205 | |
b9eae89e |
206 | $code = soundex 'Knuth'; # $code contains 'K530' |
207 | @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' |
cb1a09d0 |
208 | |
209 | =head1 LIMITATIONS |
210 | |
211 | As the soundex algorithm was originally used a B<long> time ago in the US |
b9eae89e |
212 | it considers only the English alphabet and pronunciation. In particular, |
213 | non-ASCII characters will be ignored. The recommended method of dealing |
214 | with characters that have accents, or other unicode characters, is to use |
215 | the Text::Unidecode module available from CPAN. Either use the module |
216 | explicitly: |
217 | |
218 | use Text::Soundex; |
219 | use Text::Unidecode; |
220 | |
221 | print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n" |
222 | |
223 | Or use the convenient wrapper routine: |
224 | |
225 | use Text::Soundex 'soundex_unicode'; |
226 | |
227 | print soundex_unicode("Fran\xE7ais"), "\n"; # Prints "F652\n" |
228 | |
229 | Since the soundex algorithm maps a large space (strings of arbitrary |
230 | length) onto a small space (single letter plus 3 digits) no inference |
231 | can be made about the similarity of two strings which end up with the |
232 | same soundex code. For example, both C<Hilbert> and C<Heilbronn> end |
233 | up with a soundex code of C<H416>. |
234 | |
235 | =head1 MAINTAINER |
236 | |
237 | This module is currently maintain by Mark Mielke (C<mark@mielke.cc>). |
238 | |
239 | =head1 HISTORY |
240 | |
241 | Version 3 is a significant update to provide support for versions of |
242 | Perl later than Perl 5.004. Specifically, the XS version of the |
243 | soundex() subroutine understands strings that are encoded using UTF-8 |
244 | (unicode strings). |
245 | |
246 | Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>) |
247 | to improve the speed of the subroutines. The XS version of the soundex() |
248 | subroutine was introduced in 2.00. |
249 | |
250 | Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>) |
251 | and was included into the Perl core library set. |
cb1a09d0 |
252 | |
b9eae89e |
253 | Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA |
254 | algorithm to be included. The NARA soundex page can be viewed at: |
255 | C<http://www.nara.gov/genealogy/soundex/soundex.html> |
cb1a09d0 |
256 | |
b9eae89e |
257 | Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>) |
258 | supplied ideas and spotted mistakes for v1.x. |
cb1a09d0 |
259 | |
b9eae89e |
260 | =cut |