Revert "Revert "Mention the unit of time""
[p5sagit/p5-mst-13.2.git] / ext / Text-Soundex / Soundex.pm
CommitLineData
b9eae89e 1# -*- perl -*-
2
ef0f5379 3# (c) Copyright 1998-2007 by Mark Mielke
b9eae89e 4#
5# Freedom to use these sources for whatever you want, as long as credit
6# is given where credit is due, is hereby granted. You may make modifications
7# where you see fit but leave this copyright somewhere visible. As well, try
8# to initial any changes you make so that if I like the changes I can
9# incorporate them into later versions.
10#
11# - Mark Mielke <mark@mielke.cc>
12#
13
a0d0e21e 14package Text::Soundex;
b9eae89e 15require 5.006;
a0d0e21e 16
b9eae89e 17use Exporter ();
18use XSLoader ();
a0d0e21e 19
b9eae89e 20use strict;
21
ef0f5379 22our $VERSION = '3.03';
b9eae89e 23our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
24 $soundex_nocode);
ef0f5379 25our @EXPORT = qw(soundex soundex_nara $soundex_nocode);
b9eae89e 26our @ISA = qw(Exporter);
27
28our $nocode;
29
30# Previous releases of Text::Soundex made $nocode available as $soundex_nocode.
31# For now, this part of the interface is exported and maintained.
32# In the feature, $soundex_nocode will be deprecated.
33*Text::Soundex::soundex_nocode = \$nocode;
34
35sub soundex_noxs
36{
ef0f5379 37 # Original Soundex algorithm
b9eae89e 38
39 my @results = map {
ef0f5379 40 my $code = uc($_);
b9eae89e 41 $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
42
43 if (length($code)) {
44 my $firstchar = substr($code, 0, 1);
45 $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
46 [0000000000000000111111112222222222222222333344555566]s;
47 ($code = substr($code, 1)) =~ tr/0//d;
48 substr($firstchar . $code . '000', 0, 4);
49 } else {
50 $nocode;
51 }
52 } @_;
53
54 wantarray ? @results : $results[0];
55}
56
57sub soundex_nara
58{
ef0f5379 59 # US census (NARA) algorithm.
b9eae89e 60
61 my @results = map {
62 my $code = uc($_);
63 $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
64
65 if (length($code)) {
66 my $firstchar = substr($code, 0, 1);
67 $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
68 [0000990000009900111111112222222222222222333344555566]s;
ef0f5379 69 $code =~ s/(.)9\1/$1/gs;
b9eae89e 70 ($code = substr($code, 1)) =~ tr/09//d;
71 substr($firstchar . $code . '000', 0, 4);
72 } else {
73 $nocode
74 }
75 } @_;
76
77 wantarray ? @results : $results[0];
78}
79
80sub soundex_unicode
81{
82 require Text::Unidecode unless defined &Text::Unidecode::unidecode;
83 soundex(Text::Unidecode::unidecode(@_));
84}
85
86sub soundex_nara_unicode
87{
88 require Text::Unidecode unless defined &Text::Unidecode::unidecode;
89 soundex_nara(Text::Unidecode::unidecode(@_));
90}
91
92eval { XSLoader::load(__PACKAGE__, $VERSION) };
93
94if (defined(&soundex_xs)) {
95 *soundex = \&soundex_xs;
96} else {
97 *soundex = \&soundex_noxs;
98 *soundex_xs = sub {
99 require Carp;
100 Carp::croak("XS implementation of Text::Soundex::soundex_xs() ".
101 "could not be loaded");
102 };
103}
104
1051;
106
107__END__
8cd2b3b0 108
ef0f5379 109# Implementation of the soundex algorithm.
a0d0e21e 110#
b9eae89e 111# Some of this documention was written by Mike Stok.
a0d0e21e 112#
ef0f5379 113# Examples:
b9eae89e 114#
a0d0e21e 115# Euler, Ellery -> E460
116# Gauss, Ghosh -> G200
117# Hilbert, Heilbronn -> H416
118# Knuth, Kant -> K530
119# Lloyd, Ladd -> L300
120# Lukasiewicz, Lissajous -> L222
121#
cb1a09d0 122
123=head1 NAME
124
ef0f5379 125Text::Soundex - Implementation of the soundex algorithm.
cb1a09d0 126
127=head1 SYNOPSIS
128
ef0f5379 129 use Text::Soundex;
cb1a09d0 130
ef0f5379 131 # Original algorithm.
b9eae89e 132 $code = soundex($name); # Get the soundex code for a name.
133 @codes = soundex(@names); # Get the list of codes for a list of names.
cb1a09d0 134
ef0f5379 135 # American Soundex variant (NARA) - Used for US census data.
136 $code = soundex_nara($name); # Get the soundex code for a name.
137 @codes = soundex_nara(@names); # Get the list of codes for a list of names.
138
b9eae89e 139 # Redefine the value that soundex() will return if the input string
140 # contains no identifiable sounds within it.
141 $Text::Soundex::nocode = 'Z000';
cb1a09d0 142
143=head1 DESCRIPTION
144
ef0f5379 145Soundex is a phonetic algorithm for indexing names by sound, as
146pronounced in English. The goal is for names with the same
147pronunciation to be encoded to the same representation so that they
148can be matched despite minor differences in spelling. Soundex is the
149most widely known of all phonetic algorithms and is often used
150(incorrectly) as a synonym for "phonetic algorithm". Improvements to
151Soundex are the basis for many modern phonetic algorithms. (Wikipedia,
1522007)
153
154This module implements the original soundex algorithm developed by
155Robert Russell and Margaret Odell, patented in 1918 and 1922, as well
156as a variation called "American Soundex" used for US census data, and
157current maintained by the National Archives and Records Administration
158(NARA).
159
160The soundex algorithm may be recognized from Donald Knuth's
161B<The Art of Computer Programming>. The algorithm described by
162Knuth is the NARA algorithm.
b9eae89e 163
164The value returned for strings which have no soundex encoding is
165defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
166however values such as C<'Z000'> are commonly used alternatives.
cb1a09d0 167
b9eae89e 168For backward compatibility with older versions of this module the
169C<$Text::Soundex::nocode> is exported into the caller's namespace as
170C<$soundex_nocode>.
cb1a09d0 171
b9eae89e 172In scalar context, C<soundex()> returns the soundex code of its first
173argument. In list context, a list is returned in which each element is the
174soundex code for the corresponding argument passed to C<soundex()>. For
175example, the following code assigns @codes the value C<('M200', 'S320')>:
cb1a09d0 176
ef0f5379 177 @codes = soundex qw(Mike Stok);
cb1a09d0 178
b9eae89e 179To use C<Text::Soundex> to generate codes that can be used to search one
ef0f5379 180of the publically available US Censuses, a variant of the soundex
181algorithm must be used:
b9eae89e 182
ef0f5379 183 use Text::Soundex;
b9eae89e 184 $code = soundex_nara($name);
185
ef0f5379 186An example of where these algorithm differ follows:
b9eae89e 187
ef0f5379 188 use Text::Soundex;
b9eae89e 189 print soundex("Ashcraft"), "\n"; # prints: A226
190 print soundex_nara("Ashcraft"), "\n"; # prints: A261
cb1a09d0 191
192=head1 EXAMPLES
193
ef0f5379 194Donald Knuth's examples of names and the soundex codes they map to
cb1a09d0 195are listed below:
196
197 Euler, Ellery -> E460
198 Gauss, Ghosh -> G200
199 Hilbert, Heilbronn -> H416
200 Knuth, Kant -> K530
201 Lloyd, Ladd -> L300
202 Lukasiewicz, Lissajous -> L222
203
204so:
205
b9eae89e 206 $code = soundex 'Knuth'; # $code contains 'K530'
207 @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'
cb1a09d0 208
209=head1 LIMITATIONS
210
211As the soundex algorithm was originally used a B<long> time ago in the US
b9eae89e 212it considers only the English alphabet and pronunciation. In particular,
213non-ASCII characters will be ignored. The recommended method of dealing
214with characters that have accents, or other unicode characters, is to use
215the Text::Unidecode module available from CPAN. Either use the module
216explicitly:
217
218 use Text::Soundex;
219 use Text::Unidecode;
220
221 print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n"
222
223Or use the convenient wrapper routine:
224
225 use Text::Soundex 'soundex_unicode';
226
227 print soundex_unicode("Fran\xE7ais"), "\n"; # Prints "F652\n"
228
229Since the soundex algorithm maps a large space (strings of arbitrary
230length) onto a small space (single letter plus 3 digits) no inference
231can be made about the similarity of two strings which end up with the
232same soundex code. For example, both C<Hilbert> and C<Heilbronn> end
233up with a soundex code of C<H416>.
234
235=head1 MAINTAINER
236
237This module is currently maintain by Mark Mielke (C<mark@mielke.cc>).
238
239=head1 HISTORY
240
241Version 3 is a significant update to provide support for versions of
242Perl later than Perl 5.004. Specifically, the XS version of the
243soundex() subroutine understands strings that are encoded using UTF-8
244(unicode strings).
245
246Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>)
247to improve the speed of the subroutines. The XS version of the soundex()
248subroutine was introduced in 2.00.
249
250Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>)
251and was included into the Perl core library set.
cb1a09d0 252
b9eae89e 253Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA
254algorithm to be included. The NARA soundex page can be viewed at:
255C<http://www.nara.gov/genealogy/soundex/soundex.html>
cb1a09d0 256
b9eae89e 257Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>)
258supplied ideas and spotted mistakes for v1.x.
cb1a09d0 259
b9eae89e 260=cut