Commit | Line | Data |
a0d0e21e |
1 | package Text::Soundex; |
2 | require 5.000; |
3 | require Exporter; |
4 | |
5 | @ISA = qw(Exporter); |
6 | @EXPORT = qw(&soundex $soundex_nocode); |
7 | |
d6a466d7 |
8 | $VERSION = '1.01'; |
8cd2b3b0 |
9 | |
a0d0e21e |
10 | # $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ |
11 | # |
12 | # Implementation of soundex algorithm as described by Knuth in volume |
13 | # 3 of The Art of Computer Programming, with ideas stolen from Ian |
7e6e257f |
14 | # Phillipps <ian@pipex.net>. |
a0d0e21e |
15 | # |
16 | # Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994. |
17 | # |
18 | # Knuth's test cases are: |
19 | # |
20 | # Euler, Ellery -> E460 |
21 | # Gauss, Ghosh -> G200 |
22 | # Hilbert, Heilbronn -> H416 |
23 | # Knuth, Kant -> K530 |
24 | # Lloyd, Ladd -> L300 |
25 | # Lukasiewicz, Lissajous -> L222 |
26 | # |
27 | # $Log: soundex.pl,v $ |
28 | # Revision 1.2 1994/03/24 00:30:27 mike |
29 | # Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> |
30 | # in the way I handles leasing characters which were different but had |
31 | # the same soundex code. This showed up comparing it with Oracle's |
32 | # soundex output. |
33 | # |
34 | # Revision 1.1 1994/03/02 13:01:30 mike |
35 | # Initial revision |
36 | # |
37 | # |
38 | ############################################################################## |
39 | |
40 | # $soundex_nocode is used to indicate a string doesn't have a soundex |
41 | # code, I like undef other people may want to set it to 'Z000'. |
42 | |
43 | $soundex_nocode = undef; |
44 | |
a0d0e21e |
45 | sub soundex |
46 | { |
47 | local (@s, $f, $fc, $_) = @_; |
48 | |
cb1a09d0 |
49 | push @s, '' unless @s; # handle no args as a single empty string |
50 | |
a0d0e21e |
51 | foreach (@s) |
52 | { |
55497cff |
53 | $_ = uc $_; |
a0d0e21e |
54 | tr/A-Z//cd; |
55 | |
56 | if ($_ eq '') |
57 | { |
58 | $_ = $soundex_nocode; |
59 | } |
60 | else |
61 | { |
62 | ($f) = /^(.)/; |
63 | tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; |
64 | ($fc) = /^(.)/; |
65 | s/^$fc+//; |
66 | tr///cs; |
67 | tr/0//d; |
68 | $_ = $f . $_ . '000'; |
69 | s/^(.{4}).*/$1/; |
70 | } |
71 | } |
72 | |
73 | wantarray ? @s : shift @s; |
74 | } |
75 | |
76 | 1; |
77 | |
cb1a09d0 |
78 | __END__ |
79 | |
80 | =head1 NAME |
81 | |
82 | Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth |
83 | |
84 | =head1 SYNOPSIS |
85 | |
86 | use Text::Soundex; |
87 | |
88 | $code = soundex $string; # get soundex code for a string |
89 | @codes = soundex @list; # get list of codes for list of strings |
90 | |
91 | # set value to be returned for strings without soundex code |
92 | |
93 | $soundex_nocode = 'Z000'; |
94 | |
95 | =head1 DESCRIPTION |
96 | |
97 | This module implements the soundex algorithm as described by Donald Knuth |
98 | in Volume 3 of B<The Art of Computer Programming>. The algorithm is |
99 | intended to hash words (in particular surnames) into a small space using a |
100 | simple model which approximates the sound of the word when spoken by an English |
101 | speaker. Each word is reduced to a four character string, the first |
102 | character being an upper case letter and the remaining three being digits. |
103 | |
104 | If there is no soundex code representation for a string then the value of |
105 | C<$soundex_nocode> is returned. This is initially set to C<undef>, but |
106 | many people seem to prefer an I<unlikely> value like C<Z000> |
107 | (how unlikely this is depends on the data set being dealt with.) Any value |
108 | can be assigned to C<$soundex_nocode>. |
109 | |
110 | In scalar context C<soundex> returns the soundex code of its first |
91e74348 |
111 | argument, and in list context a list is returned in which each element is the |
cb1a09d0 |
112 | soundex code for the corresponding argument passed to C<soundex> e.g. |
113 | |
114 | @codes = soundex qw(Mike Stok); |
115 | |
116 | leaves C<@codes> containing C<('M200', 'S320')>. |
117 | |
118 | =head1 EXAMPLES |
119 | |
120 | Knuth's examples of various names and the soundex codes they map to |
121 | are listed below: |
122 | |
123 | Euler, Ellery -> E460 |
124 | Gauss, Ghosh -> G200 |
125 | Hilbert, Heilbronn -> H416 |
126 | Knuth, Kant -> K530 |
127 | Lloyd, Ladd -> L300 |
128 | Lukasiewicz, Lissajous -> L222 |
129 | |
130 | so: |
131 | |
132 | $code = soundex 'Knuth'; # $code contains 'K530' |
133 | @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' |
134 | |
135 | =head1 LIMITATIONS |
136 | |
137 | As the soundex algorithm was originally used a B<long> time ago in the US |
138 | it considers only the English alphabet and pronunciation. |
139 | |
140 | As it is mapping a large space (arbitrary length strings) onto a small |
141 | space (single letter plus 3 digits) no inference can be made about the |
142 | similarity of two strings which end up with the same soundex code. For |
143 | example, both C<Hilbert> and C<Heilbronn> end up with a soundex code |
144 | of C<H416>. |
145 | |
146 | =head1 AUTHOR |
147 | |
148 | This code was implemented by Mike Stok (C<stok@cybercom.net>) from the |
7e6e257f |
149 | description given by Knuth. Ian Phillipps (C<ian@pipex.net>) and Rich Pinder |
cb1a09d0 |
150 | (C<rpinder@hsc.usc.edu>) supplied ideas and spotted mistakes. |