Commit | Line | Data |
a0d0e21e |
1 | package Text::Soundex; |
2 | require 5.000; |
3 | require Exporter; |
4 | |
5 | @ISA = qw(Exporter); |
6 | @EXPORT = qw(&soundex $soundex_nocode); |
7 | |
8 | # $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ |
9 | # |
10 | # Implementation of soundex algorithm as described by Knuth in volume |
11 | # 3 of The Art of Computer Programming, with ideas stolen from Ian |
12 | # Phillips <ian@pipex.net>. |
13 | # |
14 | # Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994. |
15 | # |
16 | # Knuth's test cases are: |
17 | # |
18 | # Euler, Ellery -> E460 |
19 | # Gauss, Ghosh -> G200 |
20 | # Hilbert, Heilbronn -> H416 |
21 | # Knuth, Kant -> K530 |
22 | # Lloyd, Ladd -> L300 |
23 | # Lukasiewicz, Lissajous -> L222 |
24 | # |
25 | # $Log: soundex.pl,v $ |
26 | # Revision 1.2 1994/03/24 00:30:27 mike |
27 | # Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> |
28 | # in the way I handles leasing characters which were different but had |
29 | # the same soundex code. This showed up comparing it with Oracle's |
30 | # soundex output. |
31 | # |
32 | # Revision 1.1 1994/03/02 13:01:30 mike |
33 | # Initial revision |
34 | # |
35 | # |
36 | ############################################################################## |
37 | |
38 | # $soundex_nocode is used to indicate a string doesn't have a soundex |
39 | # code, I like undef other people may want to set it to 'Z000'. |
40 | |
41 | $soundex_nocode = undef; |
42 | |
a0d0e21e |
43 | sub soundex |
44 | { |
45 | local (@s, $f, $fc, $_) = @_; |
46 | |
cb1a09d0 |
47 | push @s, '' unless @s; # handle no args as a single empty string |
48 | |
a0d0e21e |
49 | foreach (@s) |
50 | { |
55497cff |
51 | $_ = uc $_; |
a0d0e21e |
52 | tr/A-Z//cd; |
53 | |
54 | if ($_ eq '') |
55 | { |
56 | $_ = $soundex_nocode; |
57 | } |
58 | else |
59 | { |
60 | ($f) = /^(.)/; |
61 | tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; |
62 | ($fc) = /^(.)/; |
63 | s/^$fc+//; |
64 | tr///cs; |
65 | tr/0//d; |
66 | $_ = $f . $_ . '000'; |
67 | s/^(.{4}).*/$1/; |
68 | } |
69 | } |
70 | |
71 | wantarray ? @s : shift @s; |
72 | } |
73 | |
74 | 1; |
75 | |
cb1a09d0 |
76 | __END__ |
77 | |
78 | =head1 NAME |
79 | |
80 | Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth |
81 | |
82 | =head1 SYNOPSIS |
83 | |
84 | use Text::Soundex; |
85 | |
86 | $code = soundex $string; # get soundex code for a string |
87 | @codes = soundex @list; # get list of codes for list of strings |
88 | |
89 | # set value to be returned for strings without soundex code |
90 | |
91 | $soundex_nocode = 'Z000'; |
92 | |
93 | =head1 DESCRIPTION |
94 | |
95 | This module implements the soundex algorithm as described by Donald Knuth |
96 | in Volume 3 of B<The Art of Computer Programming>. The algorithm is |
97 | intended to hash words (in particular surnames) into a small space using a |
98 | simple model which approximates the sound of the word when spoken by an English |
99 | speaker. Each word is reduced to a four character string, the first |
100 | character being an upper case letter and the remaining three being digits. |
101 | |
102 | If there is no soundex code representation for a string then the value of |
103 | C<$soundex_nocode> is returned. This is initially set to C<undef>, but |
104 | many people seem to prefer an I<unlikely> value like C<Z000> |
105 | (how unlikely this is depends on the data set being dealt with.) Any value |
106 | can be assigned to C<$soundex_nocode>. |
107 | |
108 | In scalar context C<soundex> returns the soundex code of its first |
109 | argument, and in array context a list is returned in which each element is the |
110 | soundex code for the corresponding argument passed to C<soundex> e.g. |
111 | |
112 | @codes = soundex qw(Mike Stok); |
113 | |
114 | leaves C<@codes> containing C<('M200', 'S320')>. |
115 | |
116 | =head1 EXAMPLES |
117 | |
118 | Knuth's examples of various names and the soundex codes they map to |
119 | are listed below: |
120 | |
121 | Euler, Ellery -> E460 |
122 | Gauss, Ghosh -> G200 |
123 | Hilbert, Heilbronn -> H416 |
124 | Knuth, Kant -> K530 |
125 | Lloyd, Ladd -> L300 |
126 | Lukasiewicz, Lissajous -> L222 |
127 | |
128 | so: |
129 | |
130 | $code = soundex 'Knuth'; # $code contains 'K530' |
131 | @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' |
132 | |
133 | =head1 LIMITATIONS |
134 | |
135 | As the soundex algorithm was originally used a B<long> time ago in the US |
136 | it considers only the English alphabet and pronunciation. |
137 | |
138 | As it is mapping a large space (arbitrary length strings) onto a small |
139 | space (single letter plus 3 digits) no inference can be made about the |
140 | similarity of two strings which end up with the same soundex code. For |
141 | example, both C<Hilbert> and C<Heilbronn> end up with a soundex code |
142 | of C<H416>. |
143 | |
144 | =head1 AUTHOR |
145 | |
146 | This code was implemented by Mike Stok (C<stok@cybercom.net>) from the |
147 | description given by Knuth. Ian Phillips (C<ian@pipex.net>) and Rich Pinder |
148 | (C<rpinder@hsc.usc.edu>) supplied ideas and spotted mistakes. |