Commit | Line | Data |
2304df62 |
1 | Article 20106 of comp.lang.perl: |
2 | Path: netlabs!news.cerf.net!ihnp4.ucsd.edu!mvb.saic.com!MathWorks.Com!noc.near.net!newshost.meiko.com!not-for-mail |
3 | From: mike@meiko.com (Mike Stok) |
4 | Newsgroups: comp.lang.perl |
5 | Subject: Soundex (again :-) |
6 | Date: 23 Mar 1994 19:44:35 -0500 |
7 | Organization: Meiko Scientific, Inc., MA |
8 | Lines: 272 |
9 | Message-ID: <2mqnpj$qk4@hibbert.meiko.com> |
10 | NNTP-Posting-Host: hibbert.meiko.com |
11 | |
12 | Thanks to Rich Pinder <rpinder@hsc.usc.edu> for finding a little bug in my |
13 | soundex code I posted a while back. This showed up when he compared it |
14 | with the output from Oracle's soundex function, and were caused by leading |
15 | characters which were different but shared the same soundex code. |
16 | |
17 | Here's a fixed shar file... |
18 | |
19 | Mike |
20 | |
21 | #!/bin/sh |
22 | # This is a shell archive (produced by shar 3.49) |
23 | # To extract the files from this archive, save it to a file, remove |
24 | # everything above the "!/bin/sh" line above, and type "sh file_name". |
25 | # |
26 | # made 03/24/1994 00:35 UTC by Mike.Stok@meiko.concord.ma.us |
27 | # Source directory /tmp_mnt/develop/sw/misc/mike/soundex |
28 | # |
29 | # existing files will NOT be overwritten unless -c is specified |
30 | # |
31 | # This shar contains: |
32 | # length mode name |
33 | # ------ ---------- ------------------------------------------ |
34 | # 1677 -r--r--r-- soundex.pl |
35 | # 2408 -r-xr-xr-x soundex.t |
36 | # |
37 | # ============= soundex.pl ============== |
38 | if test -f 'soundex.pl' -a X"$1" != X"-c"; then |
39 | echo 'x - skipping soundex.pl (File already exists)' |
40 | else |
41 | echo 'x - extracting soundex.pl (Text)' |
42 | sed 's/^X//' << 'SHAR_EOF' > 'soundex.pl' && |
43 | package soundex; |
44 | X |
45 | ;# $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ |
46 | ;# |
47 | ;# Implementation of soundex algorithm as described by Knuth in volume |
48 | ;# 3 of The Art of Computer Programming, with ideas stolen from Ian |
49 | ;# Phillips <ian@pipex.net>. |
50 | ;# |
51 | ;# Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994. |
52 | ;# |
53 | ;# Knuth's test cases are: |
54 | ;# |
55 | ;# Euler, Ellery -> E460 |
56 | ;# Gauss, Ghosh -> G200 |
57 | ;# Hilbert, Heilbronn -> H416 |
58 | ;# Knuth, Kant -> K530 |
59 | ;# Lloyd, Ladd -> L300 |
60 | ;# Lukasiewicz, Lissajous -> L222 |
61 | ;# |
62 | ;# $Log: soundex.pl,v $ |
63 | ;# Revision 1.2 1994/03/24 00:30:27 mike |
64 | ;# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> |
65 | ;# in the way I handles leasing characters which were different but had |
66 | ;# the same soundex code. This showed up comparing it with Oracle's |
67 | ;# soundex output. |
68 | ;# |
69 | ;# Revision 1.1 1994/03/02 13:01:30 mike |
70 | ;# Initial revision |
71 | ;# |
72 | ;# |
73 | ;############################################################################## |
74 | X |
75 | ;# $soundex'noCode is used to indicate a string doesn't have a soundex |
76 | ;# code, I like undef other people may want to set it to 'Z000'. |
77 | X |
78 | $noCode = undef; |
79 | X |
80 | ;# main'soundex |
81 | ;# |
82 | ;# usage: |
83 | ;# |
84 | ;# @codes = &main'soundex (@wordList); |
85 | ;# $code = &main'soundex ($word); |
86 | ;# |
87 | ;# This strenuously avoids $[ |
88 | X |
89 | sub main'soundex |
90 | { |
91 | X local (@s, $f, $fc, $_) = @_; |
92 | X |
93 | X foreach (@s) |
94 | X { |
95 | X tr/a-z/A-Z/; |
96 | X tr/A-Z//cd; |
97 | X |
98 | X if ($_ eq '') |
99 | X { |
100 | X $_ = $noCode; |
101 | X } |
102 | X else |
103 | X { |
104 | X ($f) = /^(.)/; |
105 | X tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; |
106 | X ($fc) = /^(.)/; |
107 | X s/^$fc+//; |
108 | X tr///cs; |
109 | X tr/0//d; |
110 | X $_ = $f . $_ . '000'; |
111 | X s/^(.{4}).*/$1/; |
112 | X } |
113 | X } |
114 | X |
115 | X wantarray ? @s : shift @s; |
116 | } |
117 | X |
118 | 1; |
119 | SHAR_EOF |
120 | chmod 0444 soundex.pl || |
121 | echo 'restore of soundex.pl failed' |
122 | Wc_c="`wc -c < 'soundex.pl'`" |
123 | test 1677 -eq "$Wc_c" || |
124 | echo 'soundex.pl: original size 1677, current size' "$Wc_c" |
125 | fi |
126 | # ============= soundex.t ============== |
127 | if test -f 'soundex.t' -a X"$1" != X"-c"; then |
128 | echo 'x - skipping soundex.t (File already exists)' |
129 | else |
130 | echo 'x - extracting soundex.t (Text)' |
131 | sed 's/^X//' << 'SHAR_EOF' > 'soundex.t' && |
132 | #!./perl |
133 | ;# |
134 | ;# $Id: soundex.t,v 1.2 1994/03/24 00:30:27 mike Exp $ |
135 | ;# |
136 | ;# test module for soundex.pl |
137 | ;# |
138 | ;# $Log: soundex.t,v $ |
139 | ;# Revision 1.2 1994/03/24 00:30:27 mike |
140 | ;# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> |
141 | ;# in the way I handles leasing characters which were different but had |
142 | ;# the same soundex code. This showed up comparing it with Oracle's |
143 | ;# soundex output. |
144 | ;# |
145 | ;# Revision 1.1 1994/03/02 13:03:02 mike |
146 | ;# Initial revision |
147 | ;# |
148 | ;# |
149 | X |
150 | require '../lib/soundex.pl'; |
151 | X |
152 | $test = 0; |
153 | print "1..13\n"; |
154 | X |
155 | while (<DATA>) |
156 | { |
157 | X chop; |
158 | X next if /^\s*;?#/; |
159 | X next if /^\s*$/; |
160 | X |
161 | X ++$test; |
162 | X $bad = 0; |
163 | X |
164 | X if (/^eval\s+/) |
165 | X { |
166 | X ($try = $_) =~ s/^eval\s+//; |
167 | X |
168 | X eval ($try); |
169 | X if ($@) |
170 | X { |
171 | X $bad++; |
172 | X print "not ok $test\n"; |
173 | X print "# eval '$try' returned $@"; |
174 | X } |
175 | X } |
176 | X elsif (/^\(/) |
177 | X { |
178 | X ($in, $out) = split (':'); |
179 | X |
180 | X $try = "\@expect = $out; \@got = &soundex $in;"; |
181 | X eval ($try); |
182 | X |
183 | X if (@expect != @got) |
184 | X { |
185 | X $bad++; |
186 | X print "not ok $test\n"; |
187 | X print "# expected ", scalar @expect, " results, got ", scalar @got, "\n"; |
188 | X print "# expected (", join (', ', @expect), |
189 | X ") got (", join (', ', @got), ")\n"; |
190 | X } |
191 | X else |
192 | X { |
193 | X while (@got) |
194 | X { |
195 | X $expect = shift @expect; |
196 | X $got = shift @got; |
197 | X |
198 | X if ($expect ne $got) |
199 | X { |
200 | X $bad++; |
201 | X print "not ok $test\n"; |
202 | X print "# expected $expect, got $got\n"; |
203 | X } |
204 | X } |
205 | X } |
206 | X } |
207 | X else |
208 | X { |
209 | X ($in, $out) = split (':'); |
210 | X |
211 | X $try = "\$expect = $out; \$got = &soundex ($in);"; |
212 | X eval ($try); |
213 | X |
214 | X if ($expect ne $got) |
215 | X { |
216 | X $bad++; |
217 | X print "not ok $test\n"; |
218 | X print "# expected $expect, got $got\n"; |
219 | X } |
220 | X } |
221 | X |
222 | X print "ok $test\n" unless $bad; |
223 | } |
224 | X |
225 | __END__ |
226 | # |
227 | # 1..6 |
228 | # |
229 | # Knuth's test cases, scalar in, scalar out |
230 | # |
231 | 'Euler':'E460' |
232 | 'Gauss':'G200' |
233 | 'Hilbert':'H416' |
234 | 'Knuth':'K530' |
235 | 'Lloyd':'L300' |
236 | 'Lukasiewicz':'L222' |
237 | # |
238 | # 7..8 |
239 | # |
240 | # check default bad code |
241 | # |
242 | '2 + 2 = 4':undef |
243 | undef:undef |
244 | # |
245 | # 9 |
246 | # |
247 | # check array in, array out |
248 | # |
249 | ('Ellery', 'Ghosh', 'Heilbronn', 'Kant', 'Ladd', 'Lissajous'):('E460', 'G200', 'H416', 'K530', 'L300', 'L222') |
250 | # |
251 | # 10 |
252 | # |
253 | # check array with explicit undef |
254 | # |
255 | ('Mike', undef, 'Stok'):('M200', undef, 'S320') |
256 | # |
257 | # 11..12 |
258 | # |
259 | # check setting $soundex'noCode |
260 | # |
261 | eval $soundex'noCode = 'Z000'; |
262 | ('Mike', undef, 'Stok'):('M200', 'Z000', 'S320') |
263 | # |
264 | # 13 |
265 | # |
266 | # a subtle difference between me & oracle, spotted by Rich Pinder |
267 | # <rpinder@hsc.usc.edu> |
268 | # |
269 | CZARKOWSKA:C622 |
270 | SHAR_EOF |
271 | chmod 0555 soundex.t || |
272 | echo 'restore of soundex.t failed' |
273 | Wc_c="`wc -c < 'soundex.t'`" |
274 | test 2408 -eq "$Wc_c" || |
275 | echo 'soundex.t: original size 2408, current size' "$Wc_c" |
276 | fi |
277 | exit 0 |
278 | |
279 | -- |
280 | The "usual disclaimers" apply. | Meiko |
281 | Mike Stok | 130C Baker Ave. Ext |
282 | Mike.Stok@meiko.concord.ma.us | Concord, MA 01742 |
283 | Meiko tel: (508) 371 0088 | |
284 | |
285 | |