X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FSoundex.pm;h=64a9e6507d56fbb5ac38bafb441b75e14d33041b;hb=a646417951941146b1ea568de33ca3508b9859a2;hp=655152347c3c669c575b3cd1764e899395aadcfc;hpb=a0d0e21ea6ea90a22318550944fe6cb09ae10cda;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/Text/Soundex.pm b/lib/Text/Soundex.pm index 6551523..64a9e65 100644 --- a/lib/Text/Soundex.pm +++ b/lib/Text/Soundex.pm @@ -5,11 +5,13 @@ require Exporter; @ISA = qw(Exporter); @EXPORT = qw(&soundex $soundex_nocode); +$VERSION = '1.01'; + # $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ # # Implementation of soundex algorithm as described by Knuth in volume # 3 of The Art of Computer Programming, with ideas stolen from Ian -# Phillips . +# Phillipps . # # Mike Stok , 2 March 1994. # @@ -40,22 +42,15 @@ require Exporter; $soundex_nocode = undef; -# soundex -# -# usage: -# -# @codes = &soundex (@wordList); -# $code = &soundex ($word); -# -# This strenuously avoids 0 - sub soundex { local (@s, $f, $fc, $_) = @_; + push @s, '' unless @s; # handle no args as a single empty string + foreach (@s) { - tr/a-z/A-Z/; + $_ = uc $_; tr/A-Z//cd; if ($_ eq '') @@ -80,3 +75,76 @@ sub soundex 1; +__END__ + +=head1 NAME + +Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth + +=head1 SYNOPSIS + + use Text::Soundex; + + $code = soundex $string; # get soundex code for a string + @codes = soundex @list; # get list of codes for list of strings + + # set value to be returned for strings without soundex code + + $soundex_nocode = 'Z000'; + +=head1 DESCRIPTION + +This module implements the soundex algorithm as described by Donald Knuth +in Volume 3 of B<The Art of Computer Programming>. The algorithm is +intended to hash words (in particular surnames) into a small space using a +simple model which approximates the sound of the word when spoken by an English +speaker. 
Each word is reduced to a four character string, the first +character being an upper case letter and the remaining three being digits. + +If there is no soundex code representation for a string then the value of +C<$soundex_nocode> is returned. This is initially set to C<undef>, but +many people seem to prefer an I<unlikely> value like C<Z000> +(how unlikely this is depends on the data set being dealt with.) Any value +can be assigned to C<$soundex_nocode>. + +In scalar context C<soundex> returns the soundex code of its first +argument, and in list context a list is returned in which each element is the +soundex code for the corresponding argument passed to C<soundex> e.g. + + @codes = soundex qw(Mike Stok); + +leaves C<@codes> containing C<('M200', 'S320')>. + +=head1 EXAMPLES + +Knuth's examples of various names and the soundex codes they map to +are listed below: + + Euler, Ellery -> E460 + Gauss, Ghosh -> G200 + Hilbert, Heilbronn -> H416 + Knuth, Kant -> K530 + Lloyd, Ladd -> L300 + Lukasiewicz, Lissajous -> L222 + +so: + + $code = soundex 'Knuth'; # $code contains 'K530' + @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' + +=head1 LIMITATIONS + +As the soundex algorithm was originally used a B<long> time ago in the US +it considers only the English alphabet and pronunciation. + +As it is mapping a large space (arbitrary length strings) onto a small +space (single letter plus 3 digits) no inference can be made about the +similarity of two strings which end up with the same soundex code. For +example, both C<Hilbert> and C<Heilbronn> end up with a soundex code +of C<H416>. + +=head1 AUTHOR + +This code was implemented by Mike Stok (C) from the +description given by Knuth. Ian Phillipps (C) and Rich Pinder +(C) supplied ideas and spotted mistakes.