# -*- perl -*-
-# (c) Copyright 1998-2003 by Mark Mielke
+# (c) Copyright 1998-2007 by Mark Mielke
#
# Freedom to use these sources for whatever you want, as long as credit
# is given where credit is due, is hereby granted. You may make modifications
use strict;
-our $VERSION = '3.02';
+our $VERSION = '3.03';
our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
$soundex_nocode);
-our @EXPORT = qw(soundex $soundex_nocode);
+our @EXPORT = qw(soundex soundex_nara $soundex_nocode);
our @ISA = qw(Exporter);
our $nocode;
sub soundex_noxs
{
- # Strict implementation of Knuth's soundex algorithm.
+ # Original Soundex algorithm
my @results = map {
- my $code = $_;
+ my $code = uc($_);
$code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
if (length($code)) {
sub soundex_nara
{
- # Implementation of NARA's soundex algorithm. If two sounds are
- # identical, and separated by only an H or a W... they should be
- # treated as one. This requires an additional "s///", as well as
- # the "9" character code to represent H and W. ("9" works like "0"
- # except it combines indentical sounds around it into one)
+ # US census (NARA) algorithm.
my @results = map {
my $code = uc($_);
my $firstchar = substr($code, 0, 1);
$code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
[0000990000009900111111112222222222222222333344555566]s;
- $code =~ s/(.)9\1/$1/g;
+ $code =~ s/(.)9\1/$1/gs;
($code = substr($code, 1)) =~ tr/09//d;
substr($firstchar . $code . '000', 0, 4);
} else {
__END__
-# Implementation of soundex algorithm as described by Knuth in volume
-# 3 of The Art of Computer Programming.
+# Implementation of the soundex algorithm.
#
# Some of this documention was written by Mike Stok.
#
-# Knuth's test cases are:
+# Examples:
#
# Euler, Ellery -> E460
# Gauss, Ghosh -> G200
=head1 NAME
-Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth
+Text::Soundex - Implementation of the soundex algorithm.
=head1 SYNOPSIS
- use Text::Soundex 'soundex';
+ use Text::Soundex;
+ # Original algorithm.
$code = soundex($name); # Get the soundex code for a name.
@codes = soundex(@names); # Get the list of codes for a list of names.
+ # American Soundex variant (NARA) - Used for US census data.
+ $code = soundex_nara($name); # Get the soundex code for a name.
+ @codes = soundex_nara(@names); # Get the list of codes for a list of names.
+
# Redefine the value that soundex() will return if the input string
# contains no identifiable sounds within it.
$Text::Soundex::nocode = 'Z000';
=head1 DESCRIPTION
-This module implements the soundex algorithm as described by Donald Knuth
-in Volume 3 of B<The Art of Computer Programming>. The algorithm is
-intended to hash words (in particular surnames) into a small space
-using a simple model which approximates the sound of the word when
-spoken by an English speaker. Each word is reduced to a four
-character string, the first character being an upper case letter and
-the remaining three being digits.
+Soundex is a phonetic algorithm for indexing names by sound, as
+pronounced in English. The goal is for names with the same
+pronunciation to be encoded to the same representation so that they
+can be matched despite minor differences in spelling. Soundex is the
+most widely known of all phonetic algorithms and is often used
+(incorrectly) as a synonym for "phonetic algorithm". Improvements to
+Soundex are the basis for many modern phonetic algorithms. (Wikipedia,
+2007)
+
+This module implements the original soundex algorithm developed by
+Robert Russell and Margaret Odell, patented in 1918 and 1922, as well
+as a variation called "American Soundex" used for US census data, and
+current maintained by the National Archives and Records Administration
+(NARA).
+
+The soundex algorithm may be recognized from Donald Knuth's
+B<The Art of Computer Programming>. The algorithm described by
+Knuth is the NARA algorithm.
The value returned for strings which have no soundex encoding is
defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
soundex code for the corresponding argument passed to C<soundex()>. For
example, the following code assigns @codes the value C<('M200', 'S320')>:
- @codes = soundex qw(Mike Stok);
+ @codes = soundex qw(Mike Stok);
To use C<Text::Soundex> to generate codes that can be used to search one
-of the publically available US Censuses, a variant of the soundex()
-subroutine must be used:
+of the publically available US Censuses, a variant of the soundex
+algorithm must be used:
- use Text::Soundex 'soundex_nara';
+ use Text::Soundex;
$code = soundex_nara($name);
-The algorithm used by the US Censuses is slightly different than that
-defined by Knuth and others. The descrepancy shows up in names such as
-"Ashcraft":
+An example of where these algorithm differ follows:
- use Text::Soundex qw(soundex soundex_nara);
+ use Text::Soundex;
print soundex("Ashcraft"), "\n"; # prints: A226
print soundex_nara("Ashcraft"), "\n"; # prints: A261
=head1 EXAMPLES
-Knuth's examples of various names and the soundex codes they map to
+Donald Knuth's examples of names and the soundex codes they map to
are listed below:
Euler, Ellery -> E460
# define utf8n_to_uvchr utf8_to_uv
#endif
-static char *soundex_table =
- /*ABCDEFGHIJKLMNOPQRSTUVWXYZ*/
- "01230120022455012623010202";
+static char sv_soundex_table[0x100];
+static void sv_soundex_initialize (void)
+{
+ memset(&sv_soundex_table[0], '\0', sizeof(sv_soundex_table));
+ sv_soundex_table['A'] = '0';
+ sv_soundex_table['a'] = '0';
+ sv_soundex_table['E'] = '0';
+ sv_soundex_table['e'] = '0';
+ sv_soundex_table['H'] = '0';
+ sv_soundex_table['h'] = '0';
+ sv_soundex_table['I'] = '0';
+ sv_soundex_table['i'] = '0';
+ sv_soundex_table['O'] = '0';
+ sv_soundex_table['o'] = '0';
+ sv_soundex_table['U'] = '0';
+ sv_soundex_table['u'] = '0';
+ sv_soundex_table['W'] = '0';
+ sv_soundex_table['w'] = '0';
+ sv_soundex_table['Y'] = '0';
+ sv_soundex_table['y'] = '0';
+ sv_soundex_table['B'] = '1';
+ sv_soundex_table['b'] = '1';
+ sv_soundex_table['F'] = '1';
+ sv_soundex_table['f'] = '1';
+ sv_soundex_table['P'] = '1';
+ sv_soundex_table['p'] = '1';
+ sv_soundex_table['V'] = '1';
+ sv_soundex_table['v'] = '1';
+ sv_soundex_table['C'] = '2';
+ sv_soundex_table['c'] = '2';
+ sv_soundex_table['G'] = '2';
+ sv_soundex_table['g'] = '2';
+ sv_soundex_table['J'] = '2';
+ sv_soundex_table['j'] = '2';
+ sv_soundex_table['K'] = '2';
+ sv_soundex_table['k'] = '2';
+ sv_soundex_table['Q'] = '2';
+ sv_soundex_table['q'] = '2';
+ sv_soundex_table['S'] = '2';
+ sv_soundex_table['s'] = '2';
+ sv_soundex_table['X'] = '2';
+ sv_soundex_table['x'] = '2';
+ sv_soundex_table['Z'] = '2';
+ sv_soundex_table['z'] = '2';
+ sv_soundex_table['D'] = '3';
+ sv_soundex_table['d'] = '3';
+ sv_soundex_table['T'] = '3';
+ sv_soundex_table['t'] = '3';
+ sv_soundex_table['L'] = '4';
+ sv_soundex_table['l'] = '4';
+ sv_soundex_table['M'] = '5';
+ sv_soundex_table['m'] = '5';
+ sv_soundex_table['N'] = '5';
+ sv_soundex_table['n'] = '5';
+ sv_soundex_table['R'] = '6';
+ sv_soundex_table['r'] = '6';
+}
static SV *sv_soundex (SV *source)
{
while (source_p != source_end)
{
- if ((*source_p & ~((UV) 0x7F)) == 0 && isalpha(*source_p))
+ char codepart_last = sv_soundex_table[(unsigned char) *source_p];
+
+ if (codepart_last != '\0')
{
SV *code = newSV(SOUNDEX_ACCURACY);
char *code_p = SvPVX(code);
char *code_end = &code_p[SOUNDEX_ACCURACY];
- char code_last;
SvCUR_set(code, SOUNDEX_ACCURACY);
SvPOK_only(code);
- code_last = soundex_table[(*code_p++ = toupper(*source_p++)) - 'A'];
+ *code_p++ = toupper(*source_p++);
while (source_p != source_end && code_p != code_end)
{
char c = *source_p++;
+ char codepart = sv_soundex_table[(unsigned char) c];
- if ((c & ~((UV) 0x7F)) == 0 && isalpha(c))
- {
- *code_p = soundex_table[toupper(c) - 'A'];
- if (*code_p != code_last && (code_last = *code_p) != '0')
- code_p++;
- }
+ if (codepart != '\0')
+ if (codepart != codepart_last && (codepart_last = codepart) != '0')
+ *code_p++ = codepart;
}
while (code_p != code_end)
{
STRLEN offset;
UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0);
+ char codepart_last = (c <= 0xFF) ? sv_soundex_table[c] : '\0';
source_p = (offset >= 1) ? &source_p[offset] : source_end;
- if ((c & ~((UV) 0x7F)) == 0 && isalpha(c))
+ if (codepart_last != '\0')
{
SV *code = newSV(SOUNDEX_ACCURACY);
char *code_p = SvPVX(code);
char *code_end = &code_p[SOUNDEX_ACCURACY];
- char code_last;
SvCUR_set(code, SOUNDEX_ACCURACY);
SvPOK_only(code);
- code_last = soundex_table[(*code_p++ = toupper(c)) - 'A'];
+ *code_p++ = toupper(c);
while (source_p != source_end && code_p != code_end)
{
+ char codepart;
c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0);
+ codepart = (c <= 0xFF) ? sv_soundex_table[c] : '\0';
source_p = (offset >= 1) ? &source_p[offset] : source_end;
- if ((c & ~((UV) 0x7F)) == 0 && isalpha(c))
- {
- *code_p = soundex_table[toupper(c) - 'A'];
- if (*code_p != code_last && (code_last = *code_p) != '0')
- code_p++;
- }
+ if (codepart != '\0')
+ if (codepart != codepart_last && (codepart_last = codepart) != '0')
+ *code_p++ = codepart;
}
while (code_p != code_end)
void
soundex_xs (...)
+INIT:
+{
+ sv_soundex_initialize();
+}
PPCODE:
{
int i;