From: Steve Peters Date: Tue, 3 Oct 2006 14:52:45 +0000 (+0000) Subject: Move Text::Soundex from lib/ to ext/ and upgrade it to X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=11f885b578514fcbf59f44ca49ae6a8299238c7d;p=p5sagit%2Fp5-mst-13.2.git Move Text::Soundex from lib/ to ext/ and upgrade it to Text-Soundex-3.02. p4raw-id: //depot/perl@28927 --- diff --git a/MANIFEST b/MANIFEST index edc3f8f..9ee58ba 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1088,6 +1088,12 @@ ext/Sys/Syslog/Syslog.xs Sys::Syslog extension external subroutines ext/Sys/Syslog/t/00-load.t test for Sys::Syslog ext/Sys/Syslog/t/constants.t test for Sys::Syslog ext/Sys/Syslog/t/syslog.t See if Sys::Syslog works +ext/Text/Soundex/Changes Changelog for Text::Soundex +ext/Text/Soundex/Makefile.PL Text::Soundex extension makefile writer +ext/Text/Soundex/README README for Text::Soundex +ext/Text/Soundex/Soundex.pm Text::Soundex extension Perl module +ext/Text/Soundex/Soundex.xs Text::Soundex extension external subroutines +ext/Text/Soundex/t/Soundex.t test for Text::Soundex ext/Thread/create.tx Test thread creation ext/Thread/die2.tx Test thread die() differently ext/Thread/die.tx Test thread die() @@ -2484,8 +2490,6 @@ lib/Text/Balanced/t/pod.t See if Text::Balanced works lib/Text/ParseWords.pm Perl module to split words on arbitrary delimiter lib/Text/ParseWords.t See if Text::ParseWords works lib/Text/ParseWords/taint.t See if Text::ParseWords works with tainting -lib/Text/Soundex.pm Perl module to implement Soundex -lib/Text/Soundex.t See if Soundex works lib/Text/Tabs.pm Do expand and unexpand lib/Text/TabsWrap/CHANGELOG ChangeLog for Tabs+Wrap lib/Text/TabsWrap/t/37000.t See if Text::Tabs is working diff --git a/ext/Text/Soundex/Changes b/ext/Text/Soundex/Changes new file mode 100644 index 0000000..41c78b1 --- /dev/null +++ b/ext/Text/Soundex/Changes @@ -0,0 +1,39 @@ +Revision history for Perl extension Text::Soundex. + +3.02 Sun Feb 02 02:54:00 EST 2003 + +The U8 type was over-used in 3.00 and 3.01. Now, "U8 *" is used only as a +pointer into the UTF-8 string. Also, unicode now works properly on +Perl 5.6.x as the utf8_to_uv() function is used instead of utf8n_to_uvchr() +when compiled under a version of Perl earlier than 5.8.0. + +3.01 Sun Jan 26 16:30:00 EST 2003 + +A bug with non-UTF 8 strings that contain non-ASCII alphabetic characters +was fixed. The soundex_unicode() and soundex_nara_unicode() wrapper +routines were included and the documentation refers the user to the +excellent Text::Unidecode module to perform soundex encodings using +unicode strings. The Perl versions of the routines have been further +optimized, and correct a border case involving non-alphabetic characters +at the beginning of the string. + +3.00 Sun Jan 26 04:08:00 EST 2003 + +Updated documentation, simplified the Perl interface, and updated +the XS code to be faster, and to properly work with UTF-8 strings. +UNICODE characters outside the ASCII range (0x00 - 0x7F) are +considered to be non-alphabetic for the purposes of the soundex +algorithms. + +2.10 Sun Feb 15 15:29:38 EST 1998 + +I've put in a version of my XS code and fully integrated it with the +existing 100% perl mechanism. The change should be virtually transparent +to the user. XS code is approx 7.5 times faster. + - Mark Mielke + +2.00 Thu Jan 1 16:22:11 1998 + +Incorporated Mark Mielke's rewritten version of the main soundex routine +and made the test.pl file simpler. + diff --git a/ext/Text/Soundex/Makefile.PL b/ext/Text/Soundex/Makefile.PL new file mode 100644 index 0000000..ea757f1 --- /dev/null +++ b/ext/Text/Soundex/Makefile.PL @@ -0,0 +1,11 @@ +use ExtUtils::MakeMaker; +WriteMakefile( + NAME => "Text::Soundex", + VERSION_FROM => 'Soundex.pm', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, +); diff --git a/ext/Text/Soundex/README b/ext/Text/Soundex/README new file mode 100644 index 0000000..7fcf22e --- /dev/null +++ b/ext/Text/Soundex/README @@ -0,0 +1,161 @@ +Text::Soundex Version 3.02 + +NOTE: Users of Text::Soundex Version 2.x should consult the 'History' + section at the end of this document before installing this module. + The interface has been simplified, and existing code that takes + advantages of Version 2.x features may need to be altered to function + properly. + +This is a perl 5 module implementing the Soundex algorithm described by +Knuth. The algorithm is used quite often for locating a person by name +where the actual spelling of the name is not known. + +This version directly supercedes the version of Text::Soundex that can be +found in the core from Perl 5.8.0 and down. (This version is a drop-in +replacement) + +The algorithm used by soundex() is NOT fully compatible with the +algorithm used to index names for US Censuses. Use the soundex_nara() +subroutine to return codes for this purpose. + +Basic Usage: + + Soundex is used to do a one way transformation of a name, converting + a character string given as input into a set of codes representing + the identifiable sounds those characters might make in the output. + + For example: + + use Text::Soundex; + + print soundex("Mark"), "\n"; # prints: M620 + print soundex("Marc"), "\n"; # prints: M620 + + print soundex("Hansen"), "\n"; # prints: H525 + print soundex("Hanson"), "\n"; # prints: H525 + print soundex("Henson"), "\n"; # prints: H525 + + In many situations, code such as the following: + + if ($name1 eq $name2) { + ... + } + + Can be substituted with: + + if (soundex($name1) eq soundex($name2)) { + ... + } + +Installation: + + Once the archive has been unpacked then the following steps are needed + to build, test and install the module (to be done in the directory which + contains the Makefile.PL) + + perl Makefile.PL + make + make test + + If the make test succeeds then the next step may need to be run as root + (on a Unix-like system) or with special privileges on other systems. + + make install + + If you do not want to use the XS code (for whatever reason) do the following + instead of the above: + + perl Makefile.PL --no-xs + make + make test + make install + + If any of the tests report 'not ok' and you are running perl 5.6.0 or later + then please contact Mark Mielke + +History: + + Version 3.02: + 3.01 and 3.00 used the 'U8' type incorrectly causing some strict + compilers to complain or refuse to compile the XS code. Also, unicode + support did not work properly for Perl 5.6.x. Both of these problems + are now fixed. + + Version 3.01: + A bug with non-UTF 8 strings that contain non-ASCII alphabetic characters + was fixed. The soundex_unicode() and soundex_nara_unicode() wrapper + routines were included and the documentation refers the user to the + excellent Text::Unidecode module to perform soundex encodings using + unicode strings. The Perl versions of the routines have been further + optimized, and correct a border case involving non-alphabetic characters + at the beginning of the string. + + Version 3.00: + Support for UTF-8 strings (unicode strings) is now in place. Note + that this allows UTF-8 strings to be passed to the XS version of + the soundex() routine. The Soundex algorithm treats characters + outside the ascii range (0x00 - 0x7F) as if they were not + alphabetical. + + The interface has been simplified. In order to explicitly use the + non-XS implementation of soundex(): + + use Text::Soundex (); + $code = Text::Soundex::soundex_noxs($name); + + In order to use the NARA soundex algorithm: + + use Text::Soundex 'soundex_nara'; + $code = soundex_nara($name); + + Use of the ':NARA-Ruleset' import directive is now obsolete. To + emulate the old behaviour: + + use Text::Soundex (); + *soundex = \&Text::Soundex::soundex_nara; + $code = soundex($name); + + Version 2.20: + This version includes support for the algorithm used to index + the U.S. Federal Censuses. There is a slight descrepancy in the + definition for a soundex code which is not commonly known or + recognized involved similar sounding letters being seperated + by the characters H or W. This is defined as the NARA ruleset, + as this descrepency was discovered by them. (Calling it "the + US Census ruleset" was too unwieldy...) + + NARA can be found at: + http://www.nara.gov/genealogy/ + + The algorithm requested by NARA can be found at: + http://home.utah-inter.net/kinsearch/Soundex.html + + Ways to use it in your code: + + Transparently change existing code like this: + ============================================= + use Text::Soundex qw(:NARA-Ruleset); + + ... soundex(...) ... + + -- + + Make the change visibly distinct like this: + =========================================== + use Text::Soundex qw(soundex_nara); + + ... soundex_nara(...) ... + + Version 2.00: + This version is a full re-write of the 1.0 engine by Mark Mielke. + The goal was for speed... and this was achieved. There is an optional + XS module which can be used completely transparently by the user + which offers a further speed increase of a factor of more than 7.5X. + + Version 1.00: + This version can be found in the perl core distribution from at + least Perl 5.8.0 and down. It was written by Mike Stok. It can be + identified by the fact that it does not contain a $VERSION + in the beginning of the module, and as well it uses an RCS + tag with a version of 1.x. This version, before some perl5'ish + packaging was introduced, was actually written for perl4. diff --git a/lib/Text/Soundex.pm b/ext/Text/Soundex/Soundex.pm similarity index 100% rename from lib/Text/Soundex.pm rename to ext/Text/Soundex/Soundex.pm diff --git a/ext/Text/Soundex/Soundex.xs b/ext/Text/Soundex/Soundex.xs new file mode 100644 index 0000000..9f5d809 --- /dev/null +++ b/ext/Text/Soundex/Soundex.xs @@ -0,0 +1,157 @@ +/* -*- c -*- */ + +/* (c) Copyright 1998-2003 by Mark Mielke + * + * Freedom to use these sources for whatever you want, as long as credit + * is given where credit is due, is hereby granted. You may make modifications + * where you see fit but leave this copyright somewhere visible. As well try + * to initial any changes you make so that if i like the changes i can + * incorporate them into any later versions of mine. + * + * - Mark Mielke + */ + +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +#define SOUNDEX_ACCURACY (4) /* The maximum code length... (should be>=2) */ + +#if !(PERL_REVISION >= 5 && PERL_VERSION >= 8) +# define utf8n_to_uvchr utf8_to_uv +#endif + +static char *soundex_table = + /*ABCDEFGHIJKLMNOPQRSTUVWXYZ*/ + "01230120022455012623010202"; + +static SV *sv_soundex (source) + SV *source; +{ + char *source_p; + char *source_end; + + { + STRLEN source_len; + source_p = SvPV(source, source_len); + source_end = &source_p[source_len]; + } + + while (source_p != source_end) + { + if ((*source_p & ~((UV) 0x7F)) == 0 && isalpha(*source_p)) + { + SV *code = newSV(SOUNDEX_ACCURACY); + char *code_p = SvPVX(code); + char *code_end = &code_p[SOUNDEX_ACCURACY]; + char code_last; + + SvCUR_set(code, SOUNDEX_ACCURACY); + SvPOK_only(code); + + code_last = soundex_table[(*code_p++ = toupper(*source_p++)) - 'A']; + + while (source_p != source_end && code_p != code_end) + { + char c = *source_p++; + + if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) + { + *code_p = soundex_table[toupper(c) - 'A']; + if (*code_p != code_last && (code_last = *code_p) != '0') + code_p++; + } + } + + while (code_p != code_end) + *code_p++ = '0'; + + *code_end = '\0'; + + return code; + } + + source_p++; + } + + return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); +} + +static SV *sv_soundex_utf8 (source) + SV *source; +{ + U8 *source_p; + U8 *source_end; + + { + STRLEN source_len; + source_p = (U8 *) SvPV(source, source_len); + source_end = &source_p[source_len]; + } + + while (source_p < source_end) + { + STRLEN offset; + UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); + source_p = (offset >= 1) ? &source_p[offset] : source_end; + + if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) + { + SV *code = newSV(SOUNDEX_ACCURACY); + char *code_p = SvPVX(code); + char *code_end = &code_p[SOUNDEX_ACCURACY]; + char code_last; + + SvCUR_set(code, SOUNDEX_ACCURACY); + SvPOK_only(code); + + code_last = soundex_table[(*code_p++ = toupper(c)) - 'A']; + + while (source_p != source_end && code_p != code_end) + { + c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); + source_p = (offset >= 1) ? &source_p[offset] : source_end; + + if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) + { + *code_p = soundex_table[toupper(c) - 'A']; + if (*code_p != code_last && (code_last = *code_p) != '0') + code_p++; + } + } + + while (code_p != code_end) + *code_p++ = '0'; + + *code_end = '\0'; + + return code; + } + + source_p++; + } + + return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); +} + +MODULE = Text::Soundex PACKAGE = Text::Soundex + +PROTOTYPES: DISABLE + +void +soundex_xs (...) +PPCODE: +{ + int i; + for (i = 0; i < items; i++) + { + SV *sv = ST(i); + + if (DO_UTF8(sv)) + sv = sv_soundex_utf8(sv); + else + sv = sv_soundex(sv); + + PUSHs(sv_2mortal(sv)); + } +} diff --git a/lib/Text/Soundex.t b/ext/Text/Soundex/t/Soundex.t similarity index 100% rename from lib/Text/Soundex.t rename to ext/Text/Soundex/t/Soundex.t