Commit | Line | Data |
11f885b5 |
1 | /* -*- c -*- */ |
2 | |
3 | /* (c) Copyright 1998-2003 by Mark Mielke |
4 | * |
5 | * Freedom to use these sources for whatever you want, as long as credit |
6 | * is given where credit is due, is hereby granted. You may make modifications |
7 | * where you see fit but leave this copyright somewhere visible. As well try |
8 | * to initial any changes you make so that if i like the changes i can |
9 | * incorporate them into any later versions of mine. |
10 | * |
11 | * - Mark Mielke <mark@mielke.cc> |
12 | */ |
13 | |
14 | #include "EXTERN.h" |
15 | #include "perl.h" |
16 | #include "XSUB.h" |
17 | |
18 | #define SOUNDEX_ACCURACY (4) /* The maximum code length... (should be>=2) */ |
19 | |
20 | #if !(PERL_REVISION >= 5 && PERL_VERSION >= 8) |
21 | # define utf8n_to_uvchr utf8_to_uv |
22 | #endif |
23 | |
ef0f5379 |
24 | static char sv_soundex_table[0x100]; |
25 | static void sv_soundex_initialize (void) |
26 | { |
27 | memset(&sv_soundex_table[0], '\0', sizeof(sv_soundex_table)); |
28 | sv_soundex_table['A'] = '0'; |
29 | sv_soundex_table['a'] = '0'; |
30 | sv_soundex_table['E'] = '0'; |
31 | sv_soundex_table['e'] = '0'; |
32 | sv_soundex_table['H'] = '0'; |
33 | sv_soundex_table['h'] = '0'; |
34 | sv_soundex_table['I'] = '0'; |
35 | sv_soundex_table['i'] = '0'; |
36 | sv_soundex_table['O'] = '0'; |
37 | sv_soundex_table['o'] = '0'; |
38 | sv_soundex_table['U'] = '0'; |
39 | sv_soundex_table['u'] = '0'; |
40 | sv_soundex_table['W'] = '0'; |
41 | sv_soundex_table['w'] = '0'; |
42 | sv_soundex_table['Y'] = '0'; |
43 | sv_soundex_table['y'] = '0'; |
44 | sv_soundex_table['B'] = '1'; |
45 | sv_soundex_table['b'] = '1'; |
46 | sv_soundex_table['F'] = '1'; |
47 | sv_soundex_table['f'] = '1'; |
48 | sv_soundex_table['P'] = '1'; |
49 | sv_soundex_table['p'] = '1'; |
50 | sv_soundex_table['V'] = '1'; |
51 | sv_soundex_table['v'] = '1'; |
52 | sv_soundex_table['C'] = '2'; |
53 | sv_soundex_table['c'] = '2'; |
54 | sv_soundex_table['G'] = '2'; |
55 | sv_soundex_table['g'] = '2'; |
56 | sv_soundex_table['J'] = '2'; |
57 | sv_soundex_table['j'] = '2'; |
58 | sv_soundex_table['K'] = '2'; |
59 | sv_soundex_table['k'] = '2'; |
60 | sv_soundex_table['Q'] = '2'; |
61 | sv_soundex_table['q'] = '2'; |
62 | sv_soundex_table['S'] = '2'; |
63 | sv_soundex_table['s'] = '2'; |
64 | sv_soundex_table['X'] = '2'; |
65 | sv_soundex_table['x'] = '2'; |
66 | sv_soundex_table['Z'] = '2'; |
67 | sv_soundex_table['z'] = '2'; |
68 | sv_soundex_table['D'] = '3'; |
69 | sv_soundex_table['d'] = '3'; |
70 | sv_soundex_table['T'] = '3'; |
71 | sv_soundex_table['t'] = '3'; |
72 | sv_soundex_table['L'] = '4'; |
73 | sv_soundex_table['l'] = '4'; |
74 | sv_soundex_table['M'] = '5'; |
75 | sv_soundex_table['m'] = '5'; |
76 | sv_soundex_table['N'] = '5'; |
77 | sv_soundex_table['n'] = '5'; |
78 | sv_soundex_table['R'] = '6'; |
79 | sv_soundex_table['r'] = '6'; |
80 | } |
11f885b5 |
81 | |
81a4c762 |
82 | static SV *sv_soundex (SV *source) |
11f885b5 |
83 | { |
84 | char *source_p; |
85 | char *source_end; |
86 | |
87 | { |
88 | STRLEN source_len; |
89 | source_p = SvPV(source, source_len); |
90 | source_end = &source_p[source_len]; |
91 | } |
92 | |
93 | while (source_p != source_end) |
94 | { |
ef0f5379 |
95 | char codepart_last = sv_soundex_table[(unsigned char) *source_p]; |
96 | |
97 | if (codepart_last != '\0') |
11f885b5 |
98 | { |
99 | SV *code = newSV(SOUNDEX_ACCURACY); |
100 | char *code_p = SvPVX(code); |
101 | char *code_end = &code_p[SOUNDEX_ACCURACY]; |
11f885b5 |
102 | |
103 | SvCUR_set(code, SOUNDEX_ACCURACY); |
104 | SvPOK_only(code); |
105 | |
ef0f5379 |
106 | *code_p++ = toupper(*source_p++); |
11f885b5 |
107 | |
108 | while (source_p != source_end && code_p != code_end) |
109 | { |
110 | char c = *source_p++; |
ef0f5379 |
111 | char codepart = sv_soundex_table[(unsigned char) c]; |
11f885b5 |
112 | |
ef0f5379 |
113 | if (codepart != '\0') |
114 | if (codepart != codepart_last && (codepart_last = codepart) != '0') |
115 | *code_p++ = codepart; |
11f885b5 |
116 | } |
117 | |
118 | while (code_p != code_end) |
119 | *code_p++ = '0'; |
120 | |
121 | *code_end = '\0'; |
122 | |
123 | return code; |
124 | } |
125 | |
126 | source_p++; |
127 | } |
128 | |
129 | return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); |
130 | } |
131 | |
81a4c762 |
132 | static SV *sv_soundex_utf8 (SV* source) |
11f885b5 |
133 | { |
134 | U8 *source_p; |
135 | U8 *source_end; |
136 | |
137 | { |
138 | STRLEN source_len; |
139 | source_p = (U8 *) SvPV(source, source_len); |
140 | source_end = &source_p[source_len]; |
141 | } |
142 | |
143 | while (source_p < source_end) |
144 | { |
145 | STRLEN offset; |
146 | UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); |
ef0f5379 |
147 | char codepart_last = (c <= 0xFF) ? sv_soundex_table[c] : '\0'; |
11f885b5 |
148 | source_p = (offset >= 1) ? &source_p[offset] : source_end; |
149 | |
ef0f5379 |
150 | if (codepart_last != '\0') |
11f885b5 |
151 | { |
152 | SV *code = newSV(SOUNDEX_ACCURACY); |
153 | char *code_p = SvPVX(code); |
154 | char *code_end = &code_p[SOUNDEX_ACCURACY]; |
11f885b5 |
155 | |
156 | SvCUR_set(code, SOUNDEX_ACCURACY); |
157 | SvPOK_only(code); |
158 | |
ef0f5379 |
159 | *code_p++ = toupper(c); |
11f885b5 |
160 | |
161 | while (source_p != source_end && code_p != code_end) |
162 | { |
ef0f5379 |
163 | char codepart; |
11f885b5 |
164 | c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); |
ef0f5379 |
165 | codepart = (c <= 0xFF) ? sv_soundex_table[c] : '\0'; |
11f885b5 |
166 | source_p = (offset >= 1) ? &source_p[offset] : source_end; |
167 | |
ef0f5379 |
168 | if (codepart != '\0') |
169 | if (codepart != codepart_last && (codepart_last = codepart) != '0') |
170 | *code_p++ = codepart; |
11f885b5 |
171 | } |
172 | |
173 | while (code_p != code_end) |
174 | *code_p++ = '0'; |
175 | |
176 | *code_end = '\0'; |
177 | |
178 | return code; |
179 | } |
180 | |
181 | source_p++; |
182 | } |
183 | |
184 | return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); |
185 | } |
186 | |
187 | MODULE = Text::Soundex PACKAGE = Text::Soundex |
188 | |
189 | PROTOTYPES: DISABLE |
190 | |
191 | void |
192 | soundex_xs (...) |
ef0f5379 |
193 | INIT: |
194 | { |
195 | sv_soundex_initialize(); |
196 | } |
11f885b5 |
197 | PPCODE: |
198 | { |
199 | int i; |
200 | for (i = 0; i < items; i++) |
201 | { |
202 | SV *sv = ST(i); |
203 | |
204 | if (DO_UTF8(sv)) |
205 | sv = sv_soundex_utf8(sv); |
206 | else |
207 | sv = sv_soundex(sv); |
208 | |
209 | PUSHs(sv_2mortal(sv)); |
210 | } |
211 | } |