Commit | Line | Data |
ddd87d75 |
1 | package Lingua::EN::Inflect; |
2 | |
3 | use strict; |
4 | use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA); |
5 | use Env; |
6 | |
7 | require Exporter; |
8 | @ISA = qw(Exporter); |
9 | |
10 | $VERSION = '1.89'; |
11 | |
12 | %EXPORT_TAGS = |
13 | ( |
14 | ALL => [ qw( classical inflect |
15 | PL PL_N PL_V PL_ADJ NO NUM A AN |
16 | PL_eq PL_N_eq PL_V_eq PL_ADJ_eq |
17 | PART_PRES |
18 | ORD |
19 | NUMWORDS |
20 | def_noun def_verb def_adj def_a def_an )], |
21 | |
22 | INFLECTIONS => [ qw( classical inflect |
23 | PL PL_N PL_V PL_ADJ PL_eq |
24 | NO NUM A AN PART_PRES )], |
25 | |
26 | PLURALS => [ qw( classical inflect |
27 | PL PL_N PL_V PL_ADJ NO NUM |
28 | PL_eq PL_N_eq PL_V_eq PL_ADJ_eq )], |
29 | |
30 | COMPARISONS => [ qw( classical |
31 | PL_eq PL_N_eq PL_V_eq PL_ADJ_eq )], |
32 | |
33 | ARTICLES => [ qw( classical inflect NUM A AN )], |
34 | |
35 | NUMERICAL => [ qw( ORD NUMWORDS )], |
36 | |
37 | USER_DEFINED => [ qw( def_noun def_verb def_adj def_a def_an )], |
38 | ); |
39 | |
40 | Exporter::export_ok_tags(qw( ALL )); |
41 | |
42 | # SUPPORT CLASSICAL PLURALIZATIONS |
43 | |
44 | my %def_classical = ( |
45 | all => 0, |
46 | zero => 0, |
47 | herd => 0, |
48 | names => 1, |
49 | persons => 0, |
50 | ancient => 0, |
51 | ); |
52 | |
53 | my %all_classical = ( |
54 | all => 1, |
55 | zero => 1, |
56 | herd => 1, |
57 | names => 1, |
58 | persons => 1, |
59 | ancient => 1, |
60 | ); |
61 | |
62 | my %classical = %def_classical; |
63 | |
64 | my $classical_mode = join '|', keys %all_classical; |
65 | $classical_mode = qr/^(?:$classical_mode)$/; |
66 | |
67 | sub classical |
68 | { |
69 | if (!@_) { |
70 | %classical = %all_classical; |
71 | return; |
72 | } |
73 | if (@_==1 && $_[0] !~ $classical_mode) { |
74 | %classical = $_[0] ? %all_classical : (); |
75 | return; |
76 | } |
77 | while (@_) { |
78 | my $arg = shift; |
79 | if ($arg !~ $classical_mode) { |
80 | die "Unknown classical mode ($arg)\n"; |
81 | } |
82 | if (@_ && $_[0] !~ $classical_mode) { $classical{$arg} = shift; } |
83 | else { $classical{$arg} = 1; } |
84 | |
85 | if ($arg eq 'all') { |
86 | %classical = $classical{all} ? %all_classical : (); |
87 | } |
88 | } |
89 | } |
90 | |
91 | my $persistent_count; |
92 | |
93 | sub NUM # (;$count,$show) |
94 | { |
95 | if (defined $_[0]) |
96 | { |
97 | $persistent_count = $_[0]; |
98 | return $_[0] if !defined($_[1]) || $_[1]; |
99 | } |
100 | else |
101 | { |
102 | $persistent_count = undef; |
103 | } |
104 | return ''; |
105 | } |
106 | |
107 | |
108 | # 0. PERFORM GENERAL INFLECTIONS IN A STRING |
109 | |
110 | sub enclose { "(?:$_[0])" } |
111 | |
112 | sub inflect |
113 | { |
114 | my $save_persistent_count = $persistent_count; |
115 | my @sections = split /(NUM\([^)]*\))/, $_[0]; |
116 | my $inflection = ""; |
117 | |
118 | foreach ( @sections ) |
119 | { |
120 | unless (s/NUM\(\s*?(?:([^),]*)(?:,([^)]*))?)?\)/ NUM($1,$2) /xe) |
121 | { |
122 | 1 while |
123 | s/\bPL \( ([^),]*) (, ([^)]*) )? \) / PL($1,$3) /xeg |
124 | || s/\bPL_N \( ([^),]*) (, ([^)]*) )? \) / PL_N($1,$3) /xeg |
125 | || s/\bPL_V \( ([^),]*) (, ([^)]*) )? \) / PL_V($1,$3) /xeg |
126 | || s/\bPL_ADJ \( ([^),]*) (, ([^)]*) )? \) / PL_ADJ($1,$3) /xeg |
127 | || s/\bAN? \( ([^),]*) (, ([^)]*) )? \) / A($1,$3) /xeg |
128 | || s/\bNO \( ([^),]*) (, ([^)]*) )? \) / NO($1,$3) /xeg |
129 | || s/\bORD \( ([^)]*) \) / ORD($1) /xeg |
130 | || s/\bNUMWORDS \( ([^)]*) \) / NUMWORDS($1) /xeg |
131 | || s/\bPART_PRES \( ([^)]*) \) / PART_PRES($1) /xeg |
132 | } |
133 | |
134 | $inflection .= $_; |
135 | } |
136 | |
137 | $persistent_count = $save_persistent_count; |
138 | return $inflection; |
139 | } |
140 | |
141 | |
142 | # 1. PLURALS |
143 | |
144 | my %PL_sb_irregular_s = |
145 | ( |
146 | "corpus" => "corpuses|corpora", |
147 | "opus" => "opuses|opera", |
148 | "genus" => "genera", |
149 | "mythos" => "mythoi", |
150 | "penis" => "penises|penes", |
151 | "testis" => "testes", |
152 | "atlas" => "atlases|atlantes", |
153 | ); |
154 | |
155 | my %PL_sb_irregular = |
156 | ( |
157 | "child" => "children", |
158 | "brother" => "brothers|brethren", |
159 | "loaf" => "loaves", |
160 | "hoof" => "hoofs|hooves", |
161 | "beef" => "beefs|beeves", |
162 | "money" => "monies", |
163 | "mongoose" => "mongooses", |
164 | "ox" => "oxen", |
165 | "cow" => "cows|kine", |
166 | "soliloquy" => "soliloquies", |
167 | "graffito" => "graffiti", |
168 | "prima donna" => "prima donnas|prime donne", |
169 | "octopus" => "octopuses|octopodes", |
170 | "genie" => "genies|genii", |
171 | "ganglion" => "ganglions|ganglia", |
172 | "trilby" => "trilbys", |
173 | "turf" => "turfs|turves", |
174 | "numen" => "numina", |
175 | "atman" => "atmas", |
176 | "occiput" => "occiputs|occipita", |
177 | |
178 | %PL_sb_irregular_s, |
179 | ); |
180 | |
181 | my $PL_sb_irregular = enclose join '|', keys %PL_sb_irregular; |
182 | |
183 | # CLASSICAL "..is" -> "..ides" |
184 | |
185 | my @PL_sb_C_is_ides = |
186 | ( |
187 | # GENERAL WORDS... |
188 | |
189 | "ephemeris", "iris", "clitoris", |
190 | "chrysalis", "epididymis", |
191 | |
192 | # INFLAMATIONS... |
193 | |
194 | ".*itis", |
195 | |
196 | ); |
197 | |
198 | my $PL_sb_C_is_ides = enclose join "|", map { substr($_,0,-2) } @PL_sb_C_is_ides; |
199 | |
200 | # CLASSICAL "..a" -> "..ata" |
201 | |
202 | my @PL_sb_C_a_ata = |
203 | ( |
204 | "anathema", "bema", "carcinoma", "charisma", "diploma", |
205 | "dogma", "drama", "edema", "enema", "enigma", "lemma", |
206 | "lymphoma", "magma", "melisma", "miasma", "oedema", |
207 | "sarcoma", "schema", "soma", "stigma", "stoma", "trauma", |
208 | "gumma", "pragma", |
209 | ); |
210 | |
211 | my $PL_sb_C_a_ata = enclose join "|", map { substr($_,0,-1) } @PL_sb_C_a_ata; |
212 | |
213 | # UNCONDITIONAL "..a" -> "..ae" |
214 | |
215 | my $PL_sb_U_a_ae = enclose join "|", |
216 | ( |
217 | "alumna", "alga", "vertebra", "persona" |
218 | ); |
219 | |
220 | # CLASSICAL "..a" -> "..ae" |
221 | |
222 | my $PL_sb_C_a_ae = enclose join "|", |
223 | ( |
224 | "amoeba", "antenna", "formula", "hyperbola", |
225 | "medusa", "nebula", "parabola", "abscissa", |
226 | "hydra", "nova", "lacuna", "aurora", ".*umbra", |
227 | "flora", "fauna", |
228 | ); |
229 | |
230 | # CLASSICAL "..en" -> "..ina" |
231 | |
232 | my $PL_sb_C_en_ina = enclose join "|", map { substr($_,0,-2) } |
233 | ( |
234 | "stamen", "foramen", "lumen" |
235 | ); |
236 | |
237 | # UNCONDITIONAL "..um" -> "..a" |
238 | |
239 | my $PL_sb_U_um_a = enclose join "|", map { substr($_,0,-2) } |
240 | ( |
241 | "bacterium", "agendum", "desideratum", "erratum", |
242 | "stratum", "datum", "ovum", "extremum", |
243 | "candelabrum", |
244 | ); |
245 | |
246 | # CLASSICAL "..um" -> "..a" |
247 | |
248 | my $PL_sb_C_um_a = enclose join "|", map { substr($_,0,-2) } |
249 | ( |
250 | "maximum", "minimum", "momentum", "optimum", |
251 | "quantum", "cranium", "curriculum", "dictum", |
252 | "phylum", "aquarium", "compendium", "emporium", |
253 | "enconium", "gymnasium", "honorarium", "interregnum", |
254 | "lustrum", "memorandum", "millennium", "rostrum", |
255 | "spectrum", "speculum", "stadium", "trapezium", |
256 | "ultimatum", "medium", "vacuum", "velum", |
257 | "consortium", |
258 | ); |
259 | |
260 | # UNCONDITIONAL "..us" -> "i" |
261 | |
262 | my $PL_sb_U_us_i = enclose join "|", map { substr($_,0,-2) } |
263 | ( |
264 | "alumnus", "alveolus", "bacillus", "bronchus", |
265 | "locus", "nucleus", "stimulus", "meniscus", |
266 | ); |
267 | |
268 | # CLASSICAL "..us" -> "..i" |
269 | |
270 | my $PL_sb_C_us_i = enclose join "|", map { substr($_,0,-2) } |
271 | ( |
272 | "focus", "radius", "genius", |
273 | "incubus", "succubus", "nimbus", |
274 | "fungus", "nucleolus", "stylus", |
275 | "torus", "umbilicus", "uterus", |
276 | "hippopotamus", |
277 | ); |
278 | |
279 | # CLASSICAL "..us" -> "..us" (ASSIMILATED 4TH DECLENSION LATIN NOUNS) |
280 | |
281 | my $PL_sb_C_us_us = enclose join "|", |
282 | ( |
283 | "status", "apparatus", "prospectus", "sinus", |
284 | "hiatus", "impetus", "plexus", |
285 | ); |
286 | |
287 | # UNCONDITIONAL "..on" -> "a" |
288 | |
289 | my $PL_sb_U_on_a = enclose join "|", map { substr($_,0,-2) } |
290 | ( |
291 | "criterion", "perihelion", "aphelion", |
292 | "phenomenon", "prolegomenon", "noumenon", |
293 | "organon", "asyndeton", "hyperbaton", |
294 | ); |
295 | |
296 | # CLASSICAL "..on" -> "..a" |
297 | |
298 | my $PL_sb_C_on_a = enclose join "|", map { substr($_,0,-2) } |
299 | ( |
300 | "oxymoron", |
301 | ); |
302 | |
303 | # CLASSICAL "..o" -> "..i" (BUT NORMALLY -> "..os") |
304 | |
305 | my @PL_sb_C_o_i = |
306 | ( |
307 | "solo", "soprano", "basso", "alto", |
308 | "contralto", "tempo", "piano", "virtuoso", |
309 | ); |
310 | my $PL_sb_C_o_i = enclose join "|", map { substr($_,0,-1) } @PL_sb_C_o_i; |
311 | |
312 | # ALWAYS "..o" -> "..os" |
313 | |
314 | my $PL_sb_U_o_os = enclose join "|", |
315 | ( |
316 | "albino", "archipelago", "armadillo", |
317 | "commando", "crescendo", "fiasco", |
318 | "ditto", "dynamo", "embryo", |
319 | "ghetto", "guano", "inferno", |
320 | "jumbo", "lumbago", "magneto", |
321 | "manifesto", "medico", "octavo", |
322 | "photo", "pro", "quarto", |
323 | "canto", "lingo", "generalissimo", |
324 | "stylo", "rhino", "casino", |
325 | "auto", "macro", 'zero', |
326 | |
327 | @PL_sb_C_o_i, |
328 | ); |
329 | |
330 | |
331 | # UNCONDITIONAL "..[ei]x" -> "..ices" |
332 | |
333 | my $PL_sb_U_ex_ices = enclose join "|", map { substr($_,0,-2) } |
334 | ( |
335 | "codex", "murex", "silex", |
336 | ); |
337 | |
338 | my $PL_sb_U_ix_ices = enclose join "|", map { substr($_,0,-2) } |
339 | ( |
340 | "radix", "helix", |
341 | ); |
342 | |
343 | # CLASSICAL "..[ei]x" -> "..ices" |
344 | |
345 | my $PL_sb_C_ex_ices = enclose join "|", map { substr($_,0,-2) } |
346 | ( |
347 | "vortex", "vertex", "cortex", "latex", |
348 | "pontifex", "apex", "index", "simplex", |
349 | ); |
350 | |
351 | my $PL_sb_C_ix_ices = enclose join "|", map { substr($_,0,-2) } |
352 | ( |
353 | "appendix", |
354 | ); |
355 | |
356 | # ARABIC: ".." -> "..i" |
357 | |
358 | my $PL_sb_C_i = enclose join "|", |
359 | ( |
360 | "afrit", "afreet", "efreet", |
361 | ); |
362 | |
363 | # HEBREW: ".." -> "..im" |
364 | |
365 | my $PL_sb_C_im = enclose join "|", |
366 | ( |
367 | "goy", "seraph", "cherub", |
368 | ); |
369 | |
370 | # UNCONDITIONAL "..man" -> "..mans" |
371 | |
372 | my $PL_sb_U_man_mans = enclose join "|", |
373 | qw( |
374 | human |
375 | Alabaman Bahaman Burman German |
376 | Hiroshiman Liman Nakayaman Oklahoman |
377 | Panaman Selman Sonaman Tacoman Yakiman |
378 | Yokohaman Yuman |
379 | ); |
380 | |
381 | my @PL_sb_uninflected_s = |
382 | ( |
383 | # PAIRS OR GROUPS SUBSUMED TO A SINGULAR... |
384 | "breeches", "britches", "clippers", "gallows", "hijinks", |
385 | "headquarters", "pliers", "scissors", "testes", "herpes", |
386 | "pincers", "shears", "proceedings", "trousers", |
387 | |
388 | # UNASSIMILATED LATIN 4th DECLENSION |
389 | |
390 | "cantus", "coitus", "nexus", |
391 | |
392 | # RECENT IMPORTS... |
393 | "contretemps", "corps", "debris", |
394 | ".*ois", "siemens", |
395 | |
396 | # DISEASES |
397 | ".*measles", "mumps", |
398 | |
399 | # MISCELLANEOUS OTHERS... |
400 | "diabetes", "jackanapes", "series", "species", "rabies", |
401 | "chassis", "innings", "news", "mews", |
402 | ); |
403 | |
404 | my $PL_sb_uninflected_herd = enclose join "|", |
405 | # DON'T INFLECT IN CLASSICAL MODE, OTHERWISE NORMAL INFLECTION |
406 | ( |
407 | "wildebeest", "swine", "eland", "bison", "buffalo", |
408 | "elk", "moose", "rhinoceros", |
409 | ); |
410 | |
411 | my $PL_sb_uninflected = enclose join "|", |
412 | ( |
413 | # SOME FISH AND HERD ANIMALS |
414 | ".*fish", "tuna", "salmon", "mackerel", "trout", |
415 | "bream", "sea[- ]bass", "carp", "cod", "flounder", "whiting", |
416 | |
417 | ".*deer", ".*sheep", |
418 | |
419 | # ALL NATIONALS ENDING IN -ese |
420 | "Portuguese", "Amoyese", "Borghese", "Congoese", "Faroese", |
421 | "Foochowese", "Genevese", "Genoese", "Gilbertese", "Hottentotese", |
422 | "Kiplingese", "Kongoese", "Lucchese", "Maltese", "Nankingese", |
423 | "Niasese", "Pekingese", "Piedmontese", "Pistoiese", "Sarawakese", |
424 | "Shavese", "Vermontese", "Wenchowese", "Yengeese", |
425 | ".*[nrlm]ese", |
426 | |
427 | # SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE) |
428 | |
429 | @PL_sb_uninflected_s, |
430 | |
431 | # DISEASES |
432 | ".*pox", |
433 | |
434 | |
435 | # OTHER ODDITIES |
436 | "graffiti", "djinn" |
437 | ); |
438 | |
439 | # SINGULAR WORDS ENDING IN ...s (ALL INFLECT WITH ...es) |
440 | |
441 | my $PL_sb_singular_s = enclose join '|', |
442 | ( |
443 | ".*ss", |
444 | "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", |
445 | "bronchitis", "bursitis", "caddis", "cannabis", |
446 | "canvas", "chaos", "cosmos", "dais", "digitalis", |
447 | "epidermis", "ethos", "eyas", "gas", "glottis", |
448 | "hubris", "ibis", "lens", "mantis", "marquis", "metropolis", |
449 | "pathos", "pelvis", "polis", "rhinoceros", |
450 | "sassafras", "trellis", ".*us", "[A-Z].*es", |
451 | |
452 | @PL_sb_C_is_ides, |
453 | ); |
454 | |
455 | my $PL_v_special_s = enclose join '|', |
456 | ( |
457 | $PL_sb_singular_s, |
458 | @PL_sb_uninflected_s, |
459 | keys %PL_sb_irregular_s, |
460 | '(.*[csx])is', |
461 | '(.*)ceps', |
462 | '[A-Z].*s', |
463 | ); |
464 | |
465 | my %PL_sb_postfix_adj = ( |
466 | 'general' => ['(?!major|lieutenant|brigadier|adjutant)\S+'], |
467 | 'martial' => [qw(court)], |
468 | ); |
469 | |
470 | foreach (keys %PL_sb_postfix_adj) { |
471 | $PL_sb_postfix_adj{$_} = enclose |
472 | enclose(join('|', @{$PL_sb_postfix_adj{$_}})) |
473 | . "(?=(?:-|\\s+)$_)"; |
474 | } |
475 | |
476 | my $PL_sb_postfix_adj = '(' . join('|', values %PL_sb_postfix_adj) . ')(.*)'; |
477 | |
478 | my $PL_sb_military = 'major|lieutenant|brigadier|adjutant|quartermaster'; |
479 | my $PL_sb_general = '((?!'.$PL_sb_military.').*?)((-|\s+)general)'; |
480 | |
481 | my $PL_prep = enclose join '|', qw ( |
482 | about above across after among around at athwart before behind |
483 | below beneath beside besides between betwixt beyond but by |
484 | during except for from in into near of off on onto out over |
485 | since till to under until unto upon with |
486 | ); |
487 | |
488 | my $PL_sb_prep_dual_compound = '(.*?)((?:-|\s+)(?:'.$PL_prep.'|d[eu])(?:-|\s+))a(?:-|\s+)(.*)'; |
489 | |
490 | my $PL_sb_prep_compound = '(.*?)((-|\s+)('.$PL_prep.'|d[eu])((-|\s+)(.*))?)'; |
491 | |
492 | |
493 | my %PL_pron_nom = |
494 | ( |
495 | # NOMINATIVE REFLEXIVE |
496 | |
497 | "i" => "we", "myself" => "ourselves", |
498 | "you" => "you", "yourself" => "yourselves", |
499 | "she" => "they", "herself" => "themselves", |
500 | "he" => "they", "himself" => "themselves", |
501 | "it" => "they", "itself" => "themselves", |
502 | "they" => "they", "themself" => "themselves", |
503 | |
504 | # POSSESSIVE |
505 | |
506 | "mine" => "ours", |
507 | "yours" => "yours", |
508 | "hers" => "theirs", |
509 | "his" => "theirs", |
510 | "its" => "theirs", |
511 | "theirs" => "theirs", |
512 | ); |
513 | |
514 | my %PL_pron_acc = |
515 | ( |
516 | # ACCUSATIVE REFLEXIVE |
517 | |
518 | "me" => "us", "myself" => "ourselves", |
519 | "you" => "you", "yourself" => "yourselves", |
520 | "her" => "them", "herself" => "themselves", |
521 | "him" => "them", "himself" => "themselves", |
522 | "it" => "them", "itself" => "themselves", |
523 | "them" => "them", "themself" => "themselves", |
524 | ); |
525 | |
526 | my $PL_pron_acc = enclose join '|', keys %PL_pron_acc; |
527 | |
528 | my %PL_v_irregular_pres = |
529 | ( |
530 | # 1st PERS. SING. 2ND PERS. SING. 3RD PERS. SINGULAR |
531 | # 3RD PERS. (INDET.) |
532 | |
533 | "am" => "are", "are" => "are", "is" => "are", |
534 | "was" => "were", "were" => "were", "was" => "were", |
535 | "have" => "have", "have" => "have", "has" => "have", |
536 | "do" => "do", "do" => "do", "does" => "do", |
537 | ); |
538 | |
539 | my $PL_v_irregular_pres = enclose join '|', keys %PL_v_irregular_pres; |
540 | |
541 | my %PL_v_ambiguous_pres = |
542 | ( |
543 | # 1st PERS. SING. 2ND PERS. SING. 3RD PERS. SINGULAR |
544 | # 3RD PERS. (INDET.) |
545 | |
546 | "act" => "act", "act" => "act", "acts" => "act", |
547 | "blame" => "blame", "blame" => "blame", "blames" => "blame", |
548 | "can" => "can", "can" => "can", "can" => "can", |
549 | "must" => "must", "must" => "must", "must" => "must", |
550 | "fly" => "fly", "fly" => "fly", "flies" => "fly", |
551 | "copy" => "copy", "copy" => "copy", "copies" => "copy", |
552 | "drink" => "drink", "drink" => "drink", "drinks" => "drink", |
553 | "fight" => "fight", "fight" => "fight", "fights" => "fight", |
554 | "fire" => "fire", "fire" => "fire", "fires" => "fire", |
555 | "like" => "like", "like" => "like", "likes" => "like", |
556 | "look" => "look", "look" => "look", "looks" => "look", |
557 | "make" => "make", "make" => "make", "makes" => "make", |
558 | "reach" => "reach", "reach" => "reach", "reaches" => "reach", |
559 | "run" => "run", "run" => "run", "runs" => "run", |
560 | "sink" => "sink", "sink" => "sink", "sinks" => "sink", |
561 | "sleep" => "sleep", "sleep" => "sleep", "sleeps" => "sleep", |
562 | "view" => "view", "view" => "view", "views" => "view", |
563 | ); |
564 | |
565 | my $PL_v_ambiguous_pres = enclose join '|', keys %PL_v_ambiguous_pres; |
566 | |
567 | |
568 | my $PL_v_irregular_non_pres = enclose join '|', |
569 | ( |
570 | "did", "had", "ate", "made", "put", |
571 | "spent", "fought", "sank", "gave", "sought", |
572 | "shall", "could", "ought", "should", |
573 | ); |
574 | |
575 | my $PL_v_ambiguous_non_pres = enclose join '|', |
576 | ( |
577 | "thought", "saw", "bent", "will", "might", "cut", |
578 | ); |
579 | |
580 | # "..oes" -> "..oe" (the rest are "..oes" -> "o") |
581 | |
582 | my $PL_v_oes_oe = enclose join "|", |
583 | qw( |
584 | .*shoes .*hoes .*toes |
585 | canoes floes oboes roes throes woes |
586 | ); |
587 | |
588 | my $PL_count_zero = enclose join '|', |
589 | ( |
590 | 0, "no", "zero", "nil" |
591 | ); |
592 | |
593 | my $PL_count_one = enclose join '|', |
594 | ( |
595 | 1, "a", "an", "one", "each", "every", "this", "that", |
596 | ); |
597 | |
598 | my %PL_adj_special = |
599 | ( |
600 | "a" => "some", "an" => "some", |
601 | "this" => "these", "that" => "those", |
602 | ); |
603 | my $PL_adj_special = enclose join '|', keys %PL_adj_special; |
604 | |
605 | my %PL_adj_poss = |
606 | ( |
607 | "my" => "our", |
608 | "your" => "your", |
609 | "its" => "their", |
610 | "her" => "their", |
611 | "his" => "their", |
612 | "their" => "their", |
613 | ); |
614 | my $PL_adj_poss = enclose join '|', keys %PL_adj_poss; |
615 | |
616 | |
617 | sub checkpat |
618 | { |
619 | local $SIG{__WARN__} = sub {0}; |
620 | do {$@ =~ s/at.*?$//; |
621 | die "\nBad user-defined singular pattern:\n\t$@\n"} |
622 | if (!eval "'' =~ m/$_[0]/; 1;" or $@); |
623 | return @_; |
624 | } |
625 | |
626 | sub checkpatsubs |
627 | { |
628 | checkpat($_[0]); |
629 | if (defined $_[1]) |
630 | { |
631 | local $SIG{__WARN__} = sub {0}; |
632 | do {$@ =~ s/at.*?$//; |
633 | die "\nBad user-defined plural string: '$_[1]'\n\t$@\n"} |
634 | if (!eval "qq{$_[1]}; 1;" or $@); |
635 | } |
636 | return @_; |
637 | } |
638 | |
639 | my @PL_sb_user_defined = (); |
640 | my @PL_v_user_defined = (); |
641 | my @PL_adj_user_defined = (); |
642 | my @A_a_user_defined = (); |
643 | |
644 | sub def_noun |
645 | { |
646 | unshift @PL_sb_user_defined, checkpatsubs(@_); |
647 | return 1; |
648 | } |
649 | |
650 | sub def_verb |
651 | { |
652 | unshift @PL_v_user_defined, checkpatsubs(@_[4,5]); |
653 | unshift @PL_v_user_defined, checkpatsubs(@_[2,3]); |
654 | unshift @PL_v_user_defined, checkpatsubs(@_[0,1]); |
655 | return 1; |
656 | } |
657 | |
658 | sub def_adj |
659 | { |
660 | unshift @PL_adj_user_defined, checkpatsubs(@_); |
661 | return 1; |
662 | } |
663 | |
664 | sub def_a |
665 | { |
666 | unshift @A_a_user_defined, checkpat(@_,'a'); |
667 | return 1; |
668 | } |
669 | |
670 | sub def_an |
671 | { |
672 | unshift @A_a_user_defined, checkpat(@_,'an'); |
673 | return 1; |
674 | } |
675 | |
676 | sub ud_match |
677 | { |
678 | my $word = shift; |
679 | for (my $i=0; $i < @_; $i+=2) |
680 | { |
681 | if ($word =~ /^(?:$_[$i])$/i) |
682 | { |
683 | last unless defined $_[$i+1]; |
684 | return eval '"'.$_[$i+1].'"'; |
685 | } |
686 | } |
687 | return undef; |
688 | } |
689 | |
690 | do |
691 | { |
692 | local $SIG{__WARN__} = sub {0}; |
693 | my $rcfile; |
694 | |
695 | $rcfile = $INC{'Lingua//EN/Inflect.pm'} || ''; |
696 | $rcfile =~ s/Inflect.pm$/.inflectrc/; |
697 | do $rcfile or die "\nBad .inflectrc file ($rcfile):\n\t$@\n" |
698 | if $rcfile && -r $rcfile && -s $rcfile; |
699 | |
700 | $rcfile = "$ENV{HOME}/.inflectrc" || ''; |
701 | do $rcfile or die "\nBad .inflectrc file ($rcfile):\n\t$@\n" |
702 | if $rcfile && -r $rcfile && -s $rcfile; |
703 | }; |
704 | |
705 | sub postprocess # FIX PEDANTRY AND CAPITALIZATION :-) |
706 | { |
707 | my ($orig, $inflected) = @_; |
708 | $inflected =~ s/([^|]+)\|(.+)/ $classical{all}?$2:$1 /e; |
709 | return $orig =~ /^I$/ ? $inflected |
710 | : $orig =~ /^[A-Z]+$/ ? uc $inflected |
711 | : $orig =~ /^[A-Z]/ ? ucfirst $inflected |
712 | : $inflected; |
713 | } |
714 | |
715 | sub PL |
716 | # PL($word,$number) |
717 | { |
718 | my ($str, $count) = @_; |
719 | my ($pre, $word, $post) = ($str =~ m/\A(\s*)(.+?)(\s*)\Z/); |
720 | return $str unless $word; |
721 | my $plural = postprocess $word, _PL_special_adjective($word,$count) |
722 | || _PL_special_verb($word,$count) |
723 | || _PL_noun($word,$count); |
724 | return $pre.$plural.$post; |
725 | } |
726 | |
727 | sub PL_N |
728 | # PL_N($word,$number) |
729 | { |
730 | my ($str, $count) = @_; |
731 | my ($pre, $word, $post) = ($str =~ m/\A(\s*)(.+?)(\s*)\Z/); |
732 | return $str unless $word; |
733 | my $plural = postprocess $word, _PL_noun($word,$count); |
734 | return $pre.$plural.$post; |
735 | } |
736 | |
737 | sub PL_V |
738 | # PL_V($word,$number) |
739 | { |
740 | my ($str, $count) = @_; |
741 | my ($pre, $word, $post) = ($str =~ m/\A(\s*)(.+?)(\s*)\Z/); |
742 | return $str unless $word; |
743 | my $plural = postprocess $word, _PL_special_verb($word,$count) |
744 | || _PL_general_verb($word,$count); |
745 | return $pre.$plural.$post; |
746 | } |
747 | |
748 | sub PL_ADJ |
749 | # PL_ADJ($word,$number) |
750 | { |
751 | my ($str, $count) = @_; |
752 | my ($pre, $word, $post) = ($str =~ m/\A(\s*)(.+?)(\s*)\Z/); |
753 | return $str unless $word; |
754 | my $plural = postprocess $word, _PL_special_adjective($word,$count) |
755 | || $word; |
756 | return $pre.$plural.$post; |
757 | } |
758 | |
759 | sub PL_eq { _PL_eq(@_, \&PL); } |
760 | sub PL_N_eq { _PL_eq(@_, \&PL_N); } |
761 | sub PL_V_eq { _PL_eq(@_, \&PL_V); } |
762 | sub PL_ADJ_eq { _PL_eq(@_, \&PL_ADJ); } |
763 | |
764 | sub _PL_eq |
765 | { |
766 | my ( $word1, $word2, $PL ) = @_; |
767 | my %classval = %classical; |
768 | %classical = %all_classical; |
769 | my $result = ""; |
770 | $result = "eq" if !$result && $word1 eq $word2; |
771 | $result = "p:s" if !$result && $word1 eq &$PL($word2); |
772 | $result = "s:p" if !$result && &$PL($word1) eq $word2; |
773 | %classical = (); |
774 | $result = "p:s" if !$result && $word1 eq &$PL($word2); |
775 | $result = "s:p" if !$result && &$PL($word1) eq $word2; |
776 | %classical = %classval; |
777 | |
778 | if ($PL == \&PL || $PL == \&PL_N) |
779 | { |
780 | $result = "p:p" |
781 | if !$result && _PL_check_plurals_N($word1,$word2); |
782 | $result = "p:p" |
783 | if !$result && _PL_check_plurals_N($word2,$word1); |
784 | } |
785 | if ($PL == \&PL || $PL == \&PL_ADJ) |
786 | { |
787 | $result = "p:p" |
788 | if !$result && _PL_check_plurals_ADJ($word1,$word2,$PL); |
789 | } |
790 | |
791 | return $result; |
792 | } |
793 | |
794 | sub _PL_reg_plurals |
795 | { |
796 | $_[0] =~ /($_[1])($_[2]\|\1$_[3]|$_[3]\|\1$_[2])/ |
797 | } |
798 | |
799 | sub _PL_check_plurals_N |
800 | { |
801 | my $pair = "$_[0]|$_[1]"; |
802 | foreach ( values %PL_sb_irregular_s ) { return 1 if $_ eq $pair; } |
803 | foreach ( values %PL_sb_irregular ) { return 1 if $_ eq $pair; } |
804 | |
805 | return 1 if _PL_reg_plurals($pair, $PL_sb_C_a_ata, "as","ata") |
806 | || _PL_reg_plurals($pair, $PL_sb_C_is_ides, "is","ides") |
807 | || _PL_reg_plurals($pair, $PL_sb_C_a_ae, "s","e") |
808 | || _PL_reg_plurals($pair, $PL_sb_C_en_ina, "ens","ina") |
809 | || _PL_reg_plurals($pair, $PL_sb_C_um_a, "ums","a") |
810 | || _PL_reg_plurals($pair, $PL_sb_C_us_i, "uses","i") |
811 | || _PL_reg_plurals($pair, $PL_sb_C_on_a, "ons","a") |
812 | || _PL_reg_plurals($pair, $PL_sb_C_o_i, "os","i") |
813 | || _PL_reg_plurals($pair, $PL_sb_C_ex_ices, "exes","ices") |
814 | || _PL_reg_plurals($pair, $PL_sb_C_ix_ices, "ixes","ices") |
815 | || _PL_reg_plurals($pair, $PL_sb_C_i, "s","i") |
816 | || _PL_reg_plurals($pair, $PL_sb_C_im, "s","im") |
817 | |
818 | || _PL_reg_plurals($pair, '.*eau', "s","x") |
819 | || _PL_reg_plurals($pair, '.*ieu', "s","x") |
820 | || _PL_reg_plurals($pair, '.*tri', "xes","ces") |
821 | || _PL_reg_plurals($pair, '.{2,}[yia]n', "xes","ges"); |
822 | |
823 | |
824 | return 0; |
825 | } |
826 | |
827 | sub _PL_check_plurals_ADJ |
828 | { |
829 | my ( $word1a, $word2a ) = @_; |
830 | my ( $word1b, $word2b ) = @_; |
831 | |
832 | $word1a = '' unless $word1a =~ s/'s?$//; |
833 | $word2a = '' unless $word2a =~ s/'s?$//; |
834 | $word1b = '' unless $word1b =~ s/s'$//; |
835 | $word2b = '' unless $word2b =~ s/s'$//; |
836 | |
837 | if ($word1a) |
838 | { |
839 | return 1 if $word2a && ( _PL_check_plurals_N($word1a, $word2a) |
840 | || _PL_check_plurals_N($word2a, $word1a) ); |
841 | return 1 if $word2b && ( _PL_check_plurals_N($word1a, $word2b) |
842 | || _PL_check_plurals_N($word2b, $word1a) ); |
843 | } |
844 | if ($word1b) |
845 | { |
846 | return 1 if $word2a && ( _PL_check_plurals_N($word1b, $word2a) |
847 | || _PL_check_plurals_N($word2a, $word1b) ); |
848 | return 1 if $word2b && ( _PL_check_plurals_N($word1b, $word2b) |
849 | || _PL_check_plurals_N($word2b, $word1b) ); |
850 | } |
851 | |
852 | |
853 | return ""; |
854 | } |
855 | |
856 | sub _PL_noun |
857 | { |
858 | my ( $word, $count ) = @_; |
859 | my $value; # UTILITY VARIABLE |
860 | |
861 | # DEFAULT TO PLURAL |
862 | |
863 | $count = $persistent_count |
864 | if !defined($count) && defined($persistent_count); |
865 | |
866 | $count = (defined $count and $count=~/^($PL_count_one)$/io |
867 | or defined $count and $classical{zero} |
868 | and $count=~/^($PL_count_zero)$/io) |
869 | ? 1 |
870 | : 2; |
871 | |
872 | return $word if $count==1; |
873 | |
874 | # HANDLE USER-DEFINED NOUNS |
875 | |
876 | return $value if defined($value = ud_match($word, @PL_sb_user_defined)); |
877 | |
878 | |
879 | # HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS |
880 | |
881 | $word eq '' and return $word; |
882 | |
883 | $word =~ /^($PL_sb_uninflected)$/i |
884 | and return $word; |
885 | |
886 | $classical{herd} and $word =~ /^($PL_sb_uninflected_herd)$/i |
887 | and return $word; |
888 | |
889 | |
890 | # HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.) |
891 | |
892 | $word =~ /^(?:$PL_sb_postfix_adj)$/i |
893 | and $value = $2 |
894 | and return _PL_noun($1,2) |
895 | . $value; |
896 | |
897 | $word =~ /^(?:$PL_sb_prep_dual_compound)$/i |
898 | and $value = [$2,$3] |
899 | and return _PL_noun($1,2) |
900 | . $value->[0] |
901 | . _PL_noun($value->[1]); |
902 | |
903 | $word =~ /^(?:$PL_sb_prep_compound)$/i |
904 | and $value = $2 |
905 | and return _PL_noun($1,2) |
906 | . $value; |
907 | |
908 | # HANDLE PRONOUNS |
909 | |
910 | $word =~ /^((?:$PL_prep)\s+)($PL_pron_acc)$/i |
911 | and return $1.$PL_pron_acc{lc($2)}; |
912 | |
913 | $value = $PL_pron_nom{lc($word)} |
914 | and return $value; |
915 | |
916 | $word =~ /^($PL_pron_acc)$/i |
917 | and return $PL_pron_acc{lc($1)}; |
918 | |
919 | # HANDLE ISOLATED IRREGULAR PLURALS |
920 | |
921 | $word =~ /(.*)\b($PL_sb_irregular)$/i |
922 | and return $1 . $PL_sb_irregular{lc $2}; |
923 | $word =~ /($PL_sb_U_man_mans)$/i |
924 | and return "$1s"; |
925 | $word =~ /(\S*)(person)$/i and return $classical{persons}?"$1persons":"$1people"; |
926 | |
927 | # HANDLE FAMILIES OF IRREGULAR PLURALS |
928 | |
929 | $word =~ /(.*)man$/i and return "$1men"; |
930 | $word =~ /(.*[ml])ouse$/i and return "$1ice"; |
931 | $word =~ /(.*)goose$/i and return "$1geese"; |
932 | $word =~ /(.*)tooth$/i and return "$1teeth"; |
933 | $word =~ /(.*)foot$/i and return "$1feet"; |
934 | |
935 | # HANDLE UNASSIMILATED IMPORTS |
936 | |
937 | $word =~ /(.*)ceps$/i and return $word; |
938 | $word =~ /(.*)zoon$/i and return "$1zoa"; |
939 | $word =~ /(.*[csx])is$/i and return "$1es"; |
940 | $word =~ /($PL_sb_U_ex_ices)ex$/i and return "$1ices"; |
941 | $word =~ /($PL_sb_U_ix_ices)ix$/i and return "$1ices"; |
942 | $word =~ /($PL_sb_U_um_a)um$/i and return "$1a"; |
943 | $word =~ /($PL_sb_U_us_i)us$/i and return "$1i"; |
944 | $word =~ /($PL_sb_U_on_a)on$/i and return "$1a"; |
945 | $word =~ /($PL_sb_U_a_ae)$/i and return "$1e"; |
946 | |
947 | # HANDLE INCOMPLETELY ASSIMILATED IMPORTS |
948 | |
949 | if ($classical{ancient}) |
950 | { |
951 | $word =~ /(.*)trix$/i and return "$1trices"; |
952 | $word =~ /(.*)eau$/i and return "$1eaux"; |
953 | $word =~ /(.*)ieu$/i and return "$1ieux"; |
954 | $word =~ /(.{2,}[yia])nx$/i and return "$1nges"; |
955 | $word =~ /($PL_sb_C_en_ina)en$/i and return "$1ina"; |
956 | $word =~ /($PL_sb_C_ex_ices)ex$/i and return "$1ices"; |
957 | $word =~ /($PL_sb_C_ix_ices)ix$/i and return "$1ices"; |
958 | $word =~ /($PL_sb_C_um_a)um$/i and return "$1a"; |
959 | $word =~ /($PL_sb_C_us_i)us$/i and return "$1i"; |
960 | $word =~ /($PL_sb_C_us_us)$/i and return "$1"; |
961 | $word =~ /($PL_sb_C_a_ae)$/i and return "$1e"; |
962 | $word =~ /($PL_sb_C_a_ata)a$/i and return "$1ata"; |
963 | $word =~ /($PL_sb_C_is_ides)is$/i and return "$1ides"; |
964 | $word =~ /($PL_sb_C_o_i)o$/i and return "$1i"; |
965 | $word =~ /($PL_sb_C_on_a)on$/i and return "$1a"; |
966 | $word =~ /$PL_sb_C_im$/i and return "${word}im"; |
967 | $word =~ /$PL_sb_C_i$/i and return "${word}i"; |
968 | } |
969 | |
970 | |
971 | # HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SILIBANTS |
972 | |
973 | $word =~ /^($PL_sb_singular_s)$/i and return "$1es"; |
974 | $word =~ /^([A-Z].*s)$/ and $classical{names} and return "$1es"; |
975 | $word =~ /(.*)([cs]h|[zx])$/i and return "$1$2es"; |
976 | # $word =~ /(.*)(us)$/i and return "$1$2es"; |
977 | |
978 | # HANDLE ...f -> ...ves |
979 | |
980 | $word =~ /(.*[eao])lf$/i and return "$1lves"; |
981 | $word =~ /(.*[^d])eaf$/i and return "$1eaves"; |
982 | $word =~ /(.*[nlw])ife$/i and return "$1ives"; |
983 | $word =~ /(.*)arf$/i and return "$1arves"; |
984 | |
985 | # HANDLE ...y |
986 | |
987 | $word =~ /(.*[aeiou])y$/i and return "$1ys"; |
988 | $word =~ /([A-Z].*y)$/ and $classical{names} and return "$1s"; |
989 | $word =~ /(.*)y$/i and return "$1ies"; |
990 | |
991 | # HANDLE ...o |
992 | |
993 | $word =~ /$PL_sb_U_o_os$/i and return "${word}s"; |
994 | $word =~ /[aeiou]o$/i and return "${word}s"; |
995 | $word =~ /o$/i and return "${word}es"; |
996 | |
997 | |
998 | # OTHERWISE JUST ADD ...s |
999 | |
1000 | return "${word}s"; |
1001 | } |
1002 | |
1003 | |
1004 | sub _PL_special_verb |
1005 | { |
1006 | my ( $word, $count ) = @_; |
1007 | $count = $persistent_count |
1008 | if !defined($count) && defined($persistent_count); |
1009 | $count = (defined $count and $count=~/^($PL_count_one)$/io or |
1010 | defined $count and $classical{zero} and $count=~/^($PL_count_zero)$/io) ? 1 |
1011 | : 2; |
1012 | |
1013 | return undef if $count=~/^($PL_count_one)$/io; |
1014 | |
1015 | my $value; # UTILITY VARIABLE |
1016 | |
1017 | # HANDLE USER-DEFINED VERBS |
1018 | |
1019 | return $value if defined($value = ud_match($word, @PL_v_user_defined)); |
1020 | |
1021 | # HANDLE IRREGULAR PRESENT TENSE (SIMPLE AND COMPOUND) |
1022 | |
1023 | $word =~ /^($PL_v_irregular_pres)((\s.*)?)$/i |
1024 | and return $PL_v_irregular_pres{lc $1}.$2; |
1025 | |
1026 | # HANDLE IRREGULAR FUTURE, PRETERITE AND PERFECT TENSES |
1027 | |
1028 | $word =~ /^($PL_v_irregular_non_pres)((\s.*)?)$/i |
1029 | and return $word; |
1030 | |
1031 | # HANDLE PRESENT NEGATIONS (SIMPLE AND COMPOUND) |
1032 | |
1033 | $word =~ /^($PL_v_irregular_pres)(n't(\s.*)?)$/i |
1034 | and return $PL_v_irregular_pres{lc $1}.$2; |
1035 | |
1036 | $word =~ /^\S+n't\b/i |
1037 | and return $word; |
1038 | |
1039 | # HANDLE SPECIAL CASES |
1040 | |
1041 | $word =~ /^($PL_v_special_s)$/ and return undef; |
1042 | $word =~ /\s/ and return undef; |
1043 | |
1044 | # HANDLE STANDARD 3RD PERSON (CHOP THE ...(e)s OFF SINGLE WORDS) |
1045 | |
1046 | $word =~ /^(.*)([cs]h|[x]|zz|ss)es$/i and return "$1$2"; |
1047 | |
1048 | $word =~ /^(..+)ies$/i and return "$1y"; |
1049 | |
1050 | $word =~ /($PL_v_oes_oe)$/ and return substr($1,0,-1); |
1051 | $word =~ /^(.+)oes$/i and return "$1o"; |
1052 | |
1053 | $word =~ /^(.*[^s])s$/i and return $1; |
1054 | |
1055 | # OTHERWISE, A REGULAR VERB (HANDLE ELSEWHERE) |
1056 | |
1057 | return undef; |
1058 | } |
1059 | |
1060 | sub _PL_general_verb |
1061 | { |
1062 | my ( $word, $count ) = @_; |
1063 | $count = $persistent_count |
1064 | if !defined($count) && defined($persistent_count); |
1065 | $count = (defined $count and $count=~/^($PL_count_one)$/io or |
1066 | defined $count and $classical{zero} and $count=~/^($PL_count_zero)$/io) ? 1 |
1067 | : 2; |
1068 | |
1069 | return $word if $count=~/^($PL_count_one)$/io; |
1070 | |
1071 | # HANDLE AMBIGUOUS PRESENT TENSES (SIMPLE AND COMPOUND) |
1072 | |
1073 | $word =~ /^($PL_v_ambiguous_pres)((\s.*)?)$/i |
1074 | and return $PL_v_ambiguous_pres{lc $1}.$2; |
1075 | |
1076 | # HANDLE AMBIGUOUS PRETERITE AND PERFECT TENSES |
1077 | |
1078 | $word =~ /^($PL_v_ambiguous_non_pres)((\s.*)?)$/i |
1079 | and return $word; |
1080 | |
1081 | # OTHERWISE, 1st OR 2ND PERSON IS UNINFLECTED |
1082 | |
1083 | return $word; |
1084 | |
1085 | } |
1086 | |
1087 | sub _PL_special_adjective |
1088 | { |
1089 | my ( $word, $count ) = @_; |
1090 | $count = $persistent_count |
1091 | if !defined($count) && defined($persistent_count); |
1092 | $count = (defined $count and $count=~/^($PL_count_one)$/io or |
1093 | defined $count and $classical{zero} and $count=~/^($PL_count_zero)$/io) ? 1 |
1094 | : 2; |
1095 | |
1096 | return $word if $count=~/^($PL_count_one)$/io; |
1097 | |
1098 | |
1099 | # HANDLE USER-DEFINED ADJECTIVES |
1100 | |
1101 | my $value; |
1102 | return $value if defined($value = ud_match($word, @PL_adj_user_defined)); |
1103 | |
1104 | # HANDLE KNOWN CASES |
1105 | |
1106 | $word =~ /^($PL_adj_special)$/i |
1107 | and return $PL_adj_special{lc $1}; |
1108 | |
1109 | # HANDLE POSSESSIVES |
1110 | |
1111 | $word =~ /^($PL_adj_poss)$/i |
1112 | and return $PL_adj_poss{lc $1}; |
1113 | |
1114 | $word =~ /^(.*)'s?$/ and do { my $pl = PL_N($1); |
1115 | return "$pl'" . ($pl =~ m/s$/ ? "" : "s"); |
1116 | }; |
1117 | |
1118 | # OTHERWISE, NO IDEA |
1119 | |
1120 | return undef; |
1121 | |
1122 | } |
1123 | |
1124 | |
1125 | # 2. INDEFINITE ARTICLES |
1126 | |
1127 | # THIS PATTERN MATCHES STRINGS OF CAPITALS STARTING WITH A "VOWEL-SOUND" |
1128 | # CONSONANT FOLLOWED BY ANOTHER CONSONANT, AND WHICH ARE NOT LIKELY |
1129 | # TO BE REAL WORDS (OH, ALL RIGHT THEN, IT'S JUST MAGIC!) |
1130 | |
1131 | my $A_abbrev = q{ |
1132 | (?! FJO | [HLMNS]Y. | RY[EO] | SQU |
1133 | | ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU]) |
1134 | [FHLMNRSX][A-Z] |
1135 | }; |
1136 | |
1137 | # THIS PATTERN CODES THE BEGINNINGS OF ALL ENGLISH WORDS BEGINING WITH A |
1138 | # 'y' FOLLOWED BY A CONSONANT. ANY OTHER Y-CONSONANT PREFIX THEREFORE |
1139 | # IMPLIES AN ABBREVIATION. |
1140 | |
1141 | my $A_y_cons = 'y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)'; |
1142 | |
1143 | # EXCEPTIONS TO EXCEPTIONS |
1144 | |
1145 | my $A_explicit_an = enclose join '|', |
1146 | ( |
1147 | "euler", |
1148 | "hour(?!i)", "heir", "honest", "hono", |
1149 | ); |
1150 | |
1151 | sub A |
1152 | { |
1153 | my ($str, $count) = @_; |
1154 | my ($pre, $word, $post) = ( $str =~ m/\A(\s*)(?:an?\s+)?(.+?)(\s*)\Z/i ); |
1155 | return $str unless $word; |
1156 | my $result = _indef_article($word,$count); |
1157 | return $pre.$result.$post; |
1158 | } |
1159 | |
1160 | sub AN { goto &A } |
1161 | |
1162 | sub _indef_article |
1163 | { |
1164 | my ( $word, $count ) = @_; |
1165 | |
1166 | $count = $persistent_count |
1167 | if !defined($count) && defined($persistent_count); |
1168 | |
1169 | return "$count $word" |
1170 | if defined $count && $count!~/^($PL_count_one)$/io; |
1171 | |
1172 | # HANDLE USER-DEFINED VARIANTS |
1173 | |
1174 | my $value; |
1175 | return $value if defined($value = ud_match($word, @A_a_user_defined)); |
1176 | |
1177 | # HANDLE SPECIAL CASES |
1178 | |
1179 | $word =~ /^($A_explicit_an)/i and return "an $word"; |
1180 | |
1181 | # HANDLE ABBREVIATIONS |
1182 | |
1183 | $word =~ /^($A_abbrev)/ox and return "an $word"; |
1184 | $word =~ /^[aefhilmnorsx][.-]/i and return "an $word"; |
1185 | $word =~ /^[a-z][.-]/i and return "a $word"; |
1186 | |
1187 | # HANDLE CONSONANTS |
1188 | |
1189 | $word =~ /^[^aeiouy]/i and return "a $word"; |
1190 | |
1191 | # HANDLE SPECIAL VOWEL-FORMS |
1192 | |
1193 | $word =~ /^e[uw]/i and return "a $word"; |
1194 | $word =~ /^onc?e\b/i and return "a $word"; |
1195 | $word =~ /^uni([^nmd]|mo)/i and return "a $word"; |
1196 | $word =~ /^u[bcfhjkqrst][aeiou]/i and return "a $word"; |
1197 | |
1198 | # HANDLE SPECIAL CAPITALS |
1199 | |
1200 | $word =~ /^U[NK][AIEO]?/ and return "a $word"; |
1201 | |
1202 | # HANDLE VOWELS |
1203 | |
1204 | $word =~ /^[aeiou]/i and return "an $word"; |
1205 | |
1206 | # HANDLE y... (BEFORE CERTAIN CONSONANTS IMPLIES (UNNATURALIZED) "i.." SOUND) |
1207 | |
1208 | $word =~ /^($A_y_cons)/io and return "an $word"; |
1209 | |
1210 | # OTHERWISE, GUESS "a" |
1211 | return "a $word"; |
1212 | } |
1213 | |
1214 | # 2. TRANSLATE ZERO-QUANTIFIED $word TO "no PL($word)" |
1215 | |
1216 | sub NO |
1217 | { |
1218 | my ($str, $count) = @_; |
1219 | my ($pre, $word, $post) = ($str =~ m/\A(\s*)(.+?)(\s*)\Z/); |
1220 | |
1221 | $count = $persistent_count |
1222 | if !defined($count) && defined($persistent_count); |
1223 | $count = 0 unless $count; |
1224 | |
1225 | return "$pre$count " . PL($word,$count) . $post |
1226 | unless $count =~ /^$PL_count_zero$/; |
1227 | return "${pre}no ". PL($word,0) . $post ; |
1228 | } |
1229 | |
1230 | |
1231 | # PARTICIPLES |
1232 | |
1233 | sub PART_PRES |
1234 | { |
1235 | local $_ = PL_V(shift,2); |
1236 | s/ie$/y/ |
1237 | or s/ue$/u/ |
1238 | or s/([auy])e$/$1/ |
1239 | or s/ski$/ski/ |
1240 | or s/i$// |
1241 | or s/([^e])e$/$1/ |
1242 | or m/er$/ |
1243 | or s/([^aeiou][aeiouy]([bdgmnprst]))$/$1$2/; |
1244 | return "${_}ing"; |
1245 | } |
1246 | |
1247 | |
1248 | |
1249 | # NUMERICAL INFLECTIONS |
1250 | |
1251 | my %nth = |
1252 | ( |
1253 | 0 => 'th', |
1254 | 1 => 'st', |
1255 | 2 => 'nd', |
1256 | 3 => 'rd', |
1257 | 4 => 'th', |
1258 | 5 => 'th', |
1259 | 6 => 'th', |
1260 | 7 => 'th', |
1261 | 8 => 'th', |
1262 | 9 => 'th', |
1263 | 11 => 'th', |
1264 | 12 => 'th', |
1265 | 13 => 'th', |
1266 | ); |
1267 | |
1268 | |
1269 | my %ordinal; |
1270 | @ordinal{qw(ty one two three five eight nine twelve )}= |
1271 | qw(tieth first second third fifth eighth ninth twelfth); |
1272 | |
1273 | my $ordinal_suff = join '|', keys %ordinal, ""; |
1274 | |
1275 | $ordinal{""} = 'th'; |
1276 | |
1277 | sub ORD($) |
1278 | { |
1279 | my $num = shift; |
1280 | if ($num =~ /\d/) { |
1281 | return $num . ($nth{$num%100} || $nth{$num%10}); |
1282 | } |
1283 | else { |
1284 | $num =~ s/($ordinal_suff)\Z/$ordinal{$1}/; |
1285 | return $num; |
1286 | } |
1287 | } |
1288 | |
1289 | |
1290 | my %default_args = |
1291 | ( |
1292 | 'group' => 0, |
1293 | 'comma' => ',', |
1294 | 'and' => 'and', |
1295 | 'zero' => 'zero', |
1296 | 'one' => 'one', |
1297 | 'decimal' => 'point', |
1298 | ); |
1299 | |
1300 | my @unit = ('',qw(one two three four five six seven eight nine)); |
1301 | my @teen = qw(ten eleven twelve thirteen fourteen |
1302 | fifteen sixteen seventeen eighteen nineteen); |
1303 | my @ten = ('','',qw(twenty thirty forty fifty sixty seventy eighty ninety)); |
1304 | my @mill = map { (my $val=$_) =~ s/_/illion/; " $val" } |
1305 | ('',qw(thousand m_ b_ tr_ quadr_ quint_ sext_ sept_ oct_ non_ dec_)); |
1306 | |
1307 | |
1308 | sub mill { my $ind = $_[0]||0; |
1309 | die "Number out of range\n" if $ind > $#mill; |
1310 | return $ind<@mill ? $mill[$ind] : ' ???illion'; } |
1311 | |
1312 | sub unit { return $unit[$_[0]]. mill($_[1]); } |
1313 | |
1314 | sub ten |
1315 | { |
1316 | return $ten[$_[0]] . ($_[0]&&$_[1]?'-':'') . $unit[$_[1]] . mill($_[2]) |
1317 | if $_[0] ne '1'; |
1318 | return $teen[$_[1]]. $mill[$_[2]||0]; |
1319 | } |
1320 | |
1321 | sub hund |
1322 | { |
1323 | return unit($_[0]) . " hundred" . ($_[1] || $_[2] ? " $_[4] " : '') |
1324 | . ten($_[1],$_[2]) . mill($_[3]) . ', ' if $_[0]; |
1325 | return ten($_[1],$_[2]) . mill($_[3]) . ', ' if $_[1] || $_[2]; |
1326 | return ''; |
1327 | } |
1328 | |
1329 | |
1330 | sub enword |
1331 | { |
1332 | my ($num,$group,$zero,$one,$comma,$and) = @_; |
1333 | |
1334 | if ($group==1) |
1335 | { |
1336 | $num =~ s/(\d)/ ($1==1 ? " $one" : $1 ? unit($1) :" $zero")."$comma " /eg; |
1337 | } |
1338 | elsif ($group==2) |
1339 | { |
1340 | $num =~ s/(\d)(\d)/ ($1 ? ten($1,$2) : $2 ? " $zero " . unit($2) : " $zero $zero") . "$comma " /eg; |
1341 | $num =~ s/(\d)/ ($1 ? unit($1) :" $zero")."$comma " /e; |
1342 | } |
1343 | elsif ($group==3) |
1344 | { |
1345 | $num =~ s/(\d)(\d)(\d)/ ($1==1 ? " $one" : $1 ? unit($1) :" $zero")." ".($2 ? ten($2,$3) : $3 ? " $zero " . unit($3) : " $zero $zero") . "$comma " /eg; |
1346 | $num =~ s/(\d)(\d)/ ($1 ? ten($1,$2) : $2 ? " $zero " . unit($2) : " $zero $zero") . "$comma " /e; |
1347 | $num =~ s/(\d)/ ($1==1 ? " $one" : $1 ? unit($1) :" $zero")."$comma " /e; |
1348 | } |
1349 | elsif ($num+0==0) { |
1350 | $num = $zero; |
1351 | } |
1352 | elsif ($num+0==1) { |
1353 | $num = $one; |
1354 | } |
1355 | else { |
1356 | $num =~ s/\A\s*0+//; |
1357 | my $mill = 0; |
1358 | 1 while $num =~ s/(\d)(\d)(\d)(?=\D*\Z)/ hund($1,$2,$3,$mill++,$and) /e; |
1359 | $num =~ s/(\d)(\d)(?=\D*\Z)/ ten($1,$2,$mill)."$comma " /e; |
1360 | $num =~ s/(\d)(?=\D*\Z)/ unit($1,$mill) . "$comma "/e; |
1361 | } |
1362 | return $num; |
1363 | } |
1364 | |
1365 | sub NUMWORDS |
1366 | { |
1367 | my $num = shift; |
1368 | my %arg = ( %default_args, @_ ); |
1369 | my $group = $arg{group}; |
1370 | |
1371 | die "Bad chunking option: $group\n" unless $group =~ /\A[0-3]\Z/; |
1372 | my $sign = ($num =~ /\A\s*\+/) ? "plus" |
1373 | : ($num =~ /\A\s*\-/) ? "minus" |
1374 | : ''; |
1375 | |
1376 | my ($zero, $one) = @arg{'zero','one'}; |
1377 | my $comma = $arg{comma}; |
1378 | my $and = $arg{'and'}; |
1379 | |
1380 | my $ord = $num =~ s/(st|nd|rd|th)\Z//; |
1381 | my @chunks = ($arg{decimal}) |
1382 | ? $group ? split(/\./, $num) : split(/\./, $num, 2) |
1383 | : ($num); |
1384 | |
1385 | my $first = 1; |
1386 | |
1387 | if ($chunks[0] eq '') { $first=0; shift @chunks; } |
1388 | |
1389 | foreach ( @chunks ) |
1390 | { |
1391 | s/\D//g; |
1392 | $_ = '0' unless $_; |
1393 | |
1394 | if (!$group && !$first) { $_ = enword($_,1,$zero,$one,$comma,$and) } |
1395 | else { $_ = enword($_,$group,$zero,$one,$comma,$and) } |
1396 | |
1397 | s/, \Z//; |
1398 | s/\s+,/,/g; |
1399 | s/, (\S+)\s+\Z/ $and $1/ if !$group and $first; |
1400 | s/\s+/ /g; |
1401 | s/(\A\s|\s\Z)//g; |
1402 | $first = '' if $first; |
1403 | } |
1404 | |
1405 | my @numchunks = (); |
1406 | if ($first =~ /0/) |
1407 | { |
1408 | unshift @chunks, ''; |
1409 | } |
1410 | else |
1411 | { |
1412 | @numchunks = split /\Q$comma /, $chunks[0]; |
1413 | } |
1414 | |
1415 | $numchunks[-1] =~ s/($ordinal_suff)\Z/$ordinal{$1}/ |
1416 | if $ord and @numchunks; |
1417 | |
1418 | foreach (@chunks[1..$#chunks]) |
1419 | { |
1420 | push @numchunks, $arg{decimal}; |
1421 | push @numchunks, split /\Q$comma /; |
1422 | } |
1423 | |
1424 | if (wantarray) |
1425 | { |
1426 | unshift @numchunks, $sign if $sign; |
1427 | return @numchunks |
1428 | } |
1429 | elsif ($group) |
1430 | { |
1431 | return ($sign?"$sign ":'') . join ", ", @numchunks; |
1432 | } |
1433 | else |
1434 | { |
1435 | $num = ($sign?"$sign ":'') . shift @numchunks; |
1436 | $first = ($num !~ /$arg{decimal}\Z/); |
1437 | foreach ( @numchunks ) |
1438 | { |
1439 | if (/\A$arg{decimal}\Z/) |
1440 | { |
1441 | $num .= " $_"; |
1442 | $first = 0; |
1443 | } |
1444 | elsif ($first) |
1445 | { |
1446 | $num .= "$comma $_"; |
1447 | } |
1448 | else |
1449 | { |
1450 | $num .= " $_"; |
1451 | } |
1452 | } |
1453 | return $num; |
1454 | } |
1455 | } |
1456 | |
1457 | 1; |
1458 | |
1459 | __END__ |
1460 | |
1461 | =head1 NAME |
1462 | |
1463 | Lingua::EN::Inflect - Convert singular to plural. Select "a" or "an". |
1464 | |
1465 | =head1 VERSION |
1466 | |
1467 | This document describes version 1.86 of Lingua::EN::Inflect, |
1468 | released October 20, 2000. |
1469 | |
1470 | =head1 SYNOPSIS |
1471 | |
1472 | use Lingua::EN::Inflect qw ( PL PL_N PL_V PL_ADJ NO NUM |
1473 | PL_eq PL_N_eq PL_V_eq PL_ADJ_eq |
1474 | A AN |
1475 | PART_PRES |
1476 | ORD NUMWORDS |
1477 | inflect classical |
1478 | def_noun def_verb def_adj def_a def_an ); |
1479 | |
1480 | |
1481 | # UNCONDITIONALLY FORM THE PLURAL |
1482 | |
1483 | print "The plural of ", $word, " is ", PL($word), "\n"; |
1484 | |
1485 | |
1486 | # CONDITIONALLY FORM THE PLURAL |
1487 | |
1488 | print "I saw $cat_count ", PL("cat",$cat_count), "\n"; |
1489 | |
1490 | |
1491 | # FORM PLURALS FOR SPECIFIC PARTS OF SPEECH |
1492 | |
1493 | print PL_N("I",$N1), PL_V("saw",$N1), |
1494 | PL_ADJ("my",$N2), PL_N("saw",$N2), "\n"; |
1495 | |
1496 | |
1497 | # DEAL WITH "0/1/N" -> "no/1/N" TRANSLATION: |
1498 | |
1499 | print "There ", PL_V("was",$errors), NO(" error",$errors), "\n"; |
1500 | |
1501 | |
1502 | # USE DEFAULT COUNTS: |
1503 | |
1504 | print NUM($N1,""), PL("I"), PL_V(" saw"), NUM($N2), PL_N(" saw"); |
1505 | print "There ", NUM($errors,''), PL_V("was"), NO(" error"), "\n"; |
1506 | |
1507 | |
1508 | # COMPARE TWO WORDS "NUMBER-INSENSITIVELY": |
1509 | |
1510 | print "same\n" if PL_eq($word1, $word2); |
1511 | print "same noun\n" if PL_eq_N($word1, $word2); |
1512 | print "same verb\n" if PL_eq_V($word1, $word2); |
1513 | print "same adj.\n" if PL_eq_ADJ($word1, $word2); |
1514 | |
1515 | |
1516 | # ADD CORRECT "a" OR "an" FOR A GIVEN WORD: |
1517 | |
1518 | print "Did you want ", A($thing), " or ", AN($idea), "\n"; |
1519 | |
1520 | |
1521 | # CONVERT NUMERALS INTO ORDINALS (i.e. 1->1st, 2->2nd, 3->3rd, etc.) |
1522 | |
1523 | print "It was", ORD($position), " from the left\n"; |
1524 | |
1525 | # CONVERT NUMERALS TO WORDS (i.e. 1->"one", 101->"one hundred and one", etc.) |
1526 | # IN A SCALAR CONTEXT: GET BACK A SINGLE STRING... |
1527 | |
1528 | $words = NUMWORDS(1234); # "one thousand, two hundred and thirty-four" |
1529 | $words = NUMWORDS(ORD(1234)); # "one thousand, two hundred and thirty-fourth" |
1530 | |
1531 | |
1532 | # IN A LIST CONTEXT: GET BACK A LIST OF STRINGSi, ONE FOR EACH "CHUNK"... |
1533 | |
1534 | @words = NUMWORDS(1234); # ("one thousand","two hundred and thirty-four") |
1535 | |
1536 | |
1537 | # OPTIONAL PARAMETERS CHANGE TRANSLATION: |
1538 | |
1539 | $words = NUMWORDS(12345, group=>1); |
1540 | # "one, two, three, four, five" |
1541 | |
1542 | $words = NUMWORDS(12345, group=>2); |
1543 | # "twelve, thirty-four, five" |
1544 | |
1545 | $words = NUMWORDS(12345, group=>3); |
1546 | # "one twenty-three, forty-five" |
1547 | |
1548 | $words = NUMWORDS(1234, 'and'=>''); |
1549 | # "one thousand, two hundred thirty-four" |
1550 | |
1551 | $words = NUMWORDS(1234, 'and'=>', plus'); |
1552 | # "one thousand, two hundred, plus thirty-four" |
1553 | |
1554 | $words = NUMWORDS(555_1202, group=>1, zero=>'oh'); |
1555 | # "five, five, five, one, two, oh, two" |
1556 | |
1557 | $words = NUMWORDS(555_1202, group=>1, one=>'unity'); |
1558 | # "five, five, five, unity, two, oh, two" |
1559 | |
1560 | $words = NUMWORDS(123.456, group=>1, decimal=>'mark'); |
1561 | # "one two three mark four five six" |
1562 | |
1563 | |
1564 | # REQUIRE "CLASSICAL" PLURALS (EG: "focus"->"foci", "cherub"->"cherubim") |
1565 | |
1566 | classical; # USE ALL CLASSICAL PLURALS |
1567 | |
1568 | classical 1; # USE ALL CLASSICAL PLURALS |
1569 | classical 0; # USE ALL MODERN PLURALS (DEFAULT) |
1570 | |
1571 | classical 'zero'; # "no error" INSTEAD OF "no errors" |
1572 | classical zero=>1; # "no error" INSTEAD OF "no errors" |
1573 | classical zero=>0; # "no errors" INSTEAD OF "no error" |
1574 | |
1575 | classical 'herd'; # "2 buffalo" INSTEAD OF "2 buffalos" |
1576 | classical herd=>1; # "2 buffalo" INSTEAD OF "2 buffalos" |
1577 | classical herd=>0; # "2 buffalos" INSTEAD OF "2 buffalo" |
1578 | |
1579 | classical 'persons'; # "2 chairpersons" INSTEAD OF "2 chairpeople" |
1580 | classical persons=>1; # "2 chairpersons" INSTEAD OF "2 chairpeople" |
1581 | classical persons=>0; # "2 chairpeople" INSTEAD OF "2 chairpersons" |
1582 | |
1583 | classical 'ancient'; # "2 formulae" INSTEAD OF "2 formulas" |
1584 | classical ancient=>1; # "2 formulae" INSTEAD OF "2 formulas" |
1585 | classical ancient=>0; # "2 formulas" INSTEAD OF "2 formulae" |
1586 | |
1587 | |
1588 | |
1589 | # INTERPOLATE "PL()", "PL_N()", "PL_V()", "PL_ADJ()", A()", "AN()" |
1590 | # "NUM()" AND "ORD()" WITHIN STRINGS: |
1591 | |
1592 | print inflect("The plural of $word is PL($word)\n"); |
1593 | print inflect("I saw $cat_count PL("cat",$cat_count)\n"); |
1594 | print inflect("PL(I,$N1) PL_V(saw,$N1) PL(a,$N2) PL_N(saw,$N2)"); |
1595 | print inflect("NUM($N1,)PL(I) PL_V(saw) NUM($N2,)PL(a) PL_N(saw)"); |
1596 | print inflect("I saw NUM($cat_count) PL("cat")\nNUM()"); |
1597 | print inflect("There PL_V(was,$errors) NO(error,$errors)\n"); |
1598 | print inflect("There NUM($errors,) PL_V(was) NO(error)\n"; |
1599 | print inflect("Did you want A($thing) or AN($idea)\n"); |
1600 | print inflect("It was ORD($position) from the left\n"); |
1601 | |
1602 | |
1603 | # ADD USER-DEFINED INFLECTIONS (OVERRIDING INBUILT RULES): |
1604 | |
1605 | def_noun "VAX" => "VAXen"; # SINGULAR => PLURAL |
1606 | |
1607 | def_verb "will" => "shall", # 1ST PERSON SINGULAR => PLURAL |
1608 | "will" => "will", # 2ND PERSON SINGULAR => PLURAL |
1609 | "will" => "will", # 3RD PERSON SINGULAR => PLURAL |
1610 | |
1611 | def_adj "hir" => "their", # SINGULAR => PLURAL |
1612 | |
1613 | def_a "h" # "AY HALWAYS SEZ 'HAITCH'!" |
1614 | |
1615 | def_an "horrendous.*" # "AN HORRENDOUS AFFECTATION" |
1616 | |
1617 | |
1618 | =head1 DESCRIPTION |
1619 | |
1620 | The exportable subroutines of Lingua::EN::Inflect provide plural |
1621 | inflections, "a"/"an" selection for English words, and manipulation |
1622 | of numbers as words |
1623 | |
1624 | Plural forms of all nouns, most verbs, and some adjectives are |
1625 | provided. Where appropriate, "classical" variants (for example: "brother" -> |
1626 | "brethren", "dogma" -> "dogmata", etc.) are also provided. |
1627 | |
1628 | Pronunciation-based "a"/"an" selection is provided for all English |
1629 | words, and most initialisms. |
1630 | |
1631 | It is also possible to inflect numerals (1,2,3) to ordinals (1st, 2nd, 3rd) |
1632 | and to english words ("one", "two", "three). |
1633 | |
1634 | In generating these inflections, Lingua::EN::Inflect follows the Oxford |
1635 | English Dictionary and the guidelines in Fowler's Modern English |
1636 | Usage, preferring the former where the two disagree. |
1637 | |
1638 | The module is built around standard British spelling, but is designed |
1639 | to cope with common American variants as well. Slang, jargon, and |
1640 | other English dialects are I<not> explicitly catered for. |
1641 | |
1642 | Where two or more inflected forms exist for a single word (typically a |
1643 | "classical" form and a "modern" form), Lingua::EN::Inflect prefers the |
1644 | more common form (typically the "modern" one), unless "classical" |
1645 | processing has been specified |
1646 | (see L<"MODERN VS CLASSICAL INFLECTIONS">). |
1647 | |
1648 | =head1 FORMING PLURALS |
1649 | |
1650 | =head2 Inflecting Plurals |
1651 | |
1652 | All of the C<PL_...> plural inflection subroutines take the word to be |
1653 | inflected as their first argument and return the corresponding inflection. |
1654 | Note that all such subroutines expect the I<singular> form of the word. The |
1655 | results of passing a plural form are undefined (and unlikely to be correct). |
1656 | |
1657 | The C<PL_...> subroutines also take an optional second argument, |
1658 | which indicates the grammatical "number" of the word (or of another word |
1659 | with which the word being inflected must agree). If the "number" argument is |
1660 | supplied and is not C<1> (or C<"one"> or C<"a">, or some other adjective that |
1661 | implies the singular), the plural form of the word is returned. If the |
1662 | "number" argument I<does> indicate singularity, the (uninflected) word |
1663 | itself is returned. If the number argument is omitted, the plural form |
1664 | is returned unconditionally. |
1665 | |
1666 | The various subroutines are: |
1667 | |
1668 | =over 8 |
1669 | |
1670 | =item C<PL_N($;$)> |
1671 | |
1672 | The exportable subroutine C<PL_N()> takes a I<singular> English noun or |
1673 | pronoun and returns its plural. Pronouns in the nominative ("I" -> |
1674 | "we") and accusative ("me" -> "us") cases are handled, as are |
1675 | possessive pronouns ("mine" -> "ours"). |
1676 | |
1677 | |
1678 | =item C<PL_V($;$)> |
1679 | |
1680 | The exportable subroutine C<PL_V()> takes the I<singular> form of a |
1681 | conjugated verb (that is, one which is already in the correct "person" |
1682 | and "mood") and returns the corresponding plural conjugation. |
1683 | |
1684 | |
1685 | =item C<PL_ADJ($;$)> |
1686 | |
1687 | The exportable subroutine C<PL_ADJ()> takes the I<singular> form of |
1688 | certain types of adjectives and returns the corresponding plural form. |
1689 | Adjectives that are correctly handled include: "numerical" adjectives |
1690 | ("a" -> "some"), demonstrative adjectives ("this" -> "these", "that" -> |
1691 | "those"), and possessives ("my" -> "our", "cat's" -> "cats'", "child's" |
1692 | -> "childrens'", etc.) |
1693 | |
1694 | |
1695 | =item C<PL($;$)> |
1696 | |
1697 | The exportable subroutine C<PL()> takes a I<singular> English noun, |
1698 | pronoun, verb, or adjective and returns its plural form. Where a word |
1699 | has more than one inflection depending on its part of speech (for |
1700 | example, the noun "thought" inflects to "thoughts", the verb "thought" |
1701 | to "thought"), the (singular) noun sense is preferred to the (singular) |
1702 | verb sense. |
1703 | |
1704 | Hence C<PL("knife")> will return "knives" ("knife" having been treated |
1705 | as a singular noun), whereas C<PL("knifes")> will return "knife" |
1706 | ("knifes" having been treated as a 3rd person singular verb). |
1707 | |
1708 | The inherent ambiguity of such cases suggests that, |
1709 | where the part of speech is known, C<PL_N>, C<PL_V>, and |
1710 | C<PL_ADJ> should be used in preference to C<PL>. |
1711 | |
1712 | =back |
1713 | |
1714 | Note that all these subroutines ignore any whitespace surrounding the |
1715 | word being inflected, but preserve that whitespace when the result is |
1716 | returned. For example, C<S<PL(" cat ")>> returns S<" cats ">. |
1717 | |
1718 | |
1719 | =head2 Numbered plurals |
1720 | |
1721 | The C<PL_...> subroutines return only the inflected word, not the count that |
1722 | was used to inflect it. Thus, in order to produce "I saw 3 ducks", it |
1723 | is necessary to use: |
1724 | |
1725 | print "I saw $N ", PL_N($animal,$N), "\n"; |
1726 | |
1727 | Since the usual purpose of producing a plural is to make it agree with |
1728 | a preceding count, Lingua::EN::Inflect provides an exportable subroutine |
1729 | (C<NO($;$)>) which, given a word and a(n optional) count, returns the |
1730 | count followed by the correctly inflected word. Hence the previous |
1731 | example can be rewritten: |
1732 | |
1733 | print "I saw ", NO($animal,$N), "\n"; |
1734 | |
1735 | In addition, if the count is zero (or some other term which implies |
1736 | zero, such as C<"zero">, C<"nil">, etc.) the count is replaced by the |
1737 | word "no". Hence, if C<$N> had the value zero, the previous example |
1738 | would print the somewhat more elegant: |
1739 | |
1740 | I saw no animals |
1741 | |
1742 | rather than: |
1743 | |
1744 | I saw 0 animals |
1745 | |
1746 | Note that the name of the subroutine is a pun: the subroutine |
1747 | returns either a number (a I<No.>) or a C<"no">, in front of the |
1748 | inflected word. |
1749 | |
1750 | |
1751 | =head2 Reducing the number of counts required |
1752 | |
1753 | In some contexts, the need to supply an explicit count to the various |
1754 | C<PL_...> subroutines makes for tiresome repetition. For example: |
1755 | |
1756 | print PL_ADJ("This",$errors), PL_N(" error",$errors), |
1757 | PL_V(" was",$errors), " fatal.\n"; |
1758 | |
1759 | Lingua::EN::Inflect therefore provides an exportable subroutine |
1760 | (C<NUM($;$)>) which may be used to set a persistent "default number" |
1761 | value. If such a value is set, it is subsequently used whenever an |
1762 | optional second "number" argument is omitted. The default value thus set |
1763 | can subsequently be removed by calling C<NUM()> with no arguments. |
1764 | Hence we could rewrite the previous example: |
1765 | |
1766 | NUM($errors); |
1767 | print PL_ADJ("This"), PL_N(" error"), PL_V(" was"), "fatal.\n"; |
1768 | NUM(); |
1769 | |
1770 | Normally, C<NUM()> returns its first argument, so that it may also |
1771 | be "inlined" in contexts like: |
1772 | |
1773 | print NUM($errors), PL_N(" error"), PL_V(" was"), " detected.\n" |
1774 | print PL_ADJ("This"), PL_N(" error"), PL_V(" was"), "fatal.\n" |
1775 | if $severity > 1; |
1776 | |
1777 | However, in certain contexts (see L<"INTERPOLATING INFLECTIONS IN STRINGS">) |
1778 | it is preferable that C<NUM()> return an empty string. Hence C<NUM()> |
1779 | provides an optional second argument. If that argument is supplied (that is, if |
1780 | it is defined) and evaluates to false, C<NUM> returns an empty string |
1781 | instead of its first argument. For example: |
1782 | |
1783 | print NUM($errors,0), NO("error"), PL_V(" was"), " detected.\n"; |
1784 | print PL_ADJ("This"), PL_N(" error"), PL_V(" was"), "fatal.\n" |
1785 | if $severity > 1; |
1786 | |
1787 | |
1788 | |
1789 | =head2 Number-insensitive equality |
1790 | |
1791 | Lingua::EN::Inflect also provides a solution to the problem |
1792 | of comparing words of differing plurality through the exportable subroutines |
1793 | C<PL_eq($$)>, C<PL_N_eq($$)>, C<PL_V_eq($$)>, and C<PL_ADJ_eq($$)>. |
1794 | Each of these subroutines takes two strings, and compares them |
1795 | using the corresponding plural-inflection subroutine (C<PL()>, C<PL_N()>, |
1796 | C<PL_V()>, and C<PL_ADJ()> respectively). |
1797 | |
1798 | The comparison returns true if: |
1799 | |
1800 | =over 8 |
1801 | |
1802 | =item * |
1803 | |
1804 | the strings are C<eq>-equal, or |
1805 | |
1806 | =item * |
1807 | |
1808 | one string is C<eq>-equal to a plural form of the other, or |
1809 | |
1810 | =item * |
1811 | |
1812 | the strings are two different plural forms of the one word. |
1813 | |
1814 | =back |
1815 | |
1816 | Hence all of the following return true: |
1817 | |
1818 | PL_eq("index","index") # RETURNS "eq" |
1819 | PL_eq("index","indexes") # RETURNS "s:p" |
1820 | PL_eq("index","indices") # RETURNS "s:p" |
1821 | PL_eq("indexes","index") # RETURNS "p:s" |
1822 | PL_eq("indices","index") # RETURNS "p:s" |
1823 | PL_eq("indices","indexes") # RETURNS "p:p" |
1824 | PL_eq("indexes","indices") # RETURNS "p:p" |
1825 | PL_eq("indices","indices") # RETURNS "eq" |
1826 | |
1827 | As indicated by the comments in the previous example, the actual value |
1828 | returned by the various C<PL_eq_...> subroutines encodes which of the |
1829 | three equality rules succeeded: "eq" is returned if the strings were |
1830 | identical, "s:p" if the strings were singular and plural respectively, |
1831 | "p:s" for plural and singular, and "p:p" for two distinct plurals. |
1832 | Inequality is indicated by returning an empty string. |
1833 | |
1834 | It should be noted that two distinct singular words which happen to take |
1835 | the same plural form are I<not> considered equal, nor are cases where |
1836 | one (singular) word's plural is the other (plural) word's singular. |
1837 | Hence all of the following return false: |
1838 | |
1839 | PL_eq("base","basis") # ALTHOUGH BOTH -> "bases" |
1840 | PL_eq("syrinx","syringe") # ALTHOUGH BOTH -> "syringes" |
1841 | PL_eq("she","he") # ALTHOUGH BOTH -> "they" |
1842 | |
1843 | PL_eq("opus","operas") # ALTHOUGH "opus" -> "opera" -> "operas" |
1844 | PL_eq("taxi","taxes") # ALTHOUGH "taxi" -> "taxis" -> "taxes" |
1845 | |
1846 | Note too that, although the comparison is "number-insensitive" it is I<not> |
1847 | case-insensitive (that is, C<PL("time","Times")> returns false. To obtain |
1848 | both number and case insensitivity, prefix both arguments with C<lc> |
1849 | (that is, C<PL(lc "time", lc "Times")> returns true). |
1850 | |
1851 | |
1852 | =head1 OTHER VERB FORMS |
1853 | |
1854 | =head2 Present participles |
1855 | |
1856 | C<Lingua::EN::Inflect> also provides the C<PART_PRES> subroutine, |
1857 | which can take a 3rd person singular verb and |
1858 | correctly inflect it to its present participle: |
1859 | |
1860 | PART_PRES("runs") # "running" |
1861 | PART_PRES("loves") # "loving" |
1862 | PART_PRES("eats") # "eating" |
1863 | PART_PRES("bats") # "batting" |
1864 | PART_PRES("spies") # "spying" |
1865 | |
1866 | |
1867 | =head1 PROVIDING INDEFINITE ARTICLES |
1868 | |
1869 | =head2 Selecting indefinite articles |
1870 | |
1871 | Lingua::EN::Inflect provides two exportable subroutines (C<A($;$)> and |
1872 | C<AN($;$)>) which will correctly prepend the appropriate indefinite |
1873 | article to a word, depending on its pronunciation. For example: |
1874 | |
1875 | A("cat") # -> "a cat" |
1876 | AN("cat") # -> "a cat" |
1877 | A("euphemism") # -> "a euphemism" |
1878 | A("Euler number") # -> "an Euler number" |
1879 | A("hour") # -> "an hour" |
1880 | A("houri") # -> "a houri" |
1881 | |
1882 | The two subroutines are I<identical> in function and may be used |
1883 | interchangeably. The only reason that two versions are provided is to |
1884 | enhance the readability of code such as: |
1885 | |
1886 | print "That is ", AN($errortype), " error\n; |
1887 | print "That is ", A($fataltype), " fatal error\n; |
1888 | |
1889 | Note that in both cases the actual article provided depends I<only> on |
1890 | the pronunciation of the first argument, I<not> on the name of the |
1891 | subroutine. |
1892 | |
1893 | C<A()> and C<AN()> will ignore any indefinite article that already |
1894 | exists at the start of the string. Thus: |
1895 | |
1896 | @half_arked = ( |
1897 | "a elephant", |
1898 | "a giraffe", |
1899 | "an ewe", |
1900 | "a orangutan", |
1901 | ); |
1902 | |
1903 | print A($_), "\n" for @half_arked; |
1904 | |
1905 | # prints: |
1906 | # an elephant |
1907 | # a giraffe |
1908 | # a ewe |
1909 | # an orangutan |
1910 | |
1911 | |
1912 | C<A()> and C<AN()> both take an optional second argument. As with the |
1913 | C<PL_...> subroutines, this second argument is a "number" specifier. If |
1914 | its value is C<1> (or some other value implying singularity), C<A()> and |
1915 | C<AN()> insert "a" or "an" as appropriate. If the number specifier |
1916 | implies plurality, (C<A()> and C<AN()> insert the actual second argument instead. |
1917 | For example: |
1918 | |
1919 | A("cat",1) # -> "a cat" |
1920 | A("cat",2) # -> "2 cat" |
1921 | A("cat","one") # -> "one cat" |
1922 | A("cat","no") # -> "no cat" |
1923 | |
1924 | Note that, as implied by the previous examples, C<A()> and |
1925 | C<AN()> both assume that their job is merely to provide the correct |
1926 | qualifier for a word (that is: "a", "an", or the specified count). |
1927 | In other words, they assume that the word they are given has |
1928 | already been correctly inflected for plurality. Hence, if C<$N> |
1929 | has the value 2, then: |
1930 | |
1931 | print A("cat",$N); |
1932 | |
1933 | prints "2 cat", instead of "2 cats". The correct approach is to use: |
1934 | |
1935 | print A(PL("cat",$N),$N); |
1936 | |
1937 | or, better still: |
1938 | |
1939 | print NO("cat",$N); |
1940 | |
1941 | Note too that, like the various C<PL_...> subroutines, whenever C<A()> |
1942 | and C<AN()> are called with only one argument they are subject to the |
1943 | effects of any preceding call to C<NUM()>. Hence, another possible |
1944 | solution is: |
1945 | |
1946 | NUM($N); |
1947 | print A(PL("cat")); |
1948 | |
1949 | |
1950 | =head2 Indefinite articles and initialisms |
1951 | |
1952 | "Initialisms" (sometimes inaccurately called "acronyms") are terms which |
1953 | have been formed from the initial letters of words in a phrase (for |
1954 | example, "NATO", "NBL", "S.O.S.", "SCUBA", etc.) |
1955 | |
1956 | Such terms present a particular challenge when selecting between "a" |
1957 | and "an", since they are sometimes pronounced as if they were a single |
1958 | word ("nay-tow", "sku-ba") and sometimes as a series of letter names |
1959 | ("en-eff-ell", "ess-oh-ess"). |
1960 | |
1961 | C<A()> and C<AN()> cope with this dichotomy using a series of inbuilt |
1962 | rules, which may be summarized as: |
1963 | |
1964 | =over 8 |
1965 | |
1966 | =item 1. |
1967 | |
1968 | If the word starts with a single letter, followed by a period or dash |
1969 | (for example, "R.I.P.", "C.O.D.", "e-mail", "X-ray", "T-square"), then |
1970 | choose the appropriate article for the I<sound> of the first letter |
1971 | ("an R.I.P.", "a C.O.D.", "an e-mail", "an X-ray", "a T-square"). |
1972 | |
1973 | =item 2. |
1974 | |
1975 | If the first two letters of the word are capitals, |
1976 | consonants, and do not appear at the start of any known English word, |
1977 | (for example, "LCD", "XML", "YWCA"), then once again choose "a" or |
1978 | "an" depending on the I<sound> of the first letter ("an LCD", "an |
1979 | XML", "a YWCA"). |
1980 | |
1981 | =item 3. |
1982 | |
1983 | Otherwise, assume the string is a capitalized word or a |
1984 | pronounceable initialism (for example, "LED", "OPEC", "FAQ", "UNESCO"), and |
1985 | therefore takes "a" or "an" according to the (apparent) pronunciation of |
1986 | the entire word ("a LED", "an OPEC", "a FAQ", "a UNESCO"). |
1987 | |
1988 | =back |
1989 | |
1990 | Note that rules 1 and 3 together imply that the presence or absence of |
1991 | punctuation may change the selection of indefinite article for a |
1992 | particular initialism (for example, "a FAQ" but "an F.A.Q."). |
1993 | |
1994 | |
1995 | =head2 Indefinite articles and "soft H's" |
1996 | |
1997 | Words beginning in the letter 'H' present another type of difficulty |
1998 | when selecting a suitable indefinite article. In a few such words |
1999 | (for example, "hour", "honour", "heir") the 'H' is not voiced at |
2000 | all, and so such words inflect with "an". The remaining cases |
2001 | ("voiced H's") may be divided into two categories: |
2002 | "hard H's" (such as "hangman", "holograph", "hat", etc.) and |
2003 | "soft H's" (such as "hysterical", "horrendous", "holy", etc.) |
2004 | |
2005 | Hard H's always take "a" as their indefinite article, and soft |
2006 | H's normally do so as well. But I<some> English speakers prefer |
2007 | "an" for soft H's (although the practice is now generally considered an |
2008 | affectation, rather than a legitimate grammatical alternative). |
2009 | |
2010 | At present, the C<A()> and C<AN()> subroutines ignore soft H's and use |
2011 | "a" for any voiced 'H'. The author would, however, welcome feedback on |
2012 | this decision (envisaging a possible future "soft H" mode). |
2013 | |
2014 | |
2015 | =head1 INFLECTING ORDINALS |
2016 | |
2017 | Occasionally it is useful to present an integer value as an ordinal |
2018 | rather than as a numeral. For example: |
2019 | |
2020 | Enter password (1st attempt): ******** |
2021 | Enter password (2nd attempt): ********* |
2022 | Enter password (3rd attempt): ********* |
2023 | No 4th attempt. Access denied. |
2024 | |
2025 | To this end, Lingua::EN::Inflect provides the C<ORD()> subroutine. |
2026 | <ORD()> takes a single argument and forms its ordinal equivalent. |
2027 | If the argument isn't a numerical integer, it just adds "-th". |
2028 | |
2029 | |
2030 | =head1 CONVERTING NUMBERS TO WORDS |
2031 | |
2032 | The exportable subroutine C<NUMWORDS> takes a number (cardinal or ordinal) |
2033 | and returns an English representation of that number. In a scalar context |
2034 | a string is returned. Hence: |
2035 | |
2036 | use Lingua::EN::Inflect qw( NUMWORDS ); |
2037 | |
2038 | $words = NUMWORDS(1234567); |
2039 | |
2040 | puts the string: |
2041 | |
2042 | "one million, two hundred and thirty-four thousand, five hundred and sixty-seven" |
2043 | |
2044 | into $words. |
2045 | |
2046 | In a list context each comma-separated chunk is returned as a separate element. |
2047 | Hence: |
2048 | |
2049 | @words = NUMWORDS(1234567); |
2050 | |
2051 | puts the list: |
2052 | |
2053 | ("one million", |
2054 | "two hundred and thirty-four thousand", |
2055 | "five hundred and sixty-seven") |
2056 | |
2057 | into @words. |
2058 | |
2059 | Non-digits (apart from an optional leading plus or minus sign, |
2060 | any decimal points, and ordinal suffixes -- see below) are silently |
2061 | ignored, so the following all produce identical results: |
2062 | |
2063 | NUMWORDS(5551202); |
2064 | NUMWORDS(5_551_202); |
2065 | NUMWORDS("5,551,202"); |
2066 | NUMWORDS("555-1202"); |
2067 | |
2068 | That last case is a little awkward since it's almost certainly a phone number, |
2069 | and "five million, five hundred and fifty-one thousand, two hundred and two" |
2070 | probably isn't what's wanted. |
2071 | |
2072 | To overcome this, C<NUMWORDS()> takes an optional named argument, 'group', |
2073 | which changes how numbers are translated. The argument must be a |
2074 | positive integer less than four, which indicated how the digits of the |
2075 | number are to be grouped. If the argument is C<1>, then each digit is |
2076 | translated separately. If the argument is C<2>, pairs of digits |
2077 | (starting from the I<left>) are grouped together. If the argument is |
2078 | C<3>, triples of numbers (again, from the I<left>) are grouped. Hence: |
2079 | |
2080 | NUMWORDS("555-1202", group=>1) |
2081 | |
2082 | returns C<"five, five, five, one, two, zero, two">, whilst: |
2083 | |
2084 | NUMWORDS("555-1202", group=>2) |
2085 | |
2086 | returns C<"fifty-five, fifty-one, twenty, two">, and: |
2087 | |
2088 | NUMWORDS("555-1202", group=>3) |
2089 | |
2090 | returns C<"five fifty-five, one twenty, two">. |
2091 | |
2092 | Phone numbers are often written in words as |
2093 | C<"five..five..five..one..two..zero..two">, which is also easy to |
2094 | achieve: |
2095 | |
2096 | join '..', NUMWORDS("555-1202", group=>1) |
2097 | |
2098 | C<NUMWORDS> also handles decimal fractions. Hence: |
2099 | |
2100 | NUMWORDS("1.2345") |
2101 | |
2102 | returns C<"one point two three four five"> in a scalar context |
2103 | and C<("one","point","two","three","four","five")>) in an array context. |
2104 | Exponent form (C<"1.234e56">) is not yet handled. |
2105 | |
2106 | Multiple decimal points are only translated in one of the "grouping" modes. |
2107 | Hence: |
2108 | |
2109 | NUMWORDS(101.202.303) |
2110 | |
2111 | returns C<"one hundred and one point two zero two three zero three">, |
2112 | whereas: |
2113 | |
2114 | NUMWORDS(101.202.303, group=>1) |
2115 | |
2116 | returns C<"one zero one point two zero two point three zero three">. |
2117 | |
2118 | The digit C<'0'> is unusual in that in may be translated to English as "zero", |
2119 | "oh", or "nought". To cater for this diversity, C<NUMWORDS> may be passed |
2120 | a named argument, 'zero', which may be set to |
2121 | the desired translation of C<'0'>. For example: |
2122 | |
2123 | print join "..", NUMWORDS("555-1202", group=>3, zero=>'oh') |
2124 | |
2125 | prints C<"five..five..five..one..two..oh..two">. |
2126 | By default, zero is rendered as "zero". |
2127 | |
2128 | Likewise, the digit C<'1'> may be rendered as "one" or "a/an" (or very |
2129 | occasionally other variants), depending on the context. So there is a |
2130 | C<'one'> argument as well: |
2131 | |
2132 | print NUMWORDS($_, one=>'a solitary', zero=>'no more'), |
2133 | PL(" bottle of beer on the wall\n", $_) |
2134 | for (3,2,1,0); |
2135 | |
2136 | # prints: |
2137 | # three bottles of beer on the wall |
2138 | # two bottles of beer on the wall |
2139 | # a solitary bottle of beer on the wall |
2140 | # no more bottles of beer on the wall |
2141 | |
2142 | Care is needed if the word "a/an" is to be used as a C<'one'> value. |
2143 | Unless the next word is known in advance, it's almost always necessary |
2144 | to use the C<A> function as well: |
2145 | |
2146 | print A( NUMWORDS(1, one=>'a') . " $_\n") |
2147 | for qw(cat aardvark ewe hour); |
2148 | |
2149 | # prints: |
2150 | # a cat |
2151 | # an aardvark |
2152 | # a ewe |
2153 | # an hour |
2154 | |
2155 | Another major regional variation in number translation is the use of |
2156 | "and" in certain contexts. The named argument 'and' |
2157 | allows the programmer to specify how "and" should be handled. Hence: |
2158 | |
2159 | print scalar NUMWORDS("765", 'and'=>'') |
2160 | |
2161 | prints "seven hundred sixty-five", instead of "seven hundred and sixty-five". |
2162 | By default, the "and" is included. |
2163 | |
2164 | The translation of the decimal point is also subject to variation |
2165 | (with "point", "dot", and "decimal" being the favorites). |
2166 | The named argument 'decimal' allows the |
2167 | programmer to how the decimal point should be rendered. Hence: |
2168 | |
2169 | print scalar NUMWORDS("666.124.64.101", group=>3, decimal=>'dot') |
2170 | |
2171 | prints "six sixty-six, dot, one twenty-four, dot, sixty-four, dot, one zero one" |
2172 | By default, the decimal point is rendered as "point". |
2173 | |
2174 | C<NUMWORDS> also handles the ordinal forms of numbers. So: |
2175 | |
2176 | print scalar NUMWORDS('1st'); |
2177 | print scalar NUMWORDS('3rd'); |
2178 | print scalar NUMWORDS('202nd'); |
2179 | print scalar NUMWORDS('1000000th'); |
2180 | |
2181 | print: |
2182 | |
2183 | first |
2184 | third |
2185 | two hundred and twenty-second |
2186 | one millionth |
2187 | |
2188 | Two common idioms in this regard are: |
2189 | |
2190 | print scalar NUMWORDS(ORD($number)); |
2191 | |
2192 | and: |
2193 | |
2194 | print scalar ORD(NUMWORDS($number)); |
2195 | |
2196 | These are identical in effect, except when $number contains a decimal: |
2197 | |
2198 | $number = 99.09; |
2199 | print scalar NUMWORDS(ORD($number)); # ninety-ninth point zero nine |
2200 | print scalar ORD(NUMWORDS($number)); # ninety-nine point zero ninth |
2201 | |
2202 | Use whichever you feel is most appropriate. |
2203 | |
2204 | |
2205 | =head1 INTERPOLATING INFLECTIONS IN STRINGS |
2206 | |
2207 | By far the commonest use of the inflection subroutines is to |
2208 | produce message strings for various purposes. For example: |
2209 | |
2210 | print NUM($errors), PL_N(" error"), PL_V(" was"), " detected.\n"; |
2211 | print PL_ADJ("This"), PL_N(" error"), PL_V(" was"), "fatal.\n" |
2212 | if $severity > 1; |
2213 | |
2214 | Unfortunately the need to separate each subroutine call detracts |
2215 | significantly from the readability of the resulting code. To ameliorate |
2216 | this problem, Lingua::EN::Inflect provides an exportable string-interpolating |
2217 | subroutine (C<inflect($)>), which recognizes calls to the various inflection |
2218 | subroutines within a string and interpolates them appropriately. |
2219 | |
2220 | Using C<inflect> the previous example could be rewritten: |
2221 | |
2222 | print inflect "NUM($errors) PL_N(error) PL_V(was) detected.\n"; |
2223 | print inflect "PL_ADJ(This) PL_N(error) PL_V(was) fatal.\n" |
2224 | if $severity > 1; |
2225 | |
2226 | Note that C<inflect> also correctly handles calls to the C<NUM()> subroutine |
2227 | (whether interpolated or antecedent). The C<inflect()> subroutine has |
2228 | a related extra feature, in that it I<automatically> cancels any "default |
2229 | number" value before it returns its interpolated string. This means that |
2230 | calls to C<NUM()> which are embedded in an C<inflect()>-interpolated |
2231 | string do not "escape" and interfere with subsequent inflections. |
2232 | |
2233 | |
2234 | =head1 MODERN VS CLASSICAL INFLECTIONS |
2235 | |
2236 | Certain words, mainly of Latin or Ancient Greek origin, can form |
2237 | plurals either using the standard English "-s" suffix, or with |
2238 | their original Latin or Greek inflections. For example: |
2239 | |
2240 | PL("stigma") # -> "stigmas" or "stigmata" |
2241 | PL("torus") # -> "toruses" or "tori" |
2242 | PL("index") # -> "indexes" or "indices" |
2243 | PL("millennium") # -> "millenniums" or "millennia" |
2244 | PL("ganglion") # -> "ganglions" or "ganglia" |
2245 | PL("octopus") # -> "octopuses" or "octopodes" |
2246 | |
2247 | |
2248 | Lingua::EN::Inflect caters to such words by providing an |
2249 | "alternate state" of inflection known as "classical mode". |
2250 | By default, words are inflected using their contemporary English |
2251 | plurals, but if classical mode is invoked, the more traditional |
2252 | plural forms are returned instead. |
2253 | |
2254 | The exportable subroutine C<classical()> controls this feature. |
2255 | If C<classical()> is called with no arguments, it unconditionally |
2256 | invokes classical mode. If it is called with a single argument, it |
2257 | turns all classical inflects on or off (depending on whether the argument is |
2258 | true or false). If called with two or more arguments, those arguments |
2259 | specify which aspects of classical behaviour are to be used. |
2260 | |
2261 | Thus: |
2262 | |
2263 | classical; # SWITCH ON CLASSICAL MODE |
2264 | print PL("formula"); # -> "formulae" |
2265 | |
2266 | classical 0; # SWITCH OFF CLASSICAL MODE |
2267 | print PL("formula"); # -> "formulas" |
2268 | |
2269 | classical $cmode; # CLASSICAL MODE IFF $cmode |
2270 | print PL("formula"); # -> "formulae" (IF $cmode) |
2271 | # -> "formulas" (OTHERWISE) |
2272 | |
2273 | classical herd=>1; # SWITCH ON CLASSICAL MODE FOR "HERD" NOUNS |
2274 | print PL("wilderbeest"); # -> "wilderbeest" |
2275 | |
2276 | classical names=>1; # SWITCH ON CLASSICAL MODE FOR NAMES |
2277 | print PL("sally"); # -> "sallies" |
2278 | print PL("Sally"); # -> "Sallys" |
2279 | |
2280 | Note however that C<classical()> has no effect on the inflection of words which |
2281 | are now fully assimilated. Hence: |
2282 | |
2283 | PL("forum") # ALWAYS -> "forums" |
2284 | PL("criterion") # ALWAYS -> "criteria" |
2285 | |
2286 | LEI assumes that a capitalized word is a person's name. So it forms the |
2287 | plural according to the rules for names (which is that you don't |
2288 | inflect, you just add -s or -es). You can choose to turn that behaviour |
2289 | off (it's on by the default, even when the module isn't in classical |
2290 | mode) by calling C< classical(names=>0) >; |
2291 | |
2292 | =head1 USER-DEFINED INFLECTIONS |
2293 | |
2294 | =head2 Adding plurals at run-time |
2295 | |
2296 | Lingua::EN::Inflect provides five exportable subroutines which allow |
2297 | the programmer to override the module's behaviour for specific cases: |
2298 | |
2299 | =over 8 |
2300 | |
2301 | =item C<def_noun($$)> |
2302 | |
2303 | The C<def_noun> subroutine takes a pair of string arguments: the singular and |
2304 | plural forms of the noun being specified. The singular form |
2305 | specifies a pattern to be interpolated (as C<m/^(?:$first_arg)$/i>). |
2306 | Any noun matching this pattern is then replaced by the string in the |
2307 | second argument. The second argument specifies a string which is |
2308 | interpolated after the match succeeds, and is then used as the plural |
2309 | form. For example: |
2310 | |
2311 | def_noun 'cow' => 'kine'; |
2312 | def_noun '(.+i)o' => '$1i'; |
2313 | def_noun 'spam(mer)?' => '\\$\\%\\@#\\$\\@#!!'; |
2314 | |
2315 | Note that both arguments should usually be specified in single quotes, |
2316 | so that they are not interpolated when they are specified, but later (when |
2317 | words are compared to them). As indicated by the last example, care |
2318 | also needs to be taken with certain characters in the second argument, |
2319 | to ensure that they are not unintentionally interpolated during comparison. |
2320 | |
2321 | The second argument string may also specify a second variant of the plural |
2322 | form, to be used when "classical" plurals have been requested. The beginning |
2323 | of the second variant is marked by a '|' character: |
2324 | |
2325 | def_noun 'cow' => 'cows|kine'; |
2326 | def_noun '(.+i)o' => '$1os|$1i'; |
2327 | def_noun 'spam(mer)?' => '\\$\\%\\@#\\$\\@#!!|varmints'; |
2328 | |
2329 | If no classical variant is given, the specified plural form is used in |
2330 | both normal and "classical" modes. |
2331 | |
2332 | If the second argument is C<undef> instead of a string, then the |
2333 | current user definition for the first argument is removed, and the |
2334 | standard plural inflection(s) restored. |
2335 | |
2336 | Note that in all cases, later plural definitions for a particular |
2337 | singular form replace earlier definitions of the same form. For example: |
2338 | |
2339 | # FIRST, HIDE THE MODERN FORM.... |
2340 | def_noun 'aviatrix' => 'aviatrices'; |
2341 | |
2342 | # LATER, HIDE THE CLASSICAL FORM... |
2343 | def_noun 'aviatrix' => 'aviatrixes'; |
2344 | |
2345 | # FINALLY, RESTORE THE DEFAULT BEHAVIOUR... |
2346 | def_noun 'aviatrix' => undef; |
2347 | |
2348 | |
2349 | Special care is also required when defining general patterns and |
2350 | associated specific exceptions: put the more specific cases I<after> |
2351 | the general pattern. For example: |
2352 | |
2353 | def_noun '(.+)us' => '$1i'; # EVERY "-us" TO "-i" |
2354 | def_noun 'bus' => 'buses'; # EXCEPT FOR "bus" |
2355 | |
2356 | This "try-most-recently-defined-first" approach to matching |
2357 | user-defined words is also used by C<def_verb>, C<def_a> and C<def_an>. |
2358 | |
2359 | |
2360 | =item C<def_verb($$$$$$)> |
2361 | |
2362 | The C<def_verb> subroutine takes three pairs of string arguments (that is, six |
2363 | arguments in total), specifying the singular and plural forms of the three |
2364 | "persons" of verb. As with C<def_noun>, the singular forms are specifications of |
2365 | run-time-interpolated patterns, whilst the plural forms are specifications of |
2366 | (up to two) run-time-interpolated strings: |
2367 | |
2368 | def_verb 'am' => 'are', |
2369 | 'are' => 'are|art", |
2370 | 'is' => 'are'; |
2371 | |
2372 | def_verb 'have' => 'have', |
2373 | 'have' => 'have", |
2374 | 'ha(s|th)' => 'have'; |
2375 | |
2376 | Note that as with C<def_noun>, modern/classical variants of plurals |
2377 | may be separately specified, subsequent definitions replace previous |
2378 | ones, and C<undef>'ed plural forms revert to the standard behaviour. |
2379 | |
2380 | |
2381 | =item C<def_adj($$)> |
2382 | |
2383 | The C<def_adj> subroutine takes a pair of string arguments, which specify |
2384 | the singular and plural forms of the adjective being defined. |
2385 | As with C<def_noun> and C<def_adj>, the singular forms are specifications of |
2386 | run-time-interpolated patterns, whilst the plural forms are specifications of |
2387 | (up to two) run-time-interpolated strings: |
2388 | |
2389 | def_adj 'this' => 'these', |
2390 | def_adj 'red' => 'red|gules', |
2391 | |
2392 | As previously, modern/classical variants of plurals |
2393 | may be separately specified, subsequent definitions replace previous |
2394 | ones, and C<undef>'ed plural forms revert to the standard behaviour. |
2395 | |
2396 | |
2397 | =item C<def_a($)> and C<def_an($)> |
2398 | |
2399 | The C<def_a> and C<def_an> subroutines each take a single argument, which |
2400 | specifies a pattern. If a word passed to C<A()> or C<AN()> matches this |
2401 | pattern, it will be prefixed (unconditionally) with the corresponding indefinite |
2402 | article. For example: |
2403 | |
2404 | def_a 'error'; |
2405 | def_a 'in.+'; |
2406 | |
2407 | def_an 'mistake'; |
2408 | def_an 'error'; |
2409 | |
2410 | As with the other C<def_...> subroutines, such redefinitions are sequential |
2411 | in effect so that, after the above example, "error" will be inflected with "an". |
2412 | |
2413 | =back |
2414 | |
2415 | =head2 The F<$HOME/.inflectrc> file |
2416 | |
2417 | When it is imported, Lingua::EN::Inflect executes (as Perl code) |
2418 | the contents of any file named F<.inflectrc> which it finds in the |
2419 | in the directory where F<Lingua/EN/Inflect.pm> is installed, |
2420 | or in the current home directory (C<$ENV{HOME}>), or in both. |
2421 | Note that the code is executed within the Lingua::EN::Inflect |
2422 | namespace. |
2423 | |
2424 | Hence the user or the local Perl guru can make appropriate calls to |
2425 | C<def_noun>, C<def_verb>, etc. in one of these F<.inflectrc> files, to |
2426 | permanently and universally modify the behaviour of the module. For example |
2427 | |
2428 | > cat /usr/local/lib/perl5/Text/Inflect/.inflectrc |
2429 | |
2430 | def_noun "UNIX" => "UN*X|UNICES"; |
2431 | |
2432 | def_verb "teco" => "teco", # LITERALLY: "to edit with TECO" |
2433 | "teco" => "teco", |
2434 | "tecos" => "teco"; |
2435 | |
2436 | def_a "Euler.*"; # "Yewler" TURNS IN HIS GRAVE |
2437 | |
2438 | |
2439 | Note that calls to the C<def_...> subroutines from within a program |
2440 | will take precedence over the contents of the home directory |
2441 | F<.inflectrc> file, which in turn takes precedence over the system-wide |
2442 | F<.inflectrc> file. |
2443 | |
2444 | |
2445 | =head1 DIAGNOSTICS |
2446 | |
2447 | On loading, if the Perl code in a F<.inflectrc> file is invalid |
2448 | (syntactically or otherwise), an appropriate fatal error is issued. |
2449 | A common problem is not ending the file with something that |
2450 | evaluates to true (as the five C<def_...> subroutines do). |
2451 | |
2452 | Using the five C<def_...> subroutines directly in a program may also |
2453 | result in fatal diagnostics, if a (singular) pattern or an interpolated |
2454 | (plural) string is somehow invalid. |
2455 | |
2456 | Specific diagnostics related to user-defined inflections are: |
2457 | |
2458 | =over 8 |
2459 | |
2460 | =item C<"Bad user-defined singular pattern:\n\t %s"> |
2461 | |
2462 | The singular form of a user-defined noun or verb |
2463 | (as defined by a call to C<def_noun>, C<def_verb>, C<def_adj>, |
2464 | C<def_a> or C<def_an>) is not a valid Perl regular expression. The |
2465 | actual Perl error message is also given. |
2466 | |
2467 | =item C<"Bad user-defined plural string: '%s'"> |
2468 | |
2469 | The plural form(s) of a user-defined noun or verb |
2470 | (as defined by a call to C<def_noun>, C<def_verb> or C<def_adj>) |
2471 | is not a valid Perl interpolated string (usually because it |
2472 | interpolates some undefined variable). |
2473 | |
2474 | =item C<"Bad .inflectrc file (%s):\n %s"> |
2475 | |
2476 | Some other problem occurred in loading the named local |
2477 | or global F<.inflectrc> file. The Perl error message (including |
2478 | the line number) is also given. |
2479 | |
2480 | =back |
2481 | |
2482 | There are I<no> diagnosable run-time error conditions for the actual |
2483 | inflection subroutines, except C<NUMWORDS> and hence no run-time |
2484 | diagnostics. If the inflection subroutines are unable to form a plural |
2485 | via a user-definition or an inbuilt rule, they just "guess" the |
2486 | commonest English inflection: adding "-s" for nouns, removing "-s" for |
2487 | verbs, and no inflection for adjectives. |
2488 | |
2489 | C<Lingua::EN::Inflect::NUMWORDS()> can C<die> with the following messages: |
2490 | |
2491 | =over 8 |
2492 | |
2493 | =item C<"Bad grouping option: %s"> |
2494 | |
2495 | The optional argument to C<NUMWORDS()> wasn't 1, 2 or 3. |
2496 | |
2497 | =item C<"Number out of range"> |
2498 | |
2499 | C<NUMWORDS()> was passed a number larger than |
2500 | 999,999,999,999,999,999,999,999,999,999,999,999 (that is: nine hundred |
2501 | and ninety-nine decillion, nine hundred and ninety-nine nonillion, nine |
2502 | hundred and ninety-nine octillion, nine hundred and ninety-nine |
2503 | septillion, nine hundred and ninety-nine sextillion, nine hundred and |
2504 | ninety-nine quintillion, nine hundred and ninety-nine quadrillion, nine |
2505 | hundred and ninety-nine trillion, nine hundred and ninety-nine billion, |
2506 | nine hundred and ninety-nine million, nine hundred and ninety-nine |
2507 | thousand, nine hundred and ninety-nine :-) |
2508 | |
2509 | The problem is that C<NUMWORDS> doesn't know any |
2510 | words for number components bigger than "decillion". |
2511 | |
2512 | |
2513 | =head1 OTHER ISSUES |
2514 | |
2515 | =head2 2nd Person precedence |
2516 | |
2517 | If a verb has identical 1st and 2nd person singular forms, but |
2518 | different 1st and 2nd person plural forms, then when its plural is |
2519 | constructed, the 2nd person plural form is always preferred. |
2520 | |
2521 | The author is not currently aware of any such verbs in English, but is |
2522 | not quite arrogant enough to assume I<ipso facto> that none exist. |
2523 | |
2524 | |
2525 | =head2 Nominative precedence |
2526 | |
2527 | The singular pronoun "it" presents a special problem because its plural form |
2528 | can vary, depending on its "case". For example: |
2529 | |
2530 | It ate my homework -> They ate my homework |
2531 | It ate it -> They ate them |
2532 | I fed my homework to it -> I fed my homework to them |
2533 | |
2534 | As a consequence of this ambiguity, C<PL()> or C<PL_N> have been implemented |
2535 | so that they always return the I<nominative> plural (that is, "they"). |
2536 | |
2537 | However, when asked for the plural of an unambiguously I<accusative> |
2538 | "it" (namely, C<PL("to it")>, C<PL_N("from it")>, C<PL("with it")>, |
2539 | etc.), both subroutines will correctly return the accusative plural |
2540 | ("to them", "from them", "with them", etc.) |
2541 | |
2542 | |
2543 | =head2 The plurality of zero |
2544 | |
2545 | The rules governing the choice between: |
2546 | |
2547 | There were no errors. |
2548 | |
2549 | and |
2550 | |
2551 | There was no error. |
2552 | |
2553 | are complex and often depend more on I<intent> rather than I<content>. |
2554 | Hence it is infeasible to specify such rules algorithmically. |
2555 | |
2556 | Therefore, Lingua::EN::Text contents itself with the following compromise: If |
2557 | the governing number is zero, inflections always return the plural form |
2558 | unless the appropriate "classical" inflection is in effect, in which case the |
2559 | singular form is always returned. |
2560 | |
2561 | Thus, the sequence: |
2562 | |
2563 | NUM(0); |
2564 | print inflect "There PL(was) NO(choice)"; |
2565 | |
2566 | produces "There were no choices", whereas: |
2567 | |
2568 | classical 'zero'; # or: classical(zero=>1); |
2569 | NUM(0); |
2570 | print inflect "There PL(was) NO(choice)"; |
2571 | |
2572 | it will print "There was no choice". |
2573 | |
2574 | |
2575 | =head2 Homographs with heterogeneous plurals |
2576 | |
2577 | Another context in which intent (and not content) sometimes determines |
2578 | plurality is where two distinct meanings of a word require different |
2579 | plurals. For example: |
2580 | |
2581 | Three basses were stolen from the band's equipment trailer. |
2582 | Three bass were stolen from the band's aquarium. |
2583 | |
2584 | I put the mice next to the cheese. |
2585 | I put the mouses next to the computers. |
2586 | |
2587 | Several thoughts about leaving crossed my mind. |
2588 | Several thought about leaving across my lawn. |
2589 | |
2590 | Lingua::EN::Inflect handles such words in two ways: |
2591 | |
2592 | =over 8 |
2593 | |
2594 | =item * |
2595 | |
2596 | If both meanings of the word are the I<same> part of speech (for |
2597 | example, "bass" is a noun in both sentences above), then one meaning |
2598 | is chosen as the "usual" meaning, and only that meaning's plural is |
2599 | ever returned by any of the inflection subroutines. |
2600 | |
2601 | =item * |
2602 | |
2603 | If each meaning of the word is a different part of speech (for |
2604 | example, "thought" is both a noun and a verb), then the noun's |
2605 | plural is returned by C<PL()> and C<PL_N()> and the verb's plural is |
2606 | returned only by C<PL_V()>. |
2607 | |
2608 | =back |
2609 | |
2610 | Such contexts are, fortunately, uncommon (particularly |
2611 | "same-part-of-speech" examples). An informal study of nearly 600 |
2612 | "difficult plurals" indicates that C<PL()> can be relied upon to "get |
2613 | it right" about 98% of the time (although, of course, ichthyophilic |
2614 | guitarists or cyber-behaviouralists may experience higher rates of |
2615 | confusion). |
2616 | |
2617 | If the choice of a particular "usual inflection" is considered |
2618 | inappropriate, it can always be reversed with a preliminary call |
2619 | to the corresponding C<def_...> subroutine. |
2620 | |
2621 | =head1 NOTE |
2622 | |
2623 | I'm not taking any further correspondence on: |
2624 | |
2625 | =over |
2626 | |
2627 | =item "octopi". |
2628 | |
2629 | Despite the populist pandering of certain New World dictionaries, the |
2630 | plural is "octopuses" or (for the pendantic classicist) "octopodes". The |
2631 | suffix "-pus" is Greek, not Latin, so the plural is "-podes", not "pi". |
2632 | |
2633 | |
2634 | =item "virus". |
2635 | |
2636 | Had no plural in Latin (possibly because it was a mass noun). |
2637 | The only plural is the Anglicized "viruses". |
2638 | |
2639 | =back |
2640 | |
2641 | =head1 AUTHORS |
2642 | |
2643 | Damian Conway (damian@conway.org) |
2644 | Matthew Persico (ORD inflection) |
2645 | |
2646 | |
2647 | =head1 BUGS AND IRRITATIONS |
2648 | |
2649 | The endless inconsistencies of English. |
2650 | |
2651 | (I<Please> report words for which the correct plural or |
2652 | indefinite article is not formed, so that the reliability |
2653 | of Lingua::EN::Inflect can be improved.) |
2654 | |
2655 | |
2656 | |
2657 | =head1 COPYRIGHT |
2658 | |
2659 | Copyright (c) 1997-2000, Damian Conway. All Rights Reserved. |
2660 | This module is free software. It may be used, redistributed |
2661 | and/or modified under the same terms as Perl itself. |