use improvements in Lingua packages throughout our lexeme tagging
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Latin.pm
CommitLineData
5271a011 1package Text::Tradition::Language::Latin;
2
3use strict;
4use warnings;
5use Module::Load;
f8862b58 6use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
5271a011 7use TryCatch;
8
9=head1 NAME
10
11Text::Tradition::Language::Latin - language-specific module for Latin
12
13=head1 DESCRIPTION
14
15Implements morphology lookup for French words in context. This module
f8862b58 16depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
5271a011 17
18=head1 SUBROUTINES
19
20=head2 lemmatize( $text )
21
22Evaluates the string using the Flemm package, and returns the results.
23
24=begin testing
25
26use Text::Tradition;
27use_ok( 'Text::Tradition::Language::Latin' );
28
f8862b58 29eval "use Lingua::Morph::Perseus";
5271a011 30my $err = $@;
31
32SKIP: {
f8862b58 33 skip "Package Lingua::Morph::Perseus not found" if $err;
5271a011 34
35 my $trad = Text::Tradition->new(
36 'language' => 'Latin',
37 'file' => 't/data/legendfrag.xml',
38 'input' => 'Self' );
39 $trad->lemmatize();
40 my $ambig = 0;
41 foreach my $r ( $trad->collation->readings ) {
42 next if $r->is_meta;
43 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
44 my @lex = $r->lexemes;
45 my $lexstr = join( '', map { $_->string } @lex );
46 my $textstr = $r->text;
47 $textstr =~ s/\s+//g;
48 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
49 foreach my $l ( @lex ) {
fe77efe0 50 next unless $l->matches;
5271a011 51 next if $l->is_disambiguated;
52 printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
53 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
54 $ambig++;
55 }
56 }
f8862b58 57 is( $ambig, 4, "Found 4 ambiguous forms as expected" );
5271a011 58}
59
60=end testing
61
62=cut
63
64sub lemmatize {
65 my $tradition = shift;
66 my %opts = (
67 'language' => 'Latin',
68 'callback' => sub { _perseus_lookup_tt( @_ ) }
69 );
70 return lemmatize_treetagger( $tradition, %opts );
71}
72
73=head2 reading_lookup( $rdg[, $rdg, ...] )
74
75Looks up one or more readings using the Perseus package, and returns the
76possible results. This skips the tree tagger / tokenizer, returning any
77match for the word string(s) in the morphology DB.
78
79=cut
80
81sub reading_lookup {
82 my @words = @_;
83 return map { _perseus_lookup_str( $_ ) } @words;
84}
85
75ae2b25 86=head2 morphology_tags
87
88Return a data structure describing the available parts of speech and their attributes.
89
90=cut
91
92sub morphology_tags {
75ae2b25 93 return lfs_morph_tags();
94}
95
96
5271a011 97{
98 my $morph;
99
100 sub _morph_connect {
101 unless( $morph ) {
102 try {
f8862b58 103 load 'Lingua::Morph::Perseus';
104 $morph = Lingua::Morph::Perseus->connect( 'Latin' );
5271a011 105 } catch {
f8862b58 106 warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_";
5271a011 107 return;
108 }
109 }
110 }
111
fe77efe0 112 # TODO special case:
113 # passive verbs (-or)
114 # T sapientia -> sapientia
115 # T primus -> unus
116 # T occulta -> occultus (with occulo in next field, hmm...)
117 # T carne -> carnis
118 # T melius -> bonus
119
fe77efe0 120 my %excep = (
121 'absens' => 'absum',
122 'aperte' => 'apertus',
123 'evolvo' => 'exvolvo',
124 'inquiam' => 'inquam',
125 'intelligo' => 'intellego',
126 'itaque' => 'ita',
127 'iuste' => 'iustus',
128 'longe' => 'longus',
129 'male' => 'malus|malum',
130 'multum' => 'multus',
131 'nec' => 'neque',
132 'nos' => 'ego',
133 'occultum' => 'occultus',
134 'peregrinans' => 'peregrinor',
135 'perfectus' => 'perficio',
136 'potius' => 'potis',
137 'praesente' => 'praesens',
138 'prius' => 'prior',
139 'quotidianus' => 'cottidianus',
140 'se' => 'sui',
141 'septem' => 'septimus',
142 'Spiritum' => 'spiritus',
143 'viriliter' => 'virilis', # TODO special case -iter?
144 'vos' => 'tu',
145
146 'datum' => 'do|data|datus',
147 'forte' => 'fors|fortis',
148 'vere' => 'verum|verus',
149 );
150
5271a011 151 sub _perseus_lookup_tt {
152 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
153 _morph_connect();
f8862b58 154 return unless $morph;
5271a011 155 # Discard results that don't match the lemma, unless lemma is unknown
f8862b58 156 my $lookupopts = {};
fe77efe0 157 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
5271a011 158 # TODO Perseus lemma might have a number on the end, yuck.
fe77efe0 159 # multiple lemmata separated with |
160 $lemma =~ s/[^\w|]//g;
161 $lemma = $excep{$lemma} if exists $excep{$lemma};
162 $lemma =~ s/j/i/g;
f8862b58 163 if( $lemma ) { # if we have anything left...
164 my %lems;
165 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
166 $lookupopts->{'lemma'} = [ keys %lems ];
167 }
5271a011 168 }
f8862b58 169 $lookupopts->{'ttpos'} = $pos if $pos;
5271a011 170
f8862b58 171 my $result = $morph->lexicon_lookup( $orig, $lookupopts );
172 # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) {
173 # warn "Filter on $pos / $lemma returned no results; using all results";
174 # }
175 my @ret = @{$result->{'objects'}};
75ae2b25 176 my %unique_wordforms;
5271a011 177 foreach my $obj ( @ret ) {
75ae2b25 178 my $wf = _wordform_from_row( $obj );
179 $unique_wordforms{$wf->to_string} = $wf;
5271a011 180 }
75ae2b25 181 return values( %unique_wordforms );
5271a011 182 }
183
184 sub _perseus_lookup_str {
185 my( $orig ) = @_;
186 _morph_connect();
772edba8 187 return unless $morph;
5271a011 188 # Simple morph DB lookup, and return the results.
189 my $result = $morph->lookup( $orig );
190 return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
191 }
192
5271a011 193}
194
fe77efe0 195sub _wordform_from_row {
196 my( $rowobj ) = @_;
fe77efe0 197 my $lemma = $rowobj->lemma;
198 $lemma =~ s/^(\D+)\d*$/$1/;
199 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
200 'language' => 'Latin',
201 'lemma' => $lemma,
f8862b58 202 'morphology' => $rowobj->morphology,
fe77efe0 203 );
204 return $wf;
205}
206
5271a011 2071;