Commit | Line | Data |
5271a011 |
1 | package Text::Tradition::Language::Latin; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Module::Load; |
75ae2b25 |
6 | use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct |
7 | lfs_morph_tags /; |
5271a011 |
8 | use TryCatch; |
9 | |
10 | =head1 NAME |
11 | |
12 | Text::Tradition::Language::Latin - language-specific module for Latin |
13 | |
14 | =head1 DESCRIPTION |
15 | |
16 | Implements morphology lookup for French words in context. This module |
17 | depends on the Morph::Perseus module for access to PhiloLogic database data. |
18 | It also depends on the TreeTagger software |
19 | (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is |
20 | (for now) expected to be installed in $MORPHDIR/TreeTagger. |
21 | |
22 | =head1 SUBROUTINES |
23 | |
24 | =head2 lemmatize( $text ) |
25 | |
26 | Evaluates the string using the Flemm package, and returns the results. |
27 | |
28 | =begin testing |
29 | |
30 | use Text::Tradition; |
31 | use_ok( 'Text::Tradition::Language::Latin' ); |
32 | |
33 | eval "use Morph::Perseus"; |
34 | my $err = $@; |
35 | |
36 | SKIP: { |
37 | skip "Package Morph::Perseus not found" if $err; |
38 | |
39 | my $trad = Text::Tradition->new( |
40 | 'language' => 'Latin', |
41 | 'file' => 't/data/legendfrag.xml', |
42 | 'input' => 'Self' ); |
43 | $trad->lemmatize(); |
44 | my $ambig = 0; |
45 | foreach my $r ( $trad->collation->readings ) { |
46 | next if $r->is_meta; |
47 | ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); |
48 | my @lex = $r->lexemes; |
49 | my $lexstr = join( '', map { $_->string } @lex ); |
50 | my $textstr = $r->text; |
51 | $textstr =~ s/\s+//g; |
52 | is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); |
53 | foreach my $l ( @lex ) { |
fe77efe0 |
54 | next unless $l->matches; |
5271a011 |
55 | next if $l->is_disambiguated; |
56 | printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, |
57 | join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); |
58 | $ambig++; |
59 | } |
60 | } |
fe77efe0 |
61 | is( $ambig, 7, "Found 7 ambiguous forms as expected" ); |
5271a011 |
62 | } |
63 | |
64 | =end testing |
65 | |
66 | =cut |
67 | |
68 | sub lemmatize { |
69 | my $tradition = shift; |
70 | my %opts = ( |
71 | 'language' => 'Latin', |
72 | 'callback' => sub { _perseus_lookup_tt( @_ ) } |
73 | ); |
74 | return lemmatize_treetagger( $tradition, %opts ); |
75 | } |
76 | |
77 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
78 | |
79 | Looks up one or more readings using the Perseus package, and returns the |
80 | possible results. This skips the tree tagger / tokenizer, returning any |
81 | match for the word string(s) in the morphology DB. |
82 | |
83 | =cut |
84 | |
85 | sub reading_lookup { |
86 | my @words = @_; |
87 | return map { _perseus_lookup_str( $_ ) } @words; |
88 | } |
89 | |
75ae2b25 |
90 | =head2 morphology_tags |
91 | |
92 | Return a data structure describing the available parts of speech and their attributes. |
93 | |
94 | =cut |
95 | |
96 | sub morphology_tags { |
97 | try { |
98 | load 'Morph::Perseus::Structure'; |
99 | } catch { |
100 | warn "Not using Perseus Latin tags"; |
101 | } |
102 | return lfs_morph_tags(); |
103 | } |
104 | |
105 | |
5271a011 |
106 | { |
107 | my $morph; |
108 | |
109 | sub _morph_connect { |
110 | unless( $morph ) { |
111 | try { |
112 | load 'Morph::Perseus'; |
113 | load 'Morph::Perseus::Structure'; |
114 | $morph = Morph::Perseus->connect( 'Latin' ); |
115 | } catch { |
116 | warn "Cannot do Latin word lemmatization without Morph::Perseus: @_"; |
117 | return; |
118 | } |
119 | } |
120 | } |
121 | |
fe77efe0 |
122 | # TODO special case: |
123 | # passive verbs (-or) |
124 | # T sapientia -> sapientia |
125 | # T primus -> unus |
126 | # T occulta -> occultus (with occulo in next field, hmm...) |
127 | # T carne -> carnis |
128 | # T melius -> bonus |
129 | |
130 | |
131 | my %excep = ( |
132 | 'absens' => 'absum', |
133 | 'aperte' => 'apertus', |
134 | 'evolvo' => 'exvolvo', |
135 | 'inquiam' => 'inquam', |
136 | 'intelligo' => 'intellego', |
137 | 'itaque' => 'ita', |
138 | 'iuste' => 'iustus', |
139 | 'longe' => 'longus', |
140 | 'male' => 'malus|malum', |
141 | 'multum' => 'multus', |
142 | 'nec' => 'neque', |
143 | 'nos' => 'ego', |
144 | 'occultum' => 'occultus', |
145 | 'peregrinans' => 'peregrinor', |
146 | 'perfectus' => 'perficio', |
147 | 'potius' => 'potis', |
148 | 'praesente' => 'praesens', |
149 | 'prius' => 'prior', |
150 | 'quotidianus' => 'cottidianus', |
151 | 'se' => 'sui', |
152 | 'septem' => 'septimus', |
153 | 'Spiritum' => 'spiritus', |
154 | 'viriliter' => 'virilis', # TODO special case -iter? |
155 | 'vos' => 'tu', |
156 | |
157 | 'datum' => 'do|data|datus', |
158 | 'forte' => 'fors|fortis', |
159 | 'vere' => 'verum|verus', |
160 | ); |
161 | |
5271a011 |
162 | sub _perseus_lookup_tt { |
163 | my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); |
164 | _morph_connect(); |
772edba8 |
165 | return unlesss $morph; |
5271a011 |
166 | my $result = $morph->lookup( $orig ); |
167 | # Discard results that don't match the lemma, unless lemma is unknown |
fe77efe0 |
168 | my @orig = @{$result->{'objects'}}; |
5271a011 |
169 | my @ret; |
fe77efe0 |
170 | unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) { |
5271a011 |
171 | # TODO Perseus lemma might have a number on the end, yuck. |
fe77efe0 |
172 | # multiple lemmata separated with | |
173 | $lemma =~ s/[^\w|]//g; |
174 | $lemma = $excep{$lemma} if exists $excep{$lemma}; |
175 | $lemma =~ s/j/i/g; |
176 | my %lems; |
177 | my @forms = |
178 | map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); |
179 | # Now match the lemmata from Treetagger to the lemmata and alt_ls |
180 | # from Perseus. |
181 | @ret = grep { |
182 | my $x = $_->lemma; |
183 | $x =~ s/\d+$//; |
184 | my $y = $_->alt_lex || ''; |
185 | $y =~ s/\d+$//; |
186 | $lems{$x} || $lems{$y}; |
187 | } @orig; |
5271a011 |
188 | warn "TreeTagger lemma $lemma matched no results from Perseus for $orig" |
fe77efe0 |
189 | if @orig && !@ret; |
5271a011 |
190 | } |
fe77efe0 |
191 | @ret = @orig unless @ret; |
5271a011 |
192 | |
75ae2b25 |
193 | my %unique_wordforms; |
5271a011 |
194 | foreach my $obj ( @ret ) { |
75ae2b25 |
195 | my $wf = _wordform_from_row( $obj ); |
196 | $unique_wordforms{$wf->to_string} = $wf; |
5271a011 |
197 | } |
198 | ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet |
fe77efe0 |
199 | # Discard results that don't match the given TreeTagger POS, unless |
200 | # that leaves zero results |
5271a011 |
201 | # my $ttstruct = treetagger_struct( $pos ); |
202 | # my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms; |
203 | # unless( @ttmatch ) { |
204 | # warn "TreeTagger POS $pos matched no results from Perseus for $orig"; |
205 | # @ttmatch = @wordforms; |
206 | # } |
207 | # return @ttmatch; |
75ae2b25 |
208 | return values( %unique_wordforms ); |
5271a011 |
209 | } |
210 | |
211 | sub _perseus_lookup_str { |
212 | my( $orig ) = @_; |
213 | _morph_connect(); |
772edba8 |
214 | return unless $morph; |
5271a011 |
215 | # Simple morph DB lookup, and return the results. |
216 | my $result = $morph->lookup( $orig ); |
217 | return map { _wordform_from_row( $_ ) } @{$result->{'objects'}}; |
218 | } |
219 | |
5271a011 |
220 | } |
221 | |
fe77efe0 |
222 | sub _wordform_from_row { |
223 | my( $rowobj ) = @_; |
224 | my $mpstruct; |
225 | try { |
772edba8 |
226 | # M::P::St will be loaded already if we got here |
fe77efe0 |
227 | $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code ); |
228 | } catch { |
229 | warn "Could not create morphology structure from " |
230 | . $rowobj->code . ": $!"; |
231 | } |
fe77efe0 |
232 | my $lemma = $rowobj->lemma; |
233 | $lemma =~ s/^(\D+)\d*$/$1/; |
234 | my $wf = Text::Tradition::Collation::Reading::WordForm->new( |
235 | 'language' => 'Latin', |
236 | 'lemma' => $lemma, |
237 | 'morphology' => $mpstruct, |
238 | ); |
239 | return $wf; |
240 | } |
241 | |
5271a011 |
242 | 1; |