remove unconditional dep on Morph::Perseus
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Latin.pm
CommitLineData
5271a011 1package Text::Tradition::Language::Latin;
2
3use strict;
4use warnings;
5use Module::Load;
75ae2b25 6use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct
7 lfs_morph_tags /;
5271a011 8use TryCatch;
9
10=head1 NAME
11
12Text::Tradition::Language::Latin - language-specific module for Latin
13
14=head1 DESCRIPTION
15
16Implements morphology lookup for French words in context. This module
17depends on the Morph::Perseus module for access to PhiloLogic database data.
18It also depends on the TreeTagger software
19(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
20(for now) expected to be installed in $MORPHDIR/TreeTagger.
21
22=head1 SUBROUTINES
23
24=head2 lemmatize( $text )
25
26Evaluates the string using the Flemm package, and returns the results.
27
28=begin testing
29
30use Text::Tradition;
31use_ok( 'Text::Tradition::Language::Latin' );
32
33eval "use Morph::Perseus";
34my $err = $@;
35
36SKIP: {
37 skip "Package Morph::Perseus not found" if $err;
38
39 my $trad = Text::Tradition->new(
40 'language' => 'Latin',
41 'file' => 't/data/legendfrag.xml',
42 'input' => 'Self' );
43 $trad->lemmatize();
44 my $ambig = 0;
45 foreach my $r ( $trad->collation->readings ) {
46 next if $r->is_meta;
47 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
48 my @lex = $r->lexemes;
49 my $lexstr = join( '', map { $_->string } @lex );
50 my $textstr = $r->text;
51 $textstr =~ s/\s+//g;
52 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
53 foreach my $l ( @lex ) {
fe77efe0 54 next unless $l->matches;
5271a011 55 next if $l->is_disambiguated;
56 printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
57 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
58 $ambig++;
59 }
60 }
fe77efe0 61 is( $ambig, 7, "Found 7 ambiguous forms as expected" );
5271a011 62}
63
64=end testing
65
66=cut
67
68sub lemmatize {
69 my $tradition = shift;
70 my %opts = (
71 'language' => 'Latin',
72 'callback' => sub { _perseus_lookup_tt( @_ ) }
73 );
74 return lemmatize_treetagger( $tradition, %opts );
75}
76
77=head2 reading_lookup( $rdg[, $rdg, ...] )
78
79Looks up one or more readings using the Perseus package, and returns the
80possible results. This skips the tree tagger / tokenizer, returning any
81match for the word string(s) in the morphology DB.
82
83=cut
84
85sub reading_lookup {
86 my @words = @_;
87 return map { _perseus_lookup_str( $_ ) } @words;
88}
89
75ae2b25 90=head2 morphology_tags
91
92Return a data structure describing the available parts of speech and their attributes.
93
94=cut
95
96sub morphology_tags {
97 try {
98 load 'Morph::Perseus::Structure';
99 } catch {
100 warn "Not using Perseus Latin tags";
101 }
102 return lfs_morph_tags();
103}
104
105
5271a011 106{
107 my $morph;
108
109 sub _morph_connect {
110 unless( $morph ) {
111 try {
112 load 'Morph::Perseus';
113 load 'Morph::Perseus::Structure';
114 $morph = Morph::Perseus->connect( 'Latin' );
115 } catch {
116 warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
117 return;
118 }
119 }
120 }
121
fe77efe0 122 # TODO special case:
123 # passive verbs (-or)
124 # T sapientia -> sapientia
125 # T primus -> unus
126 # T occulta -> occultus (with occulo in next field, hmm...)
127 # T carne -> carnis
128 # T melius -> bonus
129
130
131 my %excep = (
132 'absens' => 'absum',
133 'aperte' => 'apertus',
134 'evolvo' => 'exvolvo',
135 'inquiam' => 'inquam',
136 'intelligo' => 'intellego',
137 'itaque' => 'ita',
138 'iuste' => 'iustus',
139 'longe' => 'longus',
140 'male' => 'malus|malum',
141 'multum' => 'multus',
142 'nec' => 'neque',
143 'nos' => 'ego',
144 'occultum' => 'occultus',
145 'peregrinans' => 'peregrinor',
146 'perfectus' => 'perficio',
147 'potius' => 'potis',
148 'praesente' => 'praesens',
149 'prius' => 'prior',
150 'quotidianus' => 'cottidianus',
151 'se' => 'sui',
152 'septem' => 'septimus',
153 'Spiritum' => 'spiritus',
154 'viriliter' => 'virilis', # TODO special case -iter?
155 'vos' => 'tu',
156
157 'datum' => 'do|data|datus',
158 'forte' => 'fors|fortis',
159 'vere' => 'verum|verus',
160 );
161
5271a011 162 sub _perseus_lookup_tt {
163 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
164 _morph_connect();
772edba8 165 return unlesss $morph;
5271a011 166 my $result = $morph->lookup( $orig );
167 # Discard results that don't match the lemma, unless lemma is unknown
fe77efe0 168 my @orig = @{$result->{'objects'}};
5271a011 169 my @ret;
fe77efe0 170 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
5271a011 171 # TODO Perseus lemma might have a number on the end, yuck.
fe77efe0 172 # multiple lemmata separated with |
173 $lemma =~ s/[^\w|]//g;
174 $lemma = $excep{$lemma} if exists $excep{$lemma};
175 $lemma =~ s/j/i/g;
176 my %lems;
177 my @forms =
178 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
179 # Now match the lemmata from Treetagger to the lemmata and alt_ls
180 # from Perseus.
181 @ret = grep {
182 my $x = $_->lemma;
183 $x =~ s/\d+$//;
184 my $y = $_->alt_lex || '';
185 $y =~ s/\d+$//;
186 $lems{$x} || $lems{$y};
187 } @orig;
5271a011 188 warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
fe77efe0 189 if @orig && !@ret;
5271a011 190 }
fe77efe0 191 @ret = @orig unless @ret;
5271a011 192
75ae2b25 193 my %unique_wordforms;
5271a011 194 foreach my $obj ( @ret ) {
75ae2b25 195 my $wf = _wordform_from_row( $obj );
196 $unique_wordforms{$wf->to_string} = $wf;
5271a011 197 }
198 ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
fe77efe0 199 # Discard results that don't match the given TreeTagger POS, unless
200 # that leaves zero results
5271a011 201# my $ttstruct = treetagger_struct( $pos );
202# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
203# unless( @ttmatch ) {
204# warn "TreeTagger POS $pos matched no results from Perseus for $orig";
205# @ttmatch = @wordforms;
206# }
207# return @ttmatch;
75ae2b25 208 return values( %unique_wordforms );
5271a011 209 }
210
211 sub _perseus_lookup_str {
212 my( $orig ) = @_;
213 _morph_connect();
772edba8 214 return unless $morph;
5271a011 215 # Simple morph DB lookup, and return the results.
216 my $result = $morph->lookup( $orig );
217 return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
218 }
219
5271a011 220}
221
fe77efe0 222sub _wordform_from_row {
223 my( $rowobj ) = @_;
224 my $mpstruct;
225 try {
772edba8 226 # M::P::St will be loaded already if we got here
fe77efe0 227 $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
228 } catch {
229 warn "Could not create morphology structure from "
230 . $rowobj->code . ": $!";
231 }
fe77efe0 232 my $lemma = $rowobj->lemma;
233 $lemma =~ s/^(\D+)\d*$/$1/;
234 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
235 'language' => 'Latin',
236 'lemma' => $lemma,
237 'morphology' => $mpstruct,
238 );
239 return $wf;
240}
241
5271a011 2421;