refine some special case weirdness for Perseus
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Latin.pm
CommitLineData
5271a011 1package Text::Tradition::Language::Latin;
2
3use strict;
4use warnings;
5use Module::Load;
6use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct /;
7use TryCatch;
8
9=head1 NAME
10
11Text::Tradition::Language::Latin - language-specific module for Latin
12
13=head1 DESCRIPTION
14
15Implements morphology lookup for French words in context. This module
16depends on the Morph::Perseus module for access to PhiloLogic database data.
17It also depends on the TreeTagger software
18(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
19(for now) expected to be installed in $MORPHDIR/TreeTagger.
20
21=head1 SUBROUTINES
22
23=head2 lemmatize( $text )
24
25Evaluates the string using the Flemm package, and returns the results.
26
27=begin testing
28
29use Text::Tradition;
30use_ok( 'Text::Tradition::Language::Latin' );
31
32eval "use Morph::Perseus";
33my $err = $@;
34
35SKIP: {
36 skip "Package Morph::Perseus not found" if $err;
37
38 my $trad = Text::Tradition->new(
39 'language' => 'Latin',
40 'file' => 't/data/legendfrag.xml',
41 'input' => 'Self' );
42 $trad->lemmatize();
43 my $ambig = 0;
44 foreach my $r ( $trad->collation->readings ) {
45 next if $r->is_meta;
46 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
47 my @lex = $r->lexemes;
48 my $lexstr = join( '', map { $_->string } @lex );
49 my $textstr = $r->text;
50 $textstr =~ s/\s+//g;
51 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
52 foreach my $l ( @lex ) {
fe77efe0 53 next unless $l->matches;
5271a011 54 next if $l->is_disambiguated;
55 printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
56 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
57 $ambig++;
58 }
59 }
fe77efe0 60 is( $ambig, 7, "Found 7 ambiguous forms as expected" );
5271a011 61}
62
63=end testing
64
65=cut
66
67sub lemmatize {
68 my $tradition = shift;
69 my %opts = (
70 'language' => 'Latin',
71 'callback' => sub { _perseus_lookup_tt( @_ ) }
72 );
73 return lemmatize_treetagger( $tradition, %opts );
74}
75
76=head2 reading_lookup( $rdg[, $rdg, ...] )
77
78Looks up one or more readings using the Perseus package, and returns the
79possible results. This skips the tree tagger / tokenizer, returning any
80match for the word string(s) in the morphology DB.
81
82=cut
83
84sub reading_lookup {
85 my @words = @_;
86 return map { _perseus_lookup_str( $_ ) } @words;
87}
88
89{
90 my $morph;
91
92 sub _morph_connect {
93 unless( $morph ) {
94 try {
95 load 'Morph::Perseus';
96 load 'Morph::Perseus::Structure';
97 $morph = Morph::Perseus->connect( 'Latin' );
98 } catch {
99 warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
100 return;
101 }
102 }
103 }
104
fe77efe0 105 # TODO special case:
106 # passive verbs (-or)
107 # T sapientia -> sapientia
108 # T primus -> unus
109 # T occulta -> occultus (with occulo in next field, hmm...)
110 # T carne -> carnis
111 # T melius -> bonus
112
113
114 my %excep = (
115 'absens' => 'absum',
116 'aperte' => 'apertus',
117 'evolvo' => 'exvolvo',
118 'inquiam' => 'inquam',
119 'intelligo' => 'intellego',
120 'itaque' => 'ita',
121 'iuste' => 'iustus',
122 'longe' => 'longus',
123 'male' => 'malus|malum',
124 'multum' => 'multus',
125 'nec' => 'neque',
126 'nos' => 'ego',
127 'occultum' => 'occultus',
128 'peregrinans' => 'peregrinor',
129 'perfectus' => 'perficio',
130 'potius' => 'potis',
131 'praesente' => 'praesens',
132 'prius' => 'prior',
133 'quotidianus' => 'cottidianus',
134 'se' => 'sui',
135 'septem' => 'septimus',
136 'Spiritum' => 'spiritus',
137 'viriliter' => 'virilis', # TODO special case -iter?
138 'vos' => 'tu',
139
140 'datum' => 'do|data|datus',
141 'forte' => 'fors|fortis',
142 'vere' => 'verum|verus',
143 );
144
5271a011 145 sub _perseus_lookup_tt {
146 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
147 _morph_connect();
148 my $result = $morph->lookup( $orig );
149 # Discard results that don't match the lemma, unless lemma is unknown
fe77efe0 150 my @orig = @{$result->{'objects'}};
5271a011 151 my @ret;
fe77efe0 152 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
5271a011 153 # TODO Perseus lemma might have a number on the end, yuck.
fe77efe0 154 # multiple lemmata separated with |
155 $lemma =~ s/[^\w|]//g;
156 $lemma = $excep{$lemma} if exists $excep{$lemma};
157 $lemma =~ s/j/i/g;
158 my %lems;
159 my @forms =
160 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
161 # Now match the lemmata from Treetagger to the lemmata and alt_ls
162 # from Perseus.
163 @ret = grep {
164 my $x = $_->lemma;
165 $x =~ s/\d+$//;
166 my $y = $_->alt_lex || '';
167 $y =~ s/\d+$//;
168 $lems{$x} || $lems{$y};
169 } @orig;
5271a011 170 warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
fe77efe0 171 if @orig && !@ret;
5271a011 172 }
fe77efe0 173 @ret = @orig unless @ret;
5271a011 174
5271a011 175 my @wordforms;
176 foreach my $obj ( @ret ) {
177 push( @wordforms, _wordform_from_row( $obj ) );
178 }
179 ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
fe77efe0 180 # Discard results that don't match the given TreeTagger POS, unless
181 # that leaves zero results
5271a011 182# my $ttstruct = treetagger_struct( $pos );
183# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
184# unless( @ttmatch ) {
185# warn "TreeTagger POS $pos matched no results from Perseus for $orig";
186# @ttmatch = @wordforms;
187# }
188# return @ttmatch;
189 return @wordforms;
190 }
191
192 sub _perseus_lookup_str {
193 my( $orig ) = @_;
194 _morph_connect();
195 # Simple morph DB lookup, and return the results.
196 my $result = $morph->lookup( $orig );
197 return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
198 }
199
5271a011 200}
201
fe77efe0 202sub _wordform_from_row {
203 my( $rowobj ) = @_;
204 my $mpstruct;
205 try {
206 $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
207 } catch {
208 warn "Could not create morphology structure from "
209 . $rowobj->code . ": $!";
210 }
211 $DB::single = 1 unless $mpstruct;
212 my $lemma = $rowobj->lemma;
213 $lemma =~ s/^(\D+)\d*$/$1/;
214 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
215 'language' => 'Latin',
216 'lemma' => $lemma,
217 'morphology' => $mpstruct,
218 );
219 return $wf;
220}
221
5271a011 2221;