1 package Text::Tradition::Language::Latin;
6 use Morph::Perseus::Structure;
7 use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct
13 Text::Tradition::Language::Latin - language-specific module for Latin
17 Implements morphology lookup for Latin words in context. This module
18 depends on the Morph::Perseus module for access to PhiloLogic database data.
19 It also depends on the TreeTagger software
20 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
21 (for now) expected to be installed in $MORPHDIR/TreeTagger.
25 =head2 lemmatize( $text )
27 Evaluates the text using TreeTagger, looks up each tagged word in the Perseus morphology database, and returns the results.
32 use_ok( 'Text::Tradition::Language::Latin' );
34 eval "use Morph::Perseus";
38 skip "Package Morph::Perseus not found" if $err;
40 my $trad = Text::Tradition->new(
41 'language' => 'Latin',
42 'file' => 't/data/legendfrag.xml',
46 foreach my $r ( $trad->collation->readings ) {
48 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
49 my @lex = $r->lexemes;
50 my $lexstr = join( '', map { $_->string } @lex );
51 my $textstr = $r->text;
53 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
54 foreach my $l ( @lex ) {
55 next unless $l->matches;
56 next if $l->is_disambiguated;
57 printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
58 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
62 is( $ambig, 7, "Found 7 ambiguous forms as expected" );
70 my $tradition = shift;
72 'language' => 'Latin',
73 'callback' => sub { _perseus_lookup_tt( @_ ) }
75 return lemmatize_treetagger( $tradition, %opts );
78 =head2 reading_lookup( $rdg[, $rdg, ...] )
80 Looks up one or more readings using the Perseus package, and returns the
81 possible results. This skips the tree tagger / tokenizer, returning any
82 match for the word string(s) in the morphology DB.
88 return map { _perseus_lookup_str( $_ ) } @words;
91 =head2 morphology_tags
93 Return a data structure describing the available parts of speech and their attributes.
99 load 'Morph::Perseus::Structure';
101 warn "Not using Perseus Latin tags";
103 return lfs_morph_tags();
113 load 'Morph::Perseus';
114 load 'Morph::Perseus::Structure';
115 $morph = Morph::Perseus->connect( 'Latin' );
117 warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
124 # passive verbs (-or)
125 # T sapientia -> sapientia
127 # T occulta -> occultus (with occulo in next field, hmm...)
134 'aperte' => 'apertus',
135 'evolvo' => 'exvolvo',
136 'inquiam' => 'inquam',
137 'intelligo' => 'intellego',
141 'male' => 'malus|malum',
142 'multum' => 'multus',
145 'occultum' => 'occultus',
146 'peregrinans' => 'peregrinor',
147 'perfectus' => 'perficio',
149 'praesente' => 'praesens',
151 'quotidianus' => 'cottidianus',
153 'septem' => 'septimus',
154 'Spiritum' => 'spiritus',
155 'viriliter' => 'virilis', # TODO special case -iter?
158 'datum' => 'do|data|datus',
159 'forte' => 'fors|fortis',
160 'vere' => 'verum|verus',
163 sub _perseus_lookup_tt {
164 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
166 my $result = $morph->lookup( $orig );
167 # Discard results that don't match the lemma, unless lemma is unknown
168 my @orig = @{$result->{'objects'}};
170 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
171 # TODO Perseus lemma might have a number on the end, yuck.
172 # multiple lemmata separated with |
173 $lemma =~ s/[^\w|]//g;
174 $lemma = $excep{$lemma} if exists $excep{$lemma};
178 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
179 # Now match the lemmata from Treetagger to the lemmata and alt_ls
184 my $y = $_->alt_lex || '';
186 $lems{$x} || $lems{$y};
188 warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
191 @ret = @orig unless @ret;
193 my %unique_wordforms;
194 foreach my $obj ( @ret ) {
195 my $wf = _wordform_from_row( $obj );
196 $unique_wordforms{$wf->to_string} = $wf;
198 ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
199 # Discard results that don't match the given TreeTagger POS, unless
200 # that leaves zero results
201 # my $ttstruct = treetagger_struct( $pos );
202 # my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
203 # unless( @ttmatch ) {
204 # warn "TreeTagger POS $pos matched no results from Perseus for $orig";
205 # @ttmatch = @wordforms;
208 return values( %unique_wordforms );
211 sub _perseus_lookup_str {
214 # Simple morph DB lookup, and return the results.
215 my $result = $morph->lookup( $orig );
216 return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
221 sub _wordform_from_row {
225 $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
227 warn "Could not create morphology structure from "
228 . $rowobj->code . ": $!";
230 my $lemma = $rowobj->lemma;
231 $lemma =~ s/^(\D+)\d*$/$1/;
232 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
233 'language' => 'Latin',
235 'morphology' => $mpstruct,