add morphology support for Latin, dependent on Perseus morphology
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Latin.pm
CommitLineData
5271a011 1package Text::Tradition::Language::Latin;
2
3use strict;
4use warnings;
5use Module::Load;
6use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct /;
7use TryCatch;
8
9=head1 NAME
10
11Text::Tradition::Language::Latin - language-specific module for Latin
12
13=head1 DESCRIPTION
14
15Implements morphology lookup for French words in context. This module
16depends on the Morph::Perseus module for access to PhiloLogic database data.
17It also depends on the TreeTagger software
18(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
19(for now) expected to be installed in $MORPHDIR/TreeTagger.
20
21=head1 SUBROUTINES
22
23=head2 lemmatize( $text )
24
25Evaluates the string using the Flemm package, and returns the results.
26
27=begin testing
28
29use Text::Tradition;
30use_ok( 'Text::Tradition::Language::Latin' );
31
32eval "use Morph::Perseus";
33my $err = $@;
34
35SKIP: {
36 skip "Package Morph::Perseus not found" if $err;
37
38 my $trad = Text::Tradition->new(
39 'language' => 'Latin',
40 'file' => 't/data/legendfrag.xml',
41 'input' => 'Self' );
42 $trad->lemmatize();
43 my $ambig = 0;
44 foreach my $r ( $trad->collation->readings ) {
45 next if $r->is_meta;
46 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
47 my @lex = $r->lexemes;
48 my $lexstr = join( '', map { $_->string } @lex );
49 my $textstr = $r->text;
50 $textstr =~ s/\s+//g;
51 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
52 foreach my $l ( @lex ) {
53 next if $l->is_disambiguated;
54 printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
55 join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
56 $ambig++;
57 }
58 }
59 is( $ambig, 19, "Found 19 ambiguous forms as expected" );
60}
61
62=end testing
63
64=cut
65
66sub lemmatize {
67 my $tradition = shift;
68 my %opts = (
69 'language' => 'Latin',
70 'callback' => sub { _perseus_lookup_tt( @_ ) }
71 );
72 return lemmatize_treetagger( $tradition, %opts );
73}
74
75=head2 reading_lookup( $rdg[, $rdg, ...] )
76
77Looks up one or more readings using the Perseus package, and returns the
78possible results. This skips the tree tagger / tokenizer, returning any
79match for the word string(s) in the morphology DB.
80
81=cut
82
83sub reading_lookup {
84 my @words = @_;
85 return map { _perseus_lookup_str( $_ ) } @words;
86}
87
88{
89 my $morph;
90
91 sub _morph_connect {
92 unless( $morph ) {
93 try {
94 load 'Morph::Perseus';
95 load 'Morph::Perseus::Structure';
96 $morph = Morph::Perseus->connect( 'Latin' );
97 } catch {
98 warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
99 return;
100 }
101 }
102 }
103
104 sub _perseus_lookup_tt {
105 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
106 _morph_connect();
107 my $result = $morph->lookup( $orig );
108 # Discard results that don't match the lemma, unless lemma is unknown
109 my @ret;
110 unless( $lemma eq '<unknown>' ) {
111 # TODO Perseus lemma might have a number on the end, yuck.
112 @ret = grep { $_->lemma =~ /^$lemma(\d*)$/ } @{$result->{'objects'}};
113 }
114 unless( @ret ) {
115 @ret = @{$result->{'objects'}};
116 warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
117 if @ret;
118 }
119
120 # Discard results that don't match the given TreeTagger POS, unless
121 # that leaves zero results
122 my @wordforms;
123 foreach my $obj ( @ret ) {
124 push( @wordforms, _wordform_from_row( $obj ) );
125 }
126 ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
127# my $ttstruct = treetagger_struct( $pos );
128# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
129# unless( @ttmatch ) {
130# warn "TreeTagger POS $pos matched no results from Perseus for $orig";
131# @ttmatch = @wordforms;
132# }
133# return @ttmatch;
134 return @wordforms;
135 }
136
137 sub _perseus_lookup_str {
138 my( $orig ) = @_;
139 _morph_connect();
140 # Simple morph DB lookup, and return the results.
141 my $result = $morph->lookup( $orig );
142 return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
143 }
144
145 sub _wordform_from_row {
146 my( $rowobj ) = @_;
147 my $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
148 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
149 'language' => 'Latin',
150 'lemma' => $rowobj->lemma,
151 'morphology' => $mpstruct,
152 );
153 return $wf;
154 }
155
156}
157
1581;