Commit | Line | Data |
5271a011 |
1 | package Text::Tradition::Language::Latin; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Module::Load; |
6 | use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct /; |
7 | use TryCatch; |
8 | |
9 | =head1 NAME |
10 | |
11 | Text::Tradition::Language::Latin - language-specific module for Latin |
12 | |
13 | =head1 DESCRIPTION |
14 | |
15 | Implements morphology lookup for French words in context. This module |
16 | depends on the Morph::Perseus module for access to PhiloLogic database data. |
17 | It also depends on the TreeTagger software |
18 | (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is |
19 | (for now) expected to be installed in $MORPHDIR/TreeTagger. |
20 | |
21 | =head1 SUBROUTINES |
22 | |
23 | =head2 lemmatize( $text ) |
24 | |
25 | Evaluates the string using the Flemm package, and returns the results. |
26 | |
27 | =begin testing |
28 | |
29 | use Text::Tradition; |
30 | use_ok( 'Text::Tradition::Language::Latin' ); |
31 | |
32 | eval "use Morph::Perseus"; |
33 | my $err = $@; |
34 | |
35 | SKIP: { |
36 | skip "Package Morph::Perseus not found" if $err; |
37 | |
38 | my $trad = Text::Tradition->new( |
39 | 'language' => 'Latin', |
40 | 'file' => 't/data/legendfrag.xml', |
41 | 'input' => 'Self' ); |
42 | $trad->lemmatize(); |
43 | my $ambig = 0; |
44 | foreach my $r ( $trad->collation->readings ) { |
45 | next if $r->is_meta; |
46 | ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); |
47 | my @lex = $r->lexemes; |
48 | my $lexstr = join( '', map { $_->string } @lex ); |
49 | my $textstr = $r->text; |
50 | $textstr =~ s/\s+//g; |
51 | is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); |
52 | foreach my $l ( @lex ) { |
53 | next if $l->is_disambiguated; |
54 | printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, |
55 | join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); |
56 | $ambig++; |
57 | } |
58 | } |
59 | is( $ambig, 19, "Found 19 ambiguous forms as expected" ); |
60 | } |
61 | |
62 | =end testing |
63 | |
64 | =cut |
65 | |
66 | sub lemmatize { |
67 | my $tradition = shift; |
68 | my %opts = ( |
69 | 'language' => 'Latin', |
70 | 'callback' => sub { _perseus_lookup_tt( @_ ) } |
71 | ); |
72 | return lemmatize_treetagger( $tradition, %opts ); |
73 | } |
74 | |
75 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
76 | |
77 | Looks up one or more readings using the Perseus package, and returns the |
78 | possible results. This skips the tree tagger / tokenizer, returning any |
79 | match for the word string(s) in the morphology DB. |
80 | |
81 | =cut |
82 | |
83 | sub reading_lookup { |
84 | my @words = @_; |
85 | return map { _perseus_lookup_str( $_ ) } @words; |
86 | } |
87 | |
88 | { |
89 | my $morph; |
90 | |
91 | sub _morph_connect { |
92 | unless( $morph ) { |
93 | try { |
94 | load 'Morph::Perseus'; |
95 | load 'Morph::Perseus::Structure'; |
96 | $morph = Morph::Perseus->connect( 'Latin' ); |
97 | } catch { |
98 | warn "Cannot do Latin word lemmatization without Morph::Perseus: @_"; |
99 | return; |
100 | } |
101 | } |
102 | } |
103 | |
104 | sub _perseus_lookup_tt { |
105 | my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); |
106 | _morph_connect(); |
107 | my $result = $morph->lookup( $orig ); |
108 | # Discard results that don't match the lemma, unless lemma is unknown |
109 | my @ret; |
110 | unless( $lemma eq '<unknown>' ) { |
111 | # TODO Perseus lemma might have a number on the end, yuck. |
112 | @ret = grep { $_->lemma =~ /^$lemma(\d*)$/ } @{$result->{'objects'}}; |
113 | } |
114 | unless( @ret ) { |
115 | @ret = @{$result->{'objects'}}; |
116 | warn "TreeTagger lemma $lemma matched no results from Perseus for $orig" |
117 | if @ret; |
118 | } |
119 | |
120 | # Discard results that don't match the given TreeTagger POS, unless |
121 | # that leaves zero results |
122 | my @wordforms; |
123 | foreach my $obj ( @ret ) { |
124 | push( @wordforms, _wordform_from_row( $obj ) ); |
125 | } |
126 | ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet |
127 | # my $ttstruct = treetagger_struct( $pos ); |
128 | # my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms; |
129 | # unless( @ttmatch ) { |
130 | # warn "TreeTagger POS $pos matched no results from Perseus for $orig"; |
131 | # @ttmatch = @wordforms; |
132 | # } |
133 | # return @ttmatch; |
134 | return @wordforms; |
135 | } |
136 | |
137 | sub _perseus_lookup_str { |
138 | my( $orig ) = @_; |
139 | _morph_connect(); |
140 | # Simple morph DB lookup, and return the results. |
141 | my $result = $morph->lookup( $orig ); |
142 | return map { _wordform_from_row( $_ ) } @{$result->{'objects'}}; |
143 | } |
144 | |
145 | sub _wordform_from_row { |
146 | my( $rowobj ) = @_; |
147 | my $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code ); |
148 | my $wf = Text::Tradition::Collation::Reading::WordForm->new( |
149 | 'language' => 'Latin', |
150 | 'lemma' => $rowobj->lemma, |
151 | 'morphology' => $mpstruct, |
152 | ); |
153 | return $wf; |
154 | } |
155 | |
156 | } |
157 | |
158 | 1; |