Commit | Line | Data |
f4b6b4d0 |
1 | package Text::Tradition::Language::English; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Encode qw/ encode_utf8 decode_utf8 /; |
6 | use IPC::Run qw/ run /; |
7 | use Lingua::TagSet::TreeTagger; |
8 | use Module::Load; |
9 | use Text::Tradition::Collation::Reading::Lexeme; |
10 | use Text::Tradition::Collation::Reading::WordForm; |
11 | use TryCatch; |
12 | |
13 | my $MORPHDIR = '/Users/tla/Projects/morphology'; |
14 | |
15 | =head1 NAME |
16 | |
17 | Text::Tradition::Language::English - language-specific module for English |
18 | |
19 | =head1 DESCRIPTION |
20 | |
21 | Implements morphology lookup for English words in context. This module |
22 | depends on the TreeTagger software |
23 | (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is |
24 | (for now) expected to be installed in $MORPHDIR/TreeTagger. |
25 | |
26 | =head1 SUBROUTINES |
27 | |
28 | =head2 lemmatize( $text ) |
29 | |
30 | Evaluates the string using the TreeTagger, and returns the results. |
31 | |
32 | =begin testing |
33 | |
34 | binmode STDOUT, ':utf8'; |
35 | use Text::Tradition; |
36 | use_ok( 'Text::Tradition::Language::English' ); |
37 | |
38 | =end testing |
39 | |
40 | =cut |
41 | |
42 | sub lemmatize { |
43 | my $tradition = shift; |
44 | |
45 | # Given a tradition, lemmatize it witness by witness and see what we get. |
46 | my $workdir = File::Temp->newdir(); |
47 | my $c = $tradition->collation; |
48 | # First, clear out all existing lexemes from the readings. Save the |
49 | # path as long as we went to the trouble of generating it. |
50 | my %witness_paths; |
51 | foreach my $wit ( $tradition->witnesses ) { |
52 | my @sigla = ( $wit->sigil ); |
53 | push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; |
54 | foreach my $sig ( @sigla ) { |
55 | my @path = grep { !$_->is_meta } |
56 | $c->reading_sequence( $c->start, $c->end, $sig ); |
57 | map { $_->clear_lexemes } @path; |
58 | $witness_paths{$sig} = \@path; |
59 | } |
60 | } |
61 | |
62 | foreach my $sig ( keys %witness_paths ) { |
63 | # Get the text as a sequence of readings and as a string |
64 | print STDERR "Morphologizing witness $sig\n"; |
65 | _lemmatize_sequence( undef, @{$witness_paths{$sig}} ); |
66 | } |
67 | } |
68 | |
69 | sub _update_reading_lexemes { |
70 | my( $replace, $reading, @lexemes ) = @_; |
71 | if( $reading->has_lexemes && !$replace ) { |
72 | # We need to merge what is in @lexemes with what we have already. |
73 | my @oldlex = $reading->lexemes; |
74 | my $cmp1 = join( '||', map { $_->string } @oldlex ); |
75 | my $cmp2 = join( '||', map { $_->string } @lexemes ); |
76 | if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { |
77 | # The lexeme strings are the same, so merge the possible |
78 | # word forms from new to old. |
79 | foreach my $i ( 0 .. $#lexemes ) { |
80 | my $ol = $oldlex[$i]; |
81 | my $nl = $lexemes[$i]; |
82 | my %ofw; |
83 | map { $ofw{$_->to_string} = 1 } $ol->matching_forms; |
84 | foreach my $form ( $nl->matching_forms ) { |
85 | unless( $ofw{$form->to_string} ) { |
86 | print STDERR "Adding form " . $form->to_string . |
87 | " to lexeme " . $nl->string . " at $reading\n"; |
88 | $ol->add_matching_form( $form ); |
89 | $ol->is_disambiguated(0); |
90 | } |
91 | } |
92 | } |
93 | } else { |
94 | warn "Lexeme layout for $reading changed; replacing the lot"; |
95 | $reading->clear_lexemes; |
96 | $reading->add_lexeme( @lexemes ); |
97 | } |
98 | } else { |
99 | $reading->clear_lexemes if $replace; |
100 | $reading->add_lexeme( @lexemes ); |
101 | } |
102 | } |
103 | |
104 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
105 | |
106 | Looks up one or more readings using the Flemm package, and returns the |
107 | possible results. This uses the same logic as L<lemmatize> above for the |
108 | entire tradition, but can also be used to (re-)analyze individual readings. |
109 | |
110 | =cut |
111 | |
112 | sub reading_lookup { |
113 | return _lemmatize_sequence( 1, @_ ); |
114 | } |
115 | |
116 | sub _lemmatize_sequence { |
117 | my( $replace, @path ) = @_; |
118 | my $tagresult = _treetag_string( _text_from_path( 1, @path ) ); |
119 | if( $tagresult ) { |
120 | # Map the tagged words onto the original readings, splitting |
121 | # them up into lexemes where necessary. |
122 | # NOTE we can have multiple lexemes in a reading, but not |
123 | # multiple readings to a lexeme. |
124 | my @tags = split( /\n/, $tagresult ); |
125 | my @lexemes; |
126 | my $curr_rdg = shift @path; |
127 | my @curr_lexemes; |
128 | my $unused_rdg_part; |
129 | foreach my $tag ( @tags ) { |
130 | # Get the original word |
131 | my( $lexeme, @rest ) = split( /\t/, $tag ); |
132 | # Lemmatize the whole |
133 | my @forms = _parse_wordform( $tag ); |
134 | my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( |
135 | 'string' => $lexeme, 'language' => 'English', |
136 | 'wordform_matchlist' => \@forms ); |
137 | # Find the next non-meta reading |
138 | while( $curr_rdg && $curr_rdg->is_meta ) { |
139 | $curr_rdg = shift @path; |
140 | } |
141 | unless( $curr_rdg ) { |
142 | warn "Ran out of readings in sequence at $lexeme"; |
143 | last; |
144 | } |
145 | my $curr_rdg_text = $curr_rdg->has_normal_form |
146 | ? $curr_rdg->normal_form : $curr_rdg->text; |
147 | if( $unused_rdg_part && |
148 | $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
149 | # Nth part of curr_rdg |
150 | $unused_rdg_part = $2; |
151 | push( @curr_lexemes, $lexobj ); |
152 | } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
153 | # Flag an error if there is already an unused reading part. |
154 | warn "Skipped over unused text $unused_rdg_part at $curr_rdg" |
155 | if $unused_rdg_part; |
156 | $unused_rdg_part = $2; # will be empty if the whole reading matched |
157 | push( @curr_lexemes, $lexobj ); |
158 | } else { |
159 | # We do not cope with the idea of a lexeme being |
160 | # spread across multiple readings. |
161 | warn "Word sequence changed unexpectedly in text"; |
162 | # See if we can find a matching reading |
163 | my @lookahead; |
164 | my $matched; |
165 | while( my $nr = shift @path ) { |
166 | my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; |
167 | if( $nrtext =~ /^\Q$lexeme\E/ ) { |
168 | $curr_rdg = $lookahead[-1] if @lookahead; |
169 | $matched = 1; |
170 | last; |
171 | } else { |
172 | push( @lookahead, $nr ); |
173 | } |
174 | } |
175 | # No match? Restore the state we had |
176 | unless( $matched ) { |
177 | unshift( @path, @lookahead ); |
178 | } |
179 | # Trigger a move |
180 | $unused_rdg_part = ''; |
181 | } |
182 | |
183 | unless( $unused_rdg_part ) { |
184 | # Record the lexemes for the given reading. |
185 | #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", |
186 | # join( ' ', map { $_->string } @curr_lexemes ), |
187 | # $curr_rdg->id, $curr_rdg->text ); |
188 | _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes ); |
189 | $curr_rdg = shift @path; |
190 | @curr_lexemes = (); |
191 | } |
192 | } |
193 | } |
194 | } |
195 | |
196 | # Utility function so that we can cheat and use it when we need both the path |
197 | # and its text. |
198 | sub _text_from_path { |
199 | my( $normalize, @path ) = @_; |
200 | my $pathtext = ''; |
201 | my $last; |
202 | foreach my $r ( @path ) { |
203 | unless ( $r->join_prior || !$last || $last->join_next ) { |
204 | $pathtext .= ' '; |
205 | } |
206 | $pathtext .= ( $normalize && $r->has_normal_form ) |
207 | ? $r->normal_form : $r->text; |
208 | $last = $r; |
209 | } |
210 | return $pathtext; |
211 | } |
212 | |
213 | # Utility function that actually calls the tree tagger. |
214 | sub _treetag_string { |
215 | my( $text ) = @_; |
216 | my $wittext = encode_utf8( $text ); |
217 | # Then see if we have TreeTagger |
218 | my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english"; |
219 | unless( -f $taggercmd ) { |
220 | warn "Cannot do English word lemmatization without TreeTagger"; |
221 | return; |
222 | } |
223 | # OK, we can run it then. |
224 | my @cmd = ( $taggercmd ); |
225 | my( $tagresult, $err ); # Capture the output and error |
226 | run( \@cmd, \$wittext, \$tagresult, \$err ); |
227 | # TODO check for error |
228 | return decode_utf8( $tagresult ); |
229 | } |
230 | |
231 | # Utility function to turn a TreeTagger result into a WordForm |
232 | sub _parse_wordform { |
233 | my $tagresult = shift; |
234 | my( $orig, $tag, $lemma ) = split( /\t/, $tagresult ); |
235 | my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag ); |
236 | if( $morphobj ) { |
237 | return Text::Tradition::Collation::Reading::WordForm->new( |
238 | 'language' => 'English', |
239 | 'lemma' => $lemma, |
240 | 'morphology' => $morphobj, |
241 | ); |
242 | } else { |
243 | warn "No morphology found for word: $_"; |
244 | } |
245 | } |
246 | |
247 | 1; |
248 | |
249 | =head2 TODO |
250 | |
251 | =over |
252 | |
253 | =item * Handle package dependencies more gracefully |
254 | |
255 | =item * Refactor English/French use of TreeTagger into its own util package |
256 | |
257 | =back |
258 | |
259 | =head1 LICENSE |
260 | |
261 | This package is free software and is provided "as is" without express |
262 | or implied warranty. You can redistribute it and/or modify it under |
263 | the same terms as Perl itself. |
264 | |
265 | =head1 AUTHOR |
266 | |
267 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |