add English morph analysis; test for lurking debug statements
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / English.pm
CommitLineData
f4b6b4d0 1package Text::Tradition::Language::English;
2
3use strict;
4use warnings;
5use Encode qw/ encode_utf8 decode_utf8 /;
6use IPC::Run qw/ run /;
7use Lingua::TagSet::TreeTagger;
8use Module::Load;
9use Text::Tradition::Collation::Reading::Lexeme;
10use Text::Tradition::Collation::Reading::WordForm;
11use TryCatch;
12
13my $MORPHDIR = '/Users/tla/Projects/morphology';
14
15=head1 NAME
16
17Text::Tradition::Language::English - language-specific module for English
18
19=head1 DESCRIPTION
20
21Implements morphology lookup for English words in context. This module
22depends on the TreeTagger software
23(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
24(for now) expected to be installed in $MORPHDIR/TreeTagger.
25
26=head1 SUBROUTINES
27
28=head2 lemmatize( $text )
29
30Evaluates the string using the TreeTagger, and returns the results.
31
32=begin testing
33
34binmode STDOUT, ':utf8';
35use Text::Tradition;
36use_ok( 'Text::Tradition::Language::English' );
37
38=end testing
39
40=cut
41
42sub lemmatize {
43 my $tradition = shift;
44
45 # Given a tradition, lemmatize it witness by witness and see what we get.
46 my $workdir = File::Temp->newdir();
47 my $c = $tradition->collation;
48 # First, clear out all existing lexemes from the readings. Save the
49 # path as long as we went to the trouble of generating it.
50 my %witness_paths;
51 foreach my $wit ( $tradition->witnesses ) {
52 my @sigla = ( $wit->sigil );
53 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
54 foreach my $sig ( @sigla ) {
55 my @path = grep { !$_->is_meta }
56 $c->reading_sequence( $c->start, $c->end, $sig );
57 map { $_->clear_lexemes } @path;
58 $witness_paths{$sig} = \@path;
59 }
60 }
61
62 foreach my $sig ( keys %witness_paths ) {
63 # Get the text as a sequence of readings and as a string
64 print STDERR "Morphologizing witness $sig\n";
65 _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
66 }
67}
68
69sub _update_reading_lexemes {
70 my( $replace, $reading, @lexemes ) = @_;
71 if( $reading->has_lexemes && !$replace ) {
72 # We need to merge what is in @lexemes with what we have already.
73 my @oldlex = $reading->lexemes;
74 my $cmp1 = join( '||', map { $_->string } @oldlex );
75 my $cmp2 = join( '||', map { $_->string } @lexemes );
76 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
77 # The lexeme strings are the same, so merge the possible
78 # word forms from new to old.
79 foreach my $i ( 0 .. $#lexemes ) {
80 my $ol = $oldlex[$i];
81 my $nl = $lexemes[$i];
82 my %ofw;
83 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
84 foreach my $form ( $nl->matching_forms ) {
85 unless( $ofw{$form->to_string} ) {
86 print STDERR "Adding form " . $form->to_string .
87 " to lexeme " . $nl->string . " at $reading\n";
88 $ol->add_matching_form( $form );
89 $ol->is_disambiguated(0);
90 }
91 }
92 }
93 } else {
94 warn "Lexeme layout for $reading changed; replacing the lot";
95 $reading->clear_lexemes;
96 $reading->add_lexeme( @lexemes );
97 }
98 } else {
99 $reading->clear_lexemes if $replace;
100 $reading->add_lexeme( @lexemes );
101 }
102}
103
104=head2 reading_lookup( $rdg[, $rdg, ...] )
105
106Looks up one or more readings using the Flemm package, and returns the
107possible results. This uses the same logic as L<lemmatize> above for the
108entire tradition, but can also be used to (re-)analyze individual readings.
109
110=cut
111
112sub reading_lookup {
113 return _lemmatize_sequence( 1, @_ );
114}
115
116sub _lemmatize_sequence {
117 my( $replace, @path ) = @_;
118 my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
119 if( $tagresult ) {
120 # Map the tagged words onto the original readings, splitting
121 # them up into lexemes where necessary.
122 # NOTE we can have multiple lexemes in a reading, but not
123 # multiple readings to a lexeme.
124 my @tags = split( /\n/, $tagresult );
125 my @lexemes;
126 my $curr_rdg = shift @path;
127 my @curr_lexemes;
128 my $unused_rdg_part;
129 foreach my $tag ( @tags ) {
130 # Get the original word
131 my( $lexeme, @rest ) = split( /\t/, $tag );
132 # Lemmatize the whole
133 my @forms = _parse_wordform( $tag );
134 my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
135 'string' => $lexeme, 'language' => 'English',
136 'wordform_matchlist' => \@forms );
137 # Find the next non-meta reading
138 while( $curr_rdg && $curr_rdg->is_meta ) {
139 $curr_rdg = shift @path;
140 }
141 unless( $curr_rdg ) {
142 warn "Ran out of readings in sequence at $lexeme";
143 last;
144 }
145 my $curr_rdg_text = $curr_rdg->has_normal_form
146 ? $curr_rdg->normal_form : $curr_rdg->text;
147 if( $unused_rdg_part &&
148 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
149 # Nth part of curr_rdg
150 $unused_rdg_part = $2;
151 push( @curr_lexemes, $lexobj );
152 } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
153 # Flag an error if there is already an unused reading part.
154 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
155 if $unused_rdg_part;
156 $unused_rdg_part = $2; # will be empty if the whole reading matched
157 push( @curr_lexemes, $lexobj );
158 } else {
159 # We do not cope with the idea of a lexeme being
160 # spread across multiple readings.
161 warn "Word sequence changed unexpectedly in text";
162 # See if we can find a matching reading
163 my @lookahead;
164 my $matched;
165 while( my $nr = shift @path ) {
166 my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
167 if( $nrtext =~ /^\Q$lexeme\E/ ) {
168 $curr_rdg = $lookahead[-1] if @lookahead;
169 $matched = 1;
170 last;
171 } else {
172 push( @lookahead, $nr );
173 }
174 }
175 # No match? Restore the state we had
176 unless( $matched ) {
177 unshift( @path, @lookahead );
178 }
179 # Trigger a move
180 $unused_rdg_part = '';
181 }
182
183 unless( $unused_rdg_part ) {
184 # Record the lexemes for the given reading.
185 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
186 # join( ' ', map { $_->string } @curr_lexemes ),
187 # $curr_rdg->id, $curr_rdg->text );
188 _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
189 $curr_rdg = shift @path;
190 @curr_lexemes = ();
191 }
192 }
193 }
194}
195
196# Utility function so that we can cheat and use it when we need both the path
197# and its text.
198sub _text_from_path {
199 my( $normalize, @path ) = @_;
200 my $pathtext = '';
201 my $last;
202 foreach my $r ( @path ) {
203 unless ( $r->join_prior || !$last || $last->join_next ) {
204 $pathtext .= ' ';
205 }
206 $pathtext .= ( $normalize && $r->has_normal_form )
207 ? $r->normal_form : $r->text;
208 $last = $r;
209 }
210 return $pathtext;
211}
212
213# Utility function that actually calls the tree tagger.
214sub _treetag_string {
215 my( $text ) = @_;
216 my $wittext = encode_utf8( $text );
217 # Then see if we have TreeTagger
218 my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english";
219 unless( -f $taggercmd ) {
220 warn "Cannot do English word lemmatization without TreeTagger";
221 return;
222 }
223 # OK, we can run it then.
224 my @cmd = ( $taggercmd );
225 my( $tagresult, $err ); # Capture the output and error
226 run( \@cmd, \$wittext, \$tagresult, \$err );
227 # TODO check for error
228 return decode_utf8( $tagresult );
229}
230
231# Utility function to turn a TreeTagger result into a WordForm
232sub _parse_wordform {
233 my $tagresult = shift;
234 my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
235 my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag );
236 if( $morphobj ) {
237 return Text::Tradition::Collation::Reading::WordForm->new(
238 'language' => 'English',
239 'lemma' => $lemma,
240 'morphology' => $morphobj,
241 );
242 } else {
243 warn "No morphology found for word: $_";
244 }
245}
246
2471;
248
249=head2 TODO
250
251=over
252
253=item * Handle package dependencies more gracefully
254
255=item * Refactor English/French use of TreeTagger into its own util package
256
257=back
258
259=head1 LICENSE
260
261This package is free software and is provided "as is" without express
262or implied warranty. You can redistribute it and/or modify it under
263the same terms as Perl itself.
264
265=head1 AUTHOR
266
267Tara L Andrews E<lt>aurum@cpan.orgE<gt>