add English morph analysis; test for lurking debug statements
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm
CommitLineData
cca4f996 1package Text::Tradition::Language::French;
2
f4b6b4d0 3use strict;
4use warnings;
d3e7842a 5use Encode qw/ encode_utf8 decode_utf8 /;
6ad2ce78 6use IPC::Run qw/ run /;
7use Lingua::TagSet::Multext;
8use Lingua::TagSet::TreeTagger;
d3e7842a 9use Module::Load;
10use Text::Tradition::Collation::Reading::Lexeme;
cca4f996 11use Text::Tradition::Collation::Reading::WordForm;
d3e7842a 12use TryCatch;
13
14my $MORPHDIR = '/Users/tla/Projects/morphology';
cca4f996 15
16=head1 NAME
17
6ad2ce78 18Text::Tradition::Language::French - language-specific module for French
cca4f996 19
20=head1 DESCRIPTION
21
6ad2ce78 22Implements morphology lookup for French words in context. This module
23depends on the Flemm module for French lemmatization
24(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
25the TreeTagger software
26(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
27(for now) expected to be installed in $MORPHDIR/TreeTagger.
cca4f996 28
29=head1 SUBROUTINES
30
31=head2 lemmatize( $text )
32
33Evaluates the string using the Flemm package, and returns the results.
34
6ad2ce78 35=begin testing
36
37binmode STDOUT, ':utf8';
38use Text::Tradition;
39use_ok( 'Text::Tradition::Language::French' );
40
41eval "use Flemm";
42my $err = $@;
43
44SKIP: {
45 skip "Package Flemm not found" if $err;
46 my $tf = Text::Tradition->new(
47 'input' => 'Self',
48 'file' => 't/data/besoin.xml',
49 'language' => 'French' );
50
51 is( $tf->language, 'French', "Set language okay" );
52 $tf->lemmatize();
53 # Test the lemmatization. How many readings now have morphological info?
54 # Do the lexemes match the reading?
55 my $ambig = 0;
56 foreach my $r ( $tf->collation->readings ) {
57 next if $r->is_meta;
58 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
59 my @lex = $r->lexemes;
60 my $lexstr = join( '', map { $_->string } @lex );
61 my $textstr = $r->text;
62 $textstr =~ s/\s+//g;
63 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
64 foreach my $l ( @lex ) {
65 next if $l->is_disambiguated;
66 # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
67 # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
68 $ambig++;
69 }
70 }
71 is( $ambig, 102, "Found 102 ambiguous forms as expected" );
72
73 # Try setting the normal form of a reading and re-analyzing
74 my $mr = $tf->collation->reading('99,2');
75 is( $mr->text, 'minspire', "Picked correct test reading" );
76 is( $mr->language, 'French', "Reading has correct language setting" );
77 $mr->normal_form( "m'inspire" );
78 $mr->lemmatize;
79 is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
80}
81
82=end testing
83
cca4f996 84=cut
85
86sub lemmatize {
d3e7842a 87 my $tradition = shift;
88
89 # Given a tradition, lemmatize it witness by witness and see what we get.
90 my $workdir = File::Temp->newdir();
91 my $c = $tradition->collation;
92 # First, clear out all existing lexemes from the readings. Save the
93 # path as long as we went to the trouble of generating it.
94 my %witness_paths;
95 foreach my $wit ( $tradition->witnesses ) {
96 my @sigla = ( $wit->sigil );
97 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
98 foreach my $sig ( @sigla ) {
99 my @path = grep { !$_->is_meta }
100 $c->reading_sequence( $c->start, $c->end, $sig );
101 map { $_->clear_lexemes } @path;
102 $witness_paths{$sig} = \@path;
103 }
104 }
cca4f996 105
d3e7842a 106 foreach my $sig ( keys %witness_paths ) {
107 # Get the text as a sequence of readings and as a string
108 print STDERR "Morphologizing witness $sig\n";
6ad2ce78 109 _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
d3e7842a 110 }
111}
112
113sub _update_reading_lexemes {
6ad2ce78 114 my( $replace, $reading, @lexemes ) = @_;
115 if( $reading->has_lexemes && !$replace ) {
d3e7842a 116 # We need to merge what is in @lexemes with what we have already.
117 my @oldlex = $reading->lexemes;
118 my $cmp1 = join( '||', map { $_->string } @oldlex );
119 my $cmp2 = join( '||', map { $_->string } @lexemes );
6ad2ce78 120 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
d3e7842a 121 # The lexeme strings are the same, so merge the possible
122 # word forms from new to old.
123 foreach my $i ( 0 .. $#lexemes ) {
124 my $ol = $oldlex[$i];
125 my $nl = $lexemes[$i];
126 my %ofw;
6ad2ce78 127 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
d3e7842a 128 foreach my $form ( $nl->matching_forms ) {
6ad2ce78 129 unless( $ofw{$form->to_string} ) {
130 print STDERR "Adding form " . $form->to_string .
d3e7842a 131 " to lexeme " . $nl->string . " at $reading\n";
132 $ol->add_matching_form( $form );
133 $ol->is_disambiguated(0);
134 }
135 }
136 }
137 } else {
d3e7842a 138 warn "Lexeme layout for $reading changed; replacing the lot";
139 $reading->clear_lexemes;
140 $reading->add_lexeme( @lexemes );
141 }
142 } else {
6ad2ce78 143 $reading->clear_lexemes if $replace;
d3e7842a 144 $reading->add_lexeme( @lexemes );
145 }
cca4f996 146}
147
6ad2ce78 148=head2 reading_lookup( $rdg[, $rdg, ...] )
cca4f996 149
6ad2ce78 150Looks up one or more readings using the Flemm package, and returns the
151possible results. This uses the same logic as L<lemmatize> above for the
152entire tradition, but can also be used to (re-)analyze individual readings.
cca4f996 153
154=cut
155
6ad2ce78 156sub reading_lookup {
157 return _lemmatize_sequence( 1, @_ );
158}
159
160sub _lemmatize_sequence {
161 my( $replace, @path ) = @_;
6ad2ce78 162 my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
163 if( $tagresult ) {
164 # Map the tagged words onto the original readings, splitting
165 # them up into lexemes where necessary.
166 # NOTE we can have multiple lexemes in a reading, but not
167 # multiple readings to a lexeme.
168 my @tags = split( /\n/, $tagresult );
169 my @lexemes;
170 my $curr_rdg = shift @path;
171 my @curr_lexemes;
172 my $unused_rdg_part;
173 foreach my $tag ( @tags ) {
174 # Get the original word
175 my( $lexeme, @rest ) = split( /\t/, $tag );
176 # Lemmatize the whole
177 my @forms = _parse_wordform( _flemm_lookup( $tag ) );
178 my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
179 'string' => $lexeme, 'language' => 'French',
180 'wordform_matchlist' => \@forms );
181 # Find the next non-meta reading
182 while( $curr_rdg && $curr_rdg->is_meta ) {
183 $curr_rdg = shift @path;
184 }
185 unless( $curr_rdg ) {
f4b6b4d0 186 warn "Ran out of readings in sequence at $lexeme";
6ad2ce78 187 last;
188 }
189 my $curr_rdg_text = $curr_rdg->has_normal_form
190 ? $curr_rdg->normal_form : $curr_rdg->text;
191 if( $unused_rdg_part &&
192 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
193 # Nth part of curr_rdg
194 $unused_rdg_part = $2;
195 push( @curr_lexemes, $lexobj );
196 } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
197 # Flag an error if there is already an unused reading part.
198 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
199 if $unused_rdg_part;
200 $unused_rdg_part = $2; # will be empty if the whole reading matched
201 push( @curr_lexemes, $lexobj );
202 } else {
203 # We do not cope with the idea of a lexeme being
204 # spread across multiple readings.
205 warn "Word sequence changed unexpectedly in text";
206 # See if we can find a matching reading
207 my @lookahead;
208 my $matched;
209 while( my $nr = shift @path ) {
210 my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
211 if( $nrtext =~ /^\Q$lexeme\E/ ) {
212 $curr_rdg = $lookahead[-1] if @lookahead;
213 $matched = 1;
214 last;
215 } else {
216 push( @lookahead, $nr );
217 }
218 }
219 # No match? Restore the state we had
220 unless( $matched ) {
221 unshift( @path, @lookahead );
222 }
223 # Trigger a move
224 $unused_rdg_part = '';
225 }
226
227 unless( $unused_rdg_part ) {
228 # Record the lexemes for the given reading.
229 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
230 # join( ' ', map { $_->string } @curr_lexemes ),
231 # $curr_rdg->id, $curr_rdg->text );
232 _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
233 $curr_rdg = shift @path;
234 @curr_lexemes = ();
235 }
236 }
237 }
238}
239
240# Utility function so that we can cheat and use it when we need both the path
241# and its text.
242sub _text_from_path {
243 my( $normalize, @path ) = @_;
244 my $pathtext = '';
245 my $last;
246 foreach my $r ( @path ) {
247 unless ( $r->join_prior || !$last || $last->join_next ) {
248 $pathtext .= ' ';
249 }
250 $pathtext .= ( $normalize && $r->has_normal_form )
251 ? $r->normal_form : $r->text;
252 $last = $r;
d3e7842a 253 }
6ad2ce78 254 return $pathtext;
d3e7842a 255}
256
257# Utility function that actually calls the tree tagger.
258sub _treetag_string {
259 my( $text ) = @_;
260 my $wittext = encode_utf8( $text );
261 # Then see if we have TreeTagger
262 my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
263 unless( -f $taggercmd ) {
264 warn "Cannot do French word lemmatization without TreeTagger";
265 return;
266 }
267 # OK, we can run it then.
268 my @cmd = ( $taggercmd );
269 my( $tagresult, $err ); # Capture the output and error
270 run( \@cmd, \$wittext, \$tagresult, \$err );
271 # TODO check for error
272 return decode_utf8( $tagresult );
273}
274
275# Closure and utility function for the package lemmatizer
276{
277 my $lemmatizer;
278
279 sub _flemm_lookup {
280 # First try to load Flemm
281 unless( $lemmatizer ) {
282 try {
283 load 'Flemm';
284 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
285 } catch {
286 warn "Cannot do French word lemmatization without Flemm: @_";
287 return;
288 }
289 }
290 return $lemmatizer->lemmatize( @_ )
291 }
cca4f996 292
293}
294
d3e7842a 295# Utility function to turn a Flemm result into a WordForm
296sub _parse_wordform {
297 my $flemmobj = shift;
298 # For now just parse the string, until we make sense of the documentation.
299 my @results = split( / \|\| /, $flemmobj->getResult );
300 my @forms;
301 foreach ( @results ) {
302 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
303 my( $pos, $morph ) = split( /:/, $tag );
6ad2ce78 304 my $morphobj;
305 if( $morph ) {
306 $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
307 } else {
308 # Use the TreeTagger info if there is no Flemm morphology.
309 $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
310 }
311 if( $morphobj ) {
312 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
313 'language' => 'French',
314 'lemma' => $lemma,
315 'morphology' => $morphobj,
316 );
317 push( @forms, $wf );
318 } else {
319 warn "No morphology found for word: $_";
320 }
d3e7842a 321 }
322 return @forms;
323}
324
3251;
326
6ad2ce78 327=head2 TODO
328
329=over
330
331=item * Handle package dependencies more gracefully
332
333=back
334
cca4f996 335=head1 LICENSE
336
337This package is free software and is provided "as is" without express
338or implied warranty. You can redistribute it and/or modify it under
339the same terms as Perl itself.
340
341=head1 AUTHOR
342
343Tara L Andrews E<lt>aurum@cpan.orgE<gt>