make French morph tagging work; dependent on Flemm and TreeTagger
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm
CommitLineData
cca4f996 1package Text::Tradition::Language::French;
2
d3e7842a 3use Encode qw/ encode_utf8 decode_utf8 /;
4use IPC::Run qw/ run binary /;
5use Module::Load;
6use Text::Tradition::Collation::Reading::Lexeme;
cca4f996 7use Text::Tradition::Collation::Reading::WordForm;
d3e7842a 8use TryCatch;
9
10my $MORPHDIR = '/Users/tla/Projects/morphology';
cca4f996 11
12=head1 NAME
13
14Text::Tradition::Language::French - language-specific modules for French
15
16=head1 DESCRIPTION
17
18Implements morphology lookup for French words in context.
19
20=head1 SUBROUTINES
21
22=head2 lemmatize( $text )
23
24Evaluates the string using the Flemm package, and returns the results.
25
26=cut
27
28sub lemmatize {
d3e7842a 29 my $tradition = shift;
30
31 # Given a tradition, lemmatize it witness by witness and see what we get.
32 my $workdir = File::Temp->newdir();
33 my $c = $tradition->collation;
34 # First, clear out all existing lexemes from the readings. Save the
35 # path as long as we went to the trouble of generating it.
36 my %witness_paths;
37 foreach my $wit ( $tradition->witnesses ) {
38 my @sigla = ( $wit->sigil );
39 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
40 foreach my $sig ( @sigla ) {
41 my @path = grep { !$_->is_meta }
42 $c->reading_sequence( $c->start, $c->end, $sig );
43 map { $_->clear_lexemes } @path;
44 $witness_paths{$sig} = \@path;
45 }
46 }
cca4f996 47
d3e7842a 48 foreach my $sig ( keys %witness_paths ) {
49 # Get the text as a sequence of readings and as a string
50 print STDERR "Morphologizing witness $sig\n";
51 my @path = @{$witness_paths{$sig}};
52 my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
53 if( $tagresult ) {
54 # Map the tagged words onto the original readings, splitting
55 # them up into lexemes where necessary.
56 # NOTE we can have multiple lexemes in a reading, but not
57 # multiple readings to a lexeme.
58 my @tags = split( /\n/, $tagresult );
59 my @lexemes;
60 my $curr_rdg = shift @path;
61 my @curr_lexemes;
62 my $unused_rdg_part;
63 foreach my $tag ( @tags ) {
64 # Get the original word
65 my( $lexeme, @rest ) = split( /\t/, $tag );
66 # Lemmatize the whole
67 my @forms = _parse_wordform( _flemm_lookup( $tag ) );
68 my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
69 'string' => $lexeme, 'language' => 'French',
70 'wordform_matchlist' => \@forms );
71 # Find the next non-meta reading
72 while( $curr_rdg->is_meta ) {
73 $curr_rdg = shift @path;
74 }
75 unless( $curr_rdg ) {
76 warn "Ran out of readings in sequence for " . $wit->sigil
77 . " at $lexeme";
78 last;
79 }
80 if( $unused_rdg_part &&
81 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
82 # Nth part of curr_rdg
83 $unused_rdg_part = $2;
84 push( @curr_lexemes, $lexobj );
85 } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
86 # Flag an error if there is already an unused reading part.
87 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
88 if $unused_rdg_part;
89 $unused_rdg_part = $2; # will be empty if the whole reading matched
90 push( @curr_lexemes, $lexobj );
91 } else {
92 # We do not cope with the idea of a lexeme being
93 # spread across multiple readings.
94 warn "Word sequence changed unexpectedly in text";
95 # See if we can find a matching reading
96 my @lookahead;
97 my $matched;
98 while( my $nr = shift @path ) {
99 if( $nr->text =~ /^\Q$lexeme\E/ ) {
100 $curr_rdg = $lookahead[-1] if @lookahead;
101 $matched = 1;
102 last;
103 } else {
104 push( @lookahead, $nr );
105 }
106 }
107 # No match? Restore the state we had
108 unless( $matched ) {
109 unshift( @path, @lookahead );
110 }
111 # Trigger a move
112 $unused_rdg_part = '';
113 }
114
115 unless( $unused_rdg_part ) {
116 # Record the lexemes for the given reading.
117 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
118 # join( ' ', map { $_->string } @curr_lexemes ),
119 # $curr_rdg->id, $curr_rdg->text );
120 _update_reading_lexemes( $curr_rdg, @curr_lexemes );
121 $curr_rdg = shift @path;
122 @curr_lexemes = ();
123 }
124 }
125 }
126 }
127}
128
129sub _update_reading_lexemes {
130 my( $reading, @lexemes ) = @_;
131 if( $reading->has_lexemes ) {
132 # We need to merge what is in @lexemes with what we have already.
133 my @oldlex = $reading->lexemes;
134 my $cmp1 = join( '||', map { $_->string } @oldlex );
135 my $cmp2 = join( '||', map { $_->string } @lexemes );
136 if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) {
137 # The lexeme strings are the same, so merge the possible
138 # word forms from new to old.
139 foreach my $i ( 0 .. $#lexemes ) {
140 my $ol = $oldlex[$i];
141 my $nl = $lexemes[$i];
142 my %ofw;
143 map { $ofw{$_->_stringify} = 1 } $ol->matching_forms;
144 foreach my $form ( $nl->matching_forms ) {
145 unless( $ofw{$form->_stringify} ) {
146 print STDERR "Adding form " . $form->_stringify .
147 " to lexeme " . $nl->string . " at $reading\n";
148 $ol->add_matching_form( $form );
149 $ol->is_disambiguated(0);
150 }
151 }
152 }
153 } else {
154 $DB::single = 1;
155 warn "Lexeme layout for $reading changed; replacing the lot";
156 $reading->clear_lexemes;
157 $reading->add_lexeme( @lexemes );
158 }
159 } else {
160 $reading->add_lexeme( @lexemes );
161 }
cca4f996 162}
163
164=head2 word_lookup( $word )
165
166Looks up a word using the Flemm package, and returns the possible results.
167It is better to use L<lemmatize> for context sensitivity.
168
169=cut
170
171sub word_lookup {
172 my $word = shift;
d3e7842a 173 my $tagresult = _treetag_string( $word );
174 my $lemmatizer;
175 try {
176 load 'Flemm';
177 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
178 } catch {
179 warn "Cannot do French word lemmatization without Flemm: @_";
180 return;
181 }
182 return _parse_wordform( _flemm_lookup( $tagresult ) );
183}
184
185# Utility function that actually calls the tree tagger.
186sub _treetag_string {
187 my( $text ) = @_;
188 my $wittext = encode_utf8( $text );
189 # Then see if we have TreeTagger
190 my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
191 unless( -f $taggercmd ) {
192 warn "Cannot do French word lemmatization without TreeTagger";
193 return;
194 }
195 # OK, we can run it then.
196 my @cmd = ( $taggercmd );
197 my( $tagresult, $err ); # Capture the output and error
198 run( \@cmd, \$wittext, \$tagresult, \$err );
199 # TODO check for error
200 return decode_utf8( $tagresult );
201}
202
203# Closure and utility function for the package lemmatizer
204{
205 my $lemmatizer;
206
207 sub _flemm_lookup {
208 # First try to load Flemm
209 unless( $lemmatizer ) {
210 try {
211 load 'Flemm';
212 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
213 } catch {
214 warn "Cannot do French word lemmatization without Flemm: @_";
215 return;
216 }
217 }
218 return $lemmatizer->lemmatize( @_ )
219 }
cca4f996 220
221}
222
d3e7842a 223# Utility function to turn a Flemm result into a WordForm
224sub _parse_wordform {
225 my $flemmobj = shift;
226 # For now just parse the string, until we make sense of the documentation.
227 my @results = split( / \|\| /, $flemmobj->getResult );
228 my @forms;
229 foreach ( @results ) {
230 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
231 my( $pos, $morph ) = split( /:/, $tag );
232 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
233 'language' => 'French',
234 'lemma' => $lemma,
235 'morphology' => [ split( //, $morph ) ],
236 );
237 push( @forms, $wf );
238 }
239 return @forms;
240}
241
2421;
243
cca4f996 244=head1 LICENSE
245
246This package is free software and is provided "as is" without express
247or implied warranty. You can redistribute it and/or modify it under
248the same terms as Perl itself.
249
250=head1 AUTHOR
251
252Tara L Andrews E<lt>aurum@cpan.orgE<gt>