Commit | Line | Data |
cca4f996 |
1 | package Text::Tradition::Language::French; |
2 | |
d3e7842a |
3 | use Encode qw/ encode_utf8 decode_utf8 /; |
6ad2ce78 |
4 | use IPC::Run qw/ run /; |
5 | use Lingua::TagSet::Multext; |
6 | use Lingua::TagSet::TreeTagger; |
d3e7842a |
7 | use Module::Load; |
8 | use Text::Tradition::Collation::Reading::Lexeme; |
cca4f996 |
9 | use Text::Tradition::Collation::Reading::WordForm; |
d3e7842a |
10 | use TryCatch; |
11 | |
12 | my $MORPHDIR = '/Users/tla/Projects/morphology'; |
cca4f996 |
13 | |
14 | =head1 NAME |
15 | |
6ad2ce78 |
16 | Text::Tradition::Language::French - language-specific module for French |
cca4f996 |
17 | |
18 | =head1 DESCRIPTION |
19 | |
6ad2ce78 |
20 | Implements morphology lookup for French words in context. This module |
21 | depends on the Flemm module for French lemmatization |
22 | (L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with |
23 | the TreeTagger software |
24 | (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is |
25 | (for now) expected to be installed in $MORPHDIR/TreeTagger. |
cca4f996 |
26 | |
27 | =head1 SUBROUTINES |
28 | |
29 | =head2 lemmatize( $text ) |
30 | |
31 | Evaluates the string using the Flemm package, and returns the results. |
32 | |
6ad2ce78 |
33 | =begin testing |
34 | |
35 | binmode STDOUT, ':utf8'; |
36 | use Text::Tradition; |
37 | use_ok( 'Text::Tradition::Language::French' ); |
38 | |
39 | eval "use Flemm"; |
40 | my $err = $@; |
41 | |
42 | SKIP: { |
43 | skip "Package Flemm not found" if $err; |
44 | my $tf = Text::Tradition->new( |
45 | 'input' => 'Self', |
46 | 'file' => 't/data/besoin.xml', |
47 | 'language' => 'French' ); |
48 | |
49 | is( $tf->language, 'French', "Set language okay" ); |
50 | $tf->lemmatize(); |
51 | # Test the lemmatization. How many readings now have morphological info? |
52 | # Do the lexemes match the reading? |
53 | my $ambig = 0; |
54 | foreach my $r ( $tf->collation->readings ) { |
55 | next if $r->is_meta; |
56 | ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); |
57 | my @lex = $r->lexemes; |
58 | my $lexstr = join( '', map { $_->string } @lex ); |
59 | my $textstr = $r->text; |
60 | $textstr =~ s/\s+//g; |
61 | is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); |
62 | foreach my $l ( @lex ) { |
63 | next if $l->is_disambiguated; |
64 | # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, |
65 | # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); |
66 | $ambig++; |
67 | } |
68 | } |
69 | is( $ambig, 102, "Found 102 ambiguous forms as expected" ); |
70 | |
71 | # Try setting the normal form of a reading and re-analyzing |
72 | my $mr = $tf->collation->reading('99,2'); |
73 | is( $mr->text, 'minspire', "Picked correct test reading" ); |
74 | is( $mr->language, 'French', "Reading has correct language setting" ); |
75 | $mr->normal_form( "m'inspire" ); |
76 | $mr->lemmatize; |
77 | is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" ); |
78 | } |
79 | |
80 | =end testing |
81 | |
cca4f996 |
82 | =cut |
83 | |
84 | sub lemmatize { |
d3e7842a |
85 | my $tradition = shift; |
86 | |
87 | # Given a tradition, lemmatize it witness by witness and see what we get. |
88 | my $workdir = File::Temp->newdir(); |
89 | my $c = $tradition->collation; |
90 | # First, clear out all existing lexemes from the readings. Save the |
91 | # path as long as we went to the trouble of generating it. |
92 | my %witness_paths; |
93 | foreach my $wit ( $tradition->witnesses ) { |
94 | my @sigla = ( $wit->sigil ); |
95 | push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; |
96 | foreach my $sig ( @sigla ) { |
97 | my @path = grep { !$_->is_meta } |
98 | $c->reading_sequence( $c->start, $c->end, $sig ); |
99 | map { $_->clear_lexemes } @path; |
100 | $witness_paths{$sig} = \@path; |
101 | } |
102 | } |
cca4f996 |
103 | |
d3e7842a |
104 | foreach my $sig ( keys %witness_paths ) { |
105 | # Get the text as a sequence of readings and as a string |
106 | print STDERR "Morphologizing witness $sig\n"; |
6ad2ce78 |
107 | _lemmatize_sequence( undef, @{$witness_paths{$sig}} ); |
d3e7842a |
108 | } |
109 | } |
110 | |
111 | sub _update_reading_lexemes { |
6ad2ce78 |
112 | my( $replace, $reading, @lexemes ) = @_; |
113 | if( $reading->has_lexemes && !$replace ) { |
d3e7842a |
114 | # We need to merge what is in @lexemes with what we have already. |
115 | my @oldlex = $reading->lexemes; |
116 | my $cmp1 = join( '||', map { $_->string } @oldlex ); |
117 | my $cmp2 = join( '||', map { $_->string } @lexemes ); |
6ad2ce78 |
118 | if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { |
d3e7842a |
119 | # The lexeme strings are the same, so merge the possible |
120 | # word forms from new to old. |
121 | foreach my $i ( 0 .. $#lexemes ) { |
122 | my $ol = $oldlex[$i]; |
123 | my $nl = $lexemes[$i]; |
124 | my %ofw; |
6ad2ce78 |
125 | map { $ofw{$_->to_string} = 1 } $ol->matching_forms; |
d3e7842a |
126 | foreach my $form ( $nl->matching_forms ) { |
6ad2ce78 |
127 | unless( $ofw{$form->to_string} ) { |
128 | print STDERR "Adding form " . $form->to_string . |
d3e7842a |
129 | " to lexeme " . $nl->string . " at $reading\n"; |
130 | $ol->add_matching_form( $form ); |
131 | $ol->is_disambiguated(0); |
132 | } |
133 | } |
134 | } |
135 | } else { |
d3e7842a |
136 | warn "Lexeme layout for $reading changed; replacing the lot"; |
137 | $reading->clear_lexemes; |
138 | $reading->add_lexeme( @lexemes ); |
139 | } |
140 | } else { |
6ad2ce78 |
141 | $reading->clear_lexemes if $replace; |
d3e7842a |
142 | $reading->add_lexeme( @lexemes ); |
143 | } |
cca4f996 |
144 | } |
145 | |
6ad2ce78 |
146 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
cca4f996 |
147 | |
6ad2ce78 |
148 | Looks up one or more readings using the Flemm package, and returns the |
149 | possible results. This uses the same logic as L<lemmatize> above for the |
150 | entire tradition, but can also be used to (re-)analyze individual readings. |
cca4f996 |
151 | |
152 | =cut |
153 | |
6ad2ce78 |
154 | sub reading_lookup { |
155 | return _lemmatize_sequence( 1, @_ ); |
156 | } |
157 | |
158 | sub _lemmatize_sequence { |
159 | my( $replace, @path ) = @_; |
160 | $DB::single = 1 if $replace; |
161 | my $tagresult = _treetag_string( _text_from_path( 1, @path ) ); |
162 | if( $tagresult ) { |
163 | # Map the tagged words onto the original readings, splitting |
164 | # them up into lexemes where necessary. |
165 | # NOTE we can have multiple lexemes in a reading, but not |
166 | # multiple readings to a lexeme. |
167 | my @tags = split( /\n/, $tagresult ); |
168 | my @lexemes; |
169 | my $curr_rdg = shift @path; |
170 | my @curr_lexemes; |
171 | my $unused_rdg_part; |
172 | foreach my $tag ( @tags ) { |
173 | # Get the original word |
174 | my( $lexeme, @rest ) = split( /\t/, $tag ); |
175 | # Lemmatize the whole |
176 | my @forms = _parse_wordform( _flemm_lookup( $tag ) ); |
177 | my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( |
178 | 'string' => $lexeme, 'language' => 'French', |
179 | 'wordform_matchlist' => \@forms ); |
180 | # Find the next non-meta reading |
181 | while( $curr_rdg && $curr_rdg->is_meta ) { |
182 | $curr_rdg = shift @path; |
183 | } |
184 | unless( $curr_rdg ) { |
185 | warn "Ran out of readings in sequence for " . $wit->sigil |
186 | . " at $lexeme"; |
187 | last; |
188 | } |
189 | my $curr_rdg_text = $curr_rdg->has_normal_form |
190 | ? $curr_rdg->normal_form : $curr_rdg->text; |
191 | if( $unused_rdg_part && |
192 | $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
193 | # Nth part of curr_rdg |
194 | $unused_rdg_part = $2; |
195 | push( @curr_lexemes, $lexobj ); |
196 | } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
197 | # Flag an error if there is already an unused reading part. |
198 | warn "Skipped over unused text $unused_rdg_part at $curr_rdg" |
199 | if $unused_rdg_part; |
200 | $unused_rdg_part = $2; # will be empty if the whole reading matched |
201 | push( @curr_lexemes, $lexobj ); |
202 | } else { |
203 | # We do not cope with the idea of a lexeme being |
204 | # spread across multiple readings. |
205 | warn "Word sequence changed unexpectedly in text"; |
206 | # See if we can find a matching reading |
207 | my @lookahead; |
208 | my $matched; |
209 | while( my $nr = shift @path ) { |
210 | my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; |
211 | if( $nrtext =~ /^\Q$lexeme\E/ ) { |
212 | $curr_rdg = $lookahead[-1] if @lookahead; |
213 | $matched = 1; |
214 | last; |
215 | } else { |
216 | push( @lookahead, $nr ); |
217 | } |
218 | } |
219 | # No match? Restore the state we had |
220 | unless( $matched ) { |
221 | unshift( @path, @lookahead ); |
222 | } |
223 | # Trigger a move |
224 | $unused_rdg_part = ''; |
225 | } |
226 | |
227 | unless( $unused_rdg_part ) { |
228 | # Record the lexemes for the given reading. |
229 | #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", |
230 | # join( ' ', map { $_->string } @curr_lexemes ), |
231 | # $curr_rdg->id, $curr_rdg->text ); |
232 | _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes ); |
233 | $curr_rdg = shift @path; |
234 | @curr_lexemes = (); |
235 | } |
236 | } |
237 | } |
238 | } |
239 | |
240 | # Utility function so that we can cheat and use it when we need both the path |
241 | # and its text. |
242 | sub _text_from_path { |
243 | my( $normalize, @path ) = @_; |
244 | my $pathtext = ''; |
245 | my $last; |
246 | foreach my $r ( @path ) { |
247 | unless ( $r->join_prior || !$last || $last->join_next ) { |
248 | $pathtext .= ' '; |
249 | } |
250 | $pathtext .= ( $normalize && $r->has_normal_form ) |
251 | ? $r->normal_form : $r->text; |
252 | $last = $r; |
d3e7842a |
253 | } |
6ad2ce78 |
254 | return $pathtext; |
d3e7842a |
255 | } |
256 | |
257 | # Utility function that actually calls the tree tagger. |
258 | sub _treetag_string { |
259 | my( $text ) = @_; |
260 | my $wittext = encode_utf8( $text ); |
261 | # Then see if we have TreeTagger |
262 | my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8"; |
263 | unless( -f $taggercmd ) { |
264 | warn "Cannot do French word lemmatization without TreeTagger"; |
265 | return; |
266 | } |
267 | # OK, we can run it then. |
268 | my @cmd = ( $taggercmd ); |
269 | my( $tagresult, $err ); # Capture the output and error |
270 | run( \@cmd, \$wittext, \$tagresult, \$err ); |
271 | # TODO check for error |
272 | return decode_utf8( $tagresult ); |
273 | } |
274 | |
275 | # Closure and utility function for the package lemmatizer |
276 | { |
277 | my $lemmatizer; |
278 | |
279 | sub _flemm_lookup { |
280 | # First try to load Flemm |
281 | unless( $lemmatizer ) { |
282 | try { |
283 | load 'Flemm'; |
284 | $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); |
285 | } catch { |
286 | warn "Cannot do French word lemmatization without Flemm: @_"; |
287 | return; |
288 | } |
289 | } |
290 | return $lemmatizer->lemmatize( @_ ) |
291 | } |
cca4f996 |
292 | |
293 | } |
294 | |
d3e7842a |
295 | # Utility function to turn a Flemm result into a WordForm |
296 | sub _parse_wordform { |
297 | my $flemmobj = shift; |
298 | # For now just parse the string, until we make sense of the documentation. |
299 | my @results = split( / \|\| /, $flemmobj->getResult ); |
300 | my @forms; |
301 | foreach ( @results ) { |
302 | my( $orig, $tag, $lemma ) = split( /\t/, $_ ); |
303 | my( $pos, $morph ) = split( /:/, $tag ); |
6ad2ce78 |
304 | my $morphobj; |
305 | if( $morph ) { |
306 | $morphobj = Lingua::TagSet::Multext->tag2structure( $morph ); |
307 | } else { |
308 | # Use the TreeTagger info if there is no Flemm morphology. |
309 | $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos ); |
310 | } |
311 | if( $morphobj ) { |
312 | my $wf = Text::Tradition::Collation::Reading::WordForm->new( |
313 | 'language' => 'French', |
314 | 'lemma' => $lemma, |
315 | 'morphology' => $morphobj, |
316 | ); |
317 | push( @forms, $wf ); |
318 | } else { |
319 | warn "No morphology found for word: $_"; |
320 | } |
d3e7842a |
321 | } |
322 | return @forms; |
323 | } |
324 | |
325 | 1; |
326 | |
6ad2ce78 |
327 | =head2 TODO |
328 | |
329 | =over |
330 | |
331 | =item * Handle package dependencies more gracefully |
332 | |
333 | =back |
334 | |
cca4f996 |
335 | =head1 LICENSE |
336 | |
337 | This package is free software and is provided "as is" without express |
338 | or implied warranty. You can redistribute it and/or modify it under |
339 | the same terms as Perl itself. |
340 | |
341 | =head1 AUTHOR |
342 | |
343 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |