Commit | Line | Data |
cca4f996 |
1 | package Text::Tradition::Language::French; |
2 | |
d3e7842a |
3 | use Encode qw/ encode_utf8 decode_utf8 /; |
4 | use IPC::Run qw/ run binary /; |
5 | use Module::Load; |
6 | use Text::Tradition::Collation::Reading::Lexeme; |
cca4f996 |
7 | use Text::Tradition::Collation::Reading::WordForm; |
d3e7842a |
8 | use TryCatch; |
9 | |
10 | my $MORPHDIR = '/Users/tla/Projects/morphology'; |
cca4f996 |
11 | |
12 | =head1 NAME |
13 | |
14 | Text::Tradition::Language::French - language-specific modules for French |
15 | |
16 | =head1 DESCRIPTION |
17 | |
18 | Implements morphology lookup for French words in context. |
19 | |
20 | =head1 SUBROUTINES |
21 | |
22 | =head2 lemmatize( $text ) |
23 | |
24 | Evaluates the string using the Flemm package, and returns the results. |
25 | |
26 | =cut |
27 | |
28 | sub lemmatize { |
d3e7842a |
29 | my $tradition = shift; |
30 | |
31 | # Given a tradition, lemmatize it witness by witness and see what we get. |
32 | my $workdir = File::Temp->newdir(); |
33 | my $c = $tradition->collation; |
34 | # First, clear out all existing lexemes from the readings. Save the |
35 | # path as long as we went to the trouble of generating it. |
36 | my %witness_paths; |
37 | foreach my $wit ( $tradition->witnesses ) { |
38 | my @sigla = ( $wit->sigil ); |
39 | push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; |
40 | foreach my $sig ( @sigla ) { |
41 | my @path = grep { !$_->is_meta } |
42 | $c->reading_sequence( $c->start, $c->end, $sig ); |
43 | map { $_->clear_lexemes } @path; |
44 | $witness_paths{$sig} = \@path; |
45 | } |
46 | } |
cca4f996 |
47 | |
d3e7842a |
48 | foreach my $sig ( keys %witness_paths ) { |
49 | # Get the text as a sequence of readings and as a string |
50 | print STDERR "Morphologizing witness $sig\n"; |
51 | my @path = @{$witness_paths{$sig}}; |
52 | my $tagresult = _treetag_string( $c->_text_from_path( @path ) ); |
53 | if( $tagresult ) { |
54 | # Map the tagged words onto the original readings, splitting |
55 | # them up into lexemes where necessary. |
56 | # NOTE we can have multiple lexemes in a reading, but not |
57 | # multiple readings to a lexeme. |
58 | my @tags = split( /\n/, $tagresult ); |
59 | my @lexemes; |
60 | my $curr_rdg = shift @path; |
61 | my @curr_lexemes; |
62 | my $unused_rdg_part; |
63 | foreach my $tag ( @tags ) { |
64 | # Get the original word |
65 | my( $lexeme, @rest ) = split( /\t/, $tag ); |
66 | # Lemmatize the whole |
67 | my @forms = _parse_wordform( _flemm_lookup( $tag ) ); |
68 | my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( |
69 | 'string' => $lexeme, 'language' => 'French', |
70 | 'wordform_matchlist' => \@forms ); |
71 | # Find the next non-meta reading |
72 | while( $curr_rdg->is_meta ) { |
73 | $curr_rdg = shift @path; |
74 | } |
75 | unless( $curr_rdg ) { |
76 | warn "Ran out of readings in sequence for " . $wit->sigil |
77 | . " at $lexeme"; |
78 | last; |
79 | } |
80 | if( $unused_rdg_part && |
81 | $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
82 | # Nth part of curr_rdg |
83 | $unused_rdg_part = $2; |
84 | push( @curr_lexemes, $lexobj ); |
85 | } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
86 | # Flag an error if there is already an unused reading part. |
87 | warn "Skipped over unused text $unused_rdg_part at $curr_rdg" |
88 | if $unused_rdg_part; |
89 | $unused_rdg_part = $2; # will be empty if the whole reading matched |
90 | push( @curr_lexemes, $lexobj ); |
91 | } else { |
92 | # We do not cope with the idea of a lexeme being |
93 | # spread across multiple readings. |
94 | warn "Word sequence changed unexpectedly in text"; |
95 | # See if we can find a matching reading |
96 | my @lookahead; |
97 | my $matched; |
98 | while( my $nr = shift @path ) { |
99 | if( $nr->text =~ /^\Q$lexeme\E/ ) { |
100 | $curr_rdg = $lookahead[-1] if @lookahead; |
101 | $matched = 1; |
102 | last; |
103 | } else { |
104 | push( @lookahead, $nr ); |
105 | } |
106 | } |
107 | # No match? Restore the state we had |
108 | unless( $matched ) { |
109 | unshift( @path, @lookahead ); |
110 | } |
111 | # Trigger a move |
112 | $unused_rdg_part = ''; |
113 | } |
114 | |
115 | unless( $unused_rdg_part ) { |
116 | # Record the lexemes for the given reading. |
117 | #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", |
118 | # join( ' ', map { $_->string } @curr_lexemes ), |
119 | # $curr_rdg->id, $curr_rdg->text ); |
120 | _update_reading_lexemes( $curr_rdg, @curr_lexemes ); |
121 | $curr_rdg = shift @path; |
122 | @curr_lexemes = (); |
123 | } |
124 | } |
125 | } |
126 | } |
127 | } |
128 | |
129 | sub _update_reading_lexemes { |
130 | my( $reading, @lexemes ) = @_; |
131 | if( $reading->has_lexemes ) { |
132 | # We need to merge what is in @lexemes with what we have already. |
133 | my @oldlex = $reading->lexemes; |
134 | my $cmp1 = join( '||', map { $_->string } @oldlex ); |
135 | my $cmp2 = join( '||', map { $_->string } @lexemes ); |
136 | if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) { |
137 | # The lexeme strings are the same, so merge the possible |
138 | # word forms from new to old. |
139 | foreach my $i ( 0 .. $#lexemes ) { |
140 | my $ol = $oldlex[$i]; |
141 | my $nl = $lexemes[$i]; |
142 | my %ofw; |
143 | map { $ofw{$_->_stringify} = 1 } $ol->matching_forms; |
144 | foreach my $form ( $nl->matching_forms ) { |
145 | unless( $ofw{$form->_stringify} ) { |
146 | print STDERR "Adding form " . $form->_stringify . |
147 | " to lexeme " . $nl->string . " at $reading\n"; |
148 | $ol->add_matching_form( $form ); |
149 | $ol->is_disambiguated(0); |
150 | } |
151 | } |
152 | } |
153 | } else { |
154 | $DB::single = 1; |
155 | warn "Lexeme layout for $reading changed; replacing the lot"; |
156 | $reading->clear_lexemes; |
157 | $reading->add_lexeme( @lexemes ); |
158 | } |
159 | } else { |
160 | $reading->add_lexeme( @lexemes ); |
161 | } |
cca4f996 |
162 | } |
163 | |
164 | =head2 word_lookup( $word ) |
165 | |
166 | Looks up a word using the Flemm package, and returns the possible results. |
167 | It is better to use L<lemmatize> for context sensitivity. |
168 | |
169 | =cut |
170 | |
171 | sub word_lookup { |
172 | my $word = shift; |
d3e7842a |
173 | my $tagresult = _treetag_string( $word ); |
174 | my $lemmatizer; |
175 | try { |
176 | load 'Flemm'; |
177 | $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); |
178 | } catch { |
179 | warn "Cannot do French word lemmatization without Flemm: @_"; |
180 | return; |
181 | } |
182 | return _parse_wordform( _flemm_lookup( $tagresult ) ); |
183 | } |
184 | |
185 | # Utility function that actually calls the tree tagger. |
186 | sub _treetag_string { |
187 | my( $text ) = @_; |
188 | my $wittext = encode_utf8( $text ); |
189 | # Then see if we have TreeTagger |
190 | my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8"; |
191 | unless( -f $taggercmd ) { |
192 | warn "Cannot do French word lemmatization without TreeTagger"; |
193 | return; |
194 | } |
195 | # OK, we can run it then. |
196 | my @cmd = ( $taggercmd ); |
197 | my( $tagresult, $err ); # Capture the output and error |
198 | run( \@cmd, \$wittext, \$tagresult, \$err ); |
199 | # TODO check for error |
200 | return decode_utf8( $tagresult ); |
201 | } |
202 | |
203 | # Closure and utility function for the package lemmatizer |
204 | { |
205 | my $lemmatizer; |
206 | |
207 | sub _flemm_lookup { |
208 | # First try to load Flemm |
209 | unless( $lemmatizer ) { |
210 | try { |
211 | load 'Flemm'; |
212 | $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); |
213 | } catch { |
214 | warn "Cannot do French word lemmatization without Flemm: @_"; |
215 | return; |
216 | } |
217 | } |
218 | return $lemmatizer->lemmatize( @_ ) |
219 | } |
cca4f996 |
220 | |
221 | } |
222 | |
d3e7842a |
223 | # Utility function to turn a Flemm result into a WordForm |
224 | sub _parse_wordform { |
225 | my $flemmobj = shift; |
226 | # For now just parse the string, until we make sense of the documentation. |
227 | my @results = split( / \|\| /, $flemmobj->getResult ); |
228 | my @forms; |
229 | foreach ( @results ) { |
230 | my( $orig, $tag, $lemma ) = split( /\t/, $_ ); |
231 | my( $pos, $morph ) = split( /:/, $tag ); |
232 | my $wf = Text::Tradition::Collation::Reading::WordForm->new( |
233 | 'language' => 'French', |
234 | 'lemma' => $lemma, |
235 | 'morphology' => [ split( //, $morph ) ], |
236 | ); |
237 | push( @forms, $wf ); |
238 | } |
239 | return @forms; |
240 | } |
241 | |
242 | 1; |
243 | |
cca4f996 |
244 | =head1 LICENSE |
245 | |
246 | This package is free software and is provided "as is" without express |
247 | or implied warranty. You can redistribute it and/or modify it under |
248 | the same terms as Perl itself. |
249 | |
250 | =head1 AUTHOR |
251 | |
252 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |