Commit | Line | Data |
e0f6836a |
1 | package Text::Tradition::Language::Base; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Encode qw/ encode_utf8 decode_utf8 /; |
6 | use Exporter 'import'; |
7 | use vars qw/ @EXPORT_OK /; |
8 | use IPC::Run qw/ run /; |
9 | use Lingua::TagSet::Multext; |
10 | use Lingua::TagSet::TreeTagger; |
11 | use Module::Load; |
12 | use Text::Tradition::Collation::Reading::Lexeme; |
13 | use Text::Tradition::Collation::Reading::WordForm; |
14 | use TryCatch; |
15 | |
5271a011 |
16 | @EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct |
75ae2b25 |
17 | multext_struct lfs_morph_tags /; |
e0f6836a |
18 | |
19 | =head1 NAME |
20 | |
21 | Text::Tradition::Language::Base - Base subroutines for lemmatization of words |
22 | |
23 | =head1 DESCRIPTION |
24 | |
25 | Common routines for applying morphological tagging to a Text::Tradition. Used |
26 | with callbacks from the named language packages. |
27 | |
28 | =head1 SUBROUTINES |
29 | |
30 | =head2 lemmatize_treetagger( $tradition ) |
31 | |
32 | Evaluates the tradition with the given options, and returns the results. |
33 | |
34 | =cut |
35 | |
36 | sub lemmatize_treetagger { |
37 | my( $tradition, %opts ) = @_; |
38 | |
39 | # Given a tradition, lemmatize it witness by witness and see what we get. |
40 | my $c = $tradition->collation; |
41 | # First, clear out all existing lexemes from the readings. |
42 | my %witness_paths = _clear_reading_lexemes( $tradition ); |
43 | |
44 | foreach my $sig ( keys %witness_paths ) { |
45 | # Get the text as a sequence of readings and as a string |
e0f6836a |
46 | my %witopts = ( |
47 | 'path' => $witness_paths{$sig}, |
48 | %opts |
49 | ); |
50 | _lemmatize_treetagger_sequence( %witopts ); |
51 | } |
52 | } |
53 | |
54 | sub _clear_reading_lexemes { |
55 | my $tradition = shift; |
56 | my $c = $tradition->collation; |
57 | # Clear out all existing lexemes from the readings. Save the path as long |
58 | # as we went to the trouble of generating it. |
59 | my %witness_paths; |
60 | foreach my $wit ( $tradition->witnesses ) { |
61 | my @sigla = ( $wit->sigil ); |
62 | push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; |
63 | foreach my $sig ( @sigla ) { |
64 | my @path = grep { !$_->is_meta } |
65 | $c->reading_sequence( $c->start, $c->end, $sig ); |
66 | map { $_->clear_lexemes } @path; |
67 | $witness_paths{$sig} = \@path; |
68 | } |
69 | } |
70 | return %witness_paths; |
71 | } |
72 | |
73 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
74 | |
75 | Looks up one or more readings using the Flemm package, and returns the |
76 | possible results. This uses the same logic as L<lemmatize> above for the |
77 | entire tradition, but can also be used to (re-)analyze individual readings. |
78 | |
79 | =cut |
80 | |
81 | sub reading_lookup_treetagger { |
82 | my %opts = @_; |
83 | $opts{'replace'} = 1; |
84 | return _lemmatize_treetagger_sequence( %opts ); |
85 | } |
86 | |
87 | sub _lemmatize_treetagger_sequence { |
88 | my %opts = @_; |
89 | my @path = @{$opts{'path'}}; |
90 | my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} ); |
91 | if( $tagresult ) { |
92 | # Map the tagged words onto the original readings, splitting |
93 | # them up into lexemes where necessary. |
94 | # NOTE we can have multiple lexemes in a reading, but not |
95 | # multiple readings to a lexeme. |
96 | my @tags = split( /\n/, $tagresult ); |
97 | my @lexemes; |
98 | my $curr_rdg = shift @path; |
99 | my @curr_lexemes; |
100 | my $unused_rdg_part; |
101 | foreach my $tag ( @tags ) { |
102 | # Get the original word |
103 | my( $lexeme, @rest ) = split( /\t/, $tag ); |
104 | # Lemmatize the whole |
105 | # TODO error trap this |
106 | my @forms = $opts{'callback'}( $tag ); |
107 | |
108 | my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( |
109 | 'string' => $lexeme, 'language' => $opts{'language'}, |
110 | 'wordform_matchlist' => \@forms ); |
111 | # Find the next non-meta reading |
112 | while( $curr_rdg && $curr_rdg->is_meta ) { |
113 | $curr_rdg = shift @path; |
114 | } |
115 | unless( $curr_rdg ) { |
116 | warn "Ran out of readings in sequence at $lexeme"; |
117 | last; |
118 | } |
119 | my $curr_rdg_text = $curr_rdg->has_normal_form |
120 | ? $curr_rdg->normal_form : $curr_rdg->text; |
121 | if( $unused_rdg_part && |
122 | $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
123 | # Nth part of curr_rdg |
124 | $unused_rdg_part = $2; |
125 | push( @curr_lexemes, $lexobj ); |
126 | } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
127 | # Flag an error if there is already an unused reading part. |
128 | warn "Skipped over unused text $unused_rdg_part at $curr_rdg" |
129 | if $unused_rdg_part; |
130 | $unused_rdg_part = $2; # will be empty if the whole reading matched |
131 | push( @curr_lexemes, $lexobj ); |
132 | } else { |
133 | # We do not cope with the idea of a lexeme being |
134 | # spread across multiple readings. |
135 | warn "Word sequence changed unexpectedly in text"; |
136 | # See if we can find a matching reading |
137 | my @lookahead; |
138 | my $matched; |
139 | while( my $nr = shift @path ) { |
140 | my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; |
141 | if( $nrtext =~ /^\Q$lexeme\E/ ) { |
142 | $curr_rdg = $lookahead[-1] if @lookahead; |
143 | $matched = 1; |
144 | last; |
145 | } else { |
146 | push( @lookahead, $nr ); |
147 | } |
148 | } |
149 | # No match? Restore the state we had |
150 | unless( $matched ) { |
151 | unshift( @path, @lookahead ); |
152 | } |
153 | # Trigger a move |
154 | $unused_rdg_part = ''; |
155 | } |
156 | |
157 | unless( $unused_rdg_part ) { |
158 | # Record the lexemes for the given reading. |
159 | #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", |
160 | # join( ' ', map { $_->string } @curr_lexemes ), |
161 | # $curr_rdg->id, $curr_rdg->text ); |
162 | _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes ); |
163 | $curr_rdg = shift @path; |
164 | @curr_lexemes = (); |
165 | } |
166 | } |
167 | } |
168 | } |
169 | |
170 | sub _update_reading_lexemes { |
171 | my( $replace, $reading, @lexemes ) = @_; |
172 | if( $reading->has_lexemes && !$replace ) { |
173 | # We need to merge what is in @lexemes with what we have already. |
174 | my @oldlex = $reading->lexemes; |
175 | my $cmp1 = join( '||', map { $_->string } @oldlex ); |
176 | my $cmp2 = join( '||', map { $_->string } @lexemes ); |
177 | if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { |
178 | # The lexeme strings are the same, so merge the possible |
179 | # word forms from new to old. |
180 | foreach my $i ( 0 .. $#lexemes ) { |
181 | my $ol = $oldlex[$i]; |
182 | my $nl = $lexemes[$i]; |
183 | my %ofw; |
184 | map { $ofw{$_->to_string} = 1 } $ol->matching_forms; |
185 | foreach my $form ( $nl->matching_forms ) { |
186 | unless( $ofw{$form->to_string} ) { |
a3ef385d |
187 | # print STDERR "Adding form " . $form->to_string . |
188 | # " to lexeme " . $nl->string . " at $reading\n"; |
e0f6836a |
189 | $ol->add_matching_form( $form ); |
190 | $ol->is_disambiguated(0); |
191 | } |
192 | } |
193 | } |
194 | } else { |
195 | warn "Lexeme layout for $reading changed; replacing the lot"; |
196 | $reading->clear_lexemes; |
197 | $reading->add_lexeme( @lexemes ); |
198 | } |
199 | } else { |
200 | $reading->clear_lexemes if $replace; |
201 | $reading->add_lexeme( @lexemes ); |
202 | } |
203 | } |
204 | |
205 | # Utility function so that we can cheat and use it when we need both the path |
206 | # and its text. |
207 | sub _text_from_path { |
208 | my( $normalize, @path ) = @_; |
209 | my $pathtext = ''; |
210 | my $last; |
211 | foreach my $r ( @path ) { |
212 | unless ( $r->join_prior || !$last || $last->join_next ) { |
213 | $pathtext .= ' '; |
214 | } |
215 | $pathtext .= ( $normalize && $r->has_normal_form ) |
216 | ? $r->normal_form : $r->text; |
217 | $last = $r; |
218 | } |
219 | return $pathtext; |
220 | } |
221 | |
222 | # Utility function that actually calls the tree tagger. |
223 | sub _treetag_string { |
224 | my( $text, $lang ) = @_; |
225 | my $wittext = encode_utf8( $text ); |
226 | # Then see if we have TreeTagger |
227 | try { |
228 | load( 'Lingua::TreeTagger' ); |
229 | } catch { |
230 | warn "Cannot run TreeTagger without Lingua::TreeTagger module"; |
231 | return ''; |
232 | } |
233 | # OK, we can run it then. |
234 | # First upgrade to UTF8 for necessary languages. |
a3ef385d |
235 | my @utf8_supported = qw/ French /; |
236 | my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] ); |
e0f6836a |
237 | if( grep { $_ eq $lang } @utf8_supported ) { |
a3ef385d |
238 | $ttopts{'use_utf8'} = 1; |
e0f6836a |
239 | } |
240 | # Now instantiate and run the tagger. |
a3ef385d |
241 | my $tagger = Lingua::TreeTagger->new( %ttopts ); |
e0f6836a |
242 | my $tagresult = $tagger->tag_text( \$text ); |
243 | |
244 | # TODO maybe send the tokens back rather than the interpreted string... |
e0f6836a |
245 | return $tagresult->as_text(); |
246 | } |
247 | |
5271a011 |
248 | ## HACK function to correct for TagSet::TreeTagger brokenness |
249 | sub treetagger_struct { |
250 | my $pos = shift; |
251 | $pos =~ s/PREP/PRP/; |
252 | return Lingua::TagSet::TreeTagger->tag2structure( $pos ); |
253 | } |
254 | |
255 | sub multext_struct { |
256 | my $pos = shift; |
257 | # No known hacks needed |
258 | return Lingua::TagSet::Multext->tag2structure( $pos ); |
259 | } |
e0f6836a |
260 | |
75ae2b25 |
261 | =head2 lfs_morph_tags |
262 | |
263 | Return a data structure describing the available parts of speech and their attributes |
264 | from the Lingua::Features::Structure class currently defined. |
265 | |
266 | =cut |
267 | |
268 | sub lfs_morph_tags { |
269 | load('Lingua::Features::StructureType'); |
270 | my $tagset = { 'structures' => [], 'features' => {} }; |
271 | foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) { |
272 | my $tsstruct = { 'id' => $lfs->id, 'use_features' => [] }; |
273 | foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) { |
274 | my $ftype = $lfs->feature_type( $ftid ); |
275 | my $tfstruct = { 'id' => $ftid, 'values' => [] }; |
276 | foreach my $fval( $ftype->values ) { |
277 | push( @{$tfstruct->{'values'}}, |
278 | { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } ); |
279 | } |
280 | push( @{$tsstruct->{'use_features'}}, $ftid ); |
281 | $tagset->{'features'}->{$ftid} = $tfstruct; |
282 | } |
283 | push( @{$tagset->{'structures'}}, $tsstruct ); |
284 | } |
285 | return $tagset; |
286 | } |
287 | |
288 | sub _by_structid { |
289 | my( $a, $b ) = @_; |
290 | return -1 if $a eq 'cat'; |
291 | return 1 if $b eq 'cat'; |
292 | return $a cmp $b; |
293 | } |
294 | |
e0f6836a |
295 | 1; |
296 | |
297 | =head2 TODO |
298 | |
299 | =over |
300 | |
301 | =item * Handle package dependencies more gracefully |
302 | |
303 | =back |
304 | |
305 | =head1 LICENSE |
306 | |
307 | This package is free software and is provided "as is" without express |
308 | or implied warranty. You can redistribute it and/or modify it under |
309 | the same terms as Perl itself. |
310 | |
311 | =head1 AUTHOR |
312 | |
313 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |