Commit | Line | Data |
e0f6836a |
1 | package Text::Tradition::Language::Base; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Encode qw/ encode_utf8 decode_utf8 /; |
6 | use Exporter 'import'; |
7 | use vars qw/ @EXPORT_OK /; |
8 | use IPC::Run qw/ run /; |
9 | use Lingua::TagSet::Multext; |
10 | use Lingua::TagSet::TreeTagger; |
11 | use Module::Load; |
12 | use Text::Tradition::Collation::Reading::Lexeme; |
13 | use Text::Tradition::Collation::Reading::WordForm; |
14 | use TryCatch; |
15 | |
16 | @EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger /; |
17 | |
18 | =head1 NAME |
19 | |
20 | Text::Tradition::Language::Base - Base subroutines for lemmatization of words |
21 | |
22 | =head1 DESCRIPTION |
23 | |
24 | Common routines for applying morphological tagging to a Text::Tradition. Used |
25 | with callbacks from the named language packages. |
26 | |
27 | =head1 SUBROUTINES |
28 | |
29 | =head2 lemmatize_treetagger( $tradition ) |
30 | |
31 | Evaluates the tradition with the given options, and returns the results. |
32 | |
33 | =cut |
34 | |
35 | sub lemmatize_treetagger { |
36 | my( $tradition, %opts ) = @_; |
37 | |
38 | # Given a tradition, lemmatize it witness by witness and see what we get. |
39 | my $c = $tradition->collation; |
40 | # First, clear out all existing lexemes from the readings. |
41 | my %witness_paths = _clear_reading_lexemes( $tradition ); |
42 | |
43 | foreach my $sig ( keys %witness_paths ) { |
44 | # Get the text as a sequence of readings and as a string |
e0f6836a |
45 | my %witopts = ( |
46 | 'path' => $witness_paths{$sig}, |
47 | %opts |
48 | ); |
49 | _lemmatize_treetagger_sequence( %witopts ); |
50 | } |
51 | } |
52 | |
53 | sub _clear_reading_lexemes { |
54 | my $tradition = shift; |
55 | my $c = $tradition->collation; |
56 | # Clear out all existing lexemes from the readings. Save the path as long |
57 | # as we went to the trouble of generating it. |
58 | my %witness_paths; |
59 | foreach my $wit ( $tradition->witnesses ) { |
60 | my @sigla = ( $wit->sigil ); |
61 | push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; |
62 | foreach my $sig ( @sigla ) { |
63 | my @path = grep { !$_->is_meta } |
64 | $c->reading_sequence( $c->start, $c->end, $sig ); |
65 | map { $_->clear_lexemes } @path; |
66 | $witness_paths{$sig} = \@path; |
67 | } |
68 | } |
69 | return %witness_paths; |
70 | } |
71 | |
72 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
73 | |
74 | Looks up one or more readings using the Flemm package, and returns the |
75 | possible results. This uses the same logic as L<lemmatize> above for the |
76 | entire tradition, but can also be used to (re-)analyze individual readings. |
77 | |
78 | =cut |
79 | |
80 | sub reading_lookup_treetagger { |
81 | my %opts = @_; |
82 | $opts{'replace'} = 1; |
83 | return _lemmatize_treetagger_sequence( %opts ); |
84 | } |
85 | |
86 | sub _lemmatize_treetagger_sequence { |
87 | my %opts = @_; |
88 | my @path = @{$opts{'path'}}; |
89 | my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} ); |
90 | if( $tagresult ) { |
91 | # Map the tagged words onto the original readings, splitting |
92 | # them up into lexemes where necessary. |
93 | # NOTE we can have multiple lexemes in a reading, but not |
94 | # multiple readings to a lexeme. |
95 | my @tags = split( /\n/, $tagresult ); |
96 | my @lexemes; |
97 | my $curr_rdg = shift @path; |
98 | my @curr_lexemes; |
99 | my $unused_rdg_part; |
100 | foreach my $tag ( @tags ) { |
101 | # Get the original word |
102 | my( $lexeme, @rest ) = split( /\t/, $tag ); |
103 | # Lemmatize the whole |
104 | # TODO error trap this |
105 | my @forms = $opts{'callback'}( $tag ); |
106 | |
107 | my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( |
108 | 'string' => $lexeme, 'language' => $opts{'language'}, |
109 | 'wordform_matchlist' => \@forms ); |
110 | # Find the next non-meta reading |
111 | while( $curr_rdg && $curr_rdg->is_meta ) { |
112 | $curr_rdg = shift @path; |
113 | } |
114 | unless( $curr_rdg ) { |
115 | warn "Ran out of readings in sequence at $lexeme"; |
116 | last; |
117 | } |
118 | my $curr_rdg_text = $curr_rdg->has_normal_form |
119 | ? $curr_rdg->normal_form : $curr_rdg->text; |
120 | if( $unused_rdg_part && |
121 | $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
122 | # Nth part of curr_rdg |
123 | $unused_rdg_part = $2; |
124 | push( @curr_lexemes, $lexobj ); |
125 | } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
126 | # Flag an error if there is already an unused reading part. |
127 | warn "Skipped over unused text $unused_rdg_part at $curr_rdg" |
128 | if $unused_rdg_part; |
129 | $unused_rdg_part = $2; # will be empty if the whole reading matched |
130 | push( @curr_lexemes, $lexobj ); |
131 | } else { |
132 | # We do not cope with the idea of a lexeme being |
133 | # spread across multiple readings. |
134 | warn "Word sequence changed unexpectedly in text"; |
135 | # See if we can find a matching reading |
136 | my @lookahead; |
137 | my $matched; |
138 | while( my $nr = shift @path ) { |
139 | my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; |
140 | if( $nrtext =~ /^\Q$lexeme\E/ ) { |
141 | $curr_rdg = $lookahead[-1] if @lookahead; |
142 | $matched = 1; |
143 | last; |
144 | } else { |
145 | push( @lookahead, $nr ); |
146 | } |
147 | } |
148 | # No match? Restore the state we had |
149 | unless( $matched ) { |
150 | unshift( @path, @lookahead ); |
151 | } |
152 | # Trigger a move |
153 | $unused_rdg_part = ''; |
154 | } |
155 | |
156 | unless( $unused_rdg_part ) { |
157 | # Record the lexemes for the given reading. |
158 | #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", |
159 | # join( ' ', map { $_->string } @curr_lexemes ), |
160 | # $curr_rdg->id, $curr_rdg->text ); |
161 | _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes ); |
162 | $curr_rdg = shift @path; |
163 | @curr_lexemes = (); |
164 | } |
165 | } |
166 | } |
167 | } |
168 | |
169 | sub _update_reading_lexemes { |
170 | my( $replace, $reading, @lexemes ) = @_; |
171 | if( $reading->has_lexemes && !$replace ) { |
172 | # We need to merge what is in @lexemes with what we have already. |
173 | my @oldlex = $reading->lexemes; |
174 | my $cmp1 = join( '||', map { $_->string } @oldlex ); |
175 | my $cmp2 = join( '||', map { $_->string } @lexemes ); |
176 | if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { |
177 | # The lexeme strings are the same, so merge the possible |
178 | # word forms from new to old. |
179 | foreach my $i ( 0 .. $#lexemes ) { |
180 | my $ol = $oldlex[$i]; |
181 | my $nl = $lexemes[$i]; |
182 | my %ofw; |
183 | map { $ofw{$_->to_string} = 1 } $ol->matching_forms; |
184 | foreach my $form ( $nl->matching_forms ) { |
185 | unless( $ofw{$form->to_string} ) { |
a3ef385d |
186 | # print STDERR "Adding form " . $form->to_string . |
187 | # " to lexeme " . $nl->string . " at $reading\n"; |
e0f6836a |
188 | $ol->add_matching_form( $form ); |
189 | $ol->is_disambiguated(0); |
190 | } |
191 | } |
192 | } |
193 | } else { |
194 | warn "Lexeme layout for $reading changed; replacing the lot"; |
195 | $reading->clear_lexemes; |
196 | $reading->add_lexeme( @lexemes ); |
197 | } |
198 | } else { |
199 | $reading->clear_lexemes if $replace; |
200 | $reading->add_lexeme( @lexemes ); |
201 | } |
202 | } |
203 | |
204 | # Utility function so that we can cheat and use it when we need both the path |
205 | # and its text. |
206 | sub _text_from_path { |
207 | my( $normalize, @path ) = @_; |
208 | my $pathtext = ''; |
209 | my $last; |
210 | foreach my $r ( @path ) { |
211 | unless ( $r->join_prior || !$last || $last->join_next ) { |
212 | $pathtext .= ' '; |
213 | } |
214 | $pathtext .= ( $normalize && $r->has_normal_form ) |
215 | ? $r->normal_form : $r->text; |
216 | $last = $r; |
217 | } |
218 | return $pathtext; |
219 | } |
220 | |
221 | # Utility function that actually calls the tree tagger. |
222 | sub _treetag_string { |
223 | my( $text, $lang ) = @_; |
224 | my $wittext = encode_utf8( $text ); |
225 | # Then see if we have TreeTagger |
226 | try { |
227 | load( 'Lingua::TreeTagger' ); |
228 | } catch { |
229 | warn "Cannot run TreeTagger without Lingua::TreeTagger module"; |
230 | return ''; |
231 | } |
232 | # OK, we can run it then. |
233 | # First upgrade to UTF8 for necessary languages. |
a3ef385d |
234 | my @utf8_supported = qw/ French /; |
235 | my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] ); |
e0f6836a |
236 | if( grep { $_ eq $lang } @utf8_supported ) { |
a3ef385d |
237 | $ttopts{'use_utf8'} = 1; |
e0f6836a |
238 | } |
239 | # Now instantiate and run the tagger. |
a3ef385d |
240 | my $tagger = Lingua::TreeTagger->new( %ttopts ); |
e0f6836a |
241 | my $tagresult = $tagger->tag_text( \$text ); |
242 | |
243 | # TODO maybe send the tokens back rather than the interpreted string... |
e0f6836a |
244 | return $tagresult->as_text(); |
245 | } |
246 | |
247 | |
248 | 1; |
249 | |
250 | =head2 TODO |
251 | |
252 | =over |
253 | |
254 | =item * Handle package dependencies more gracefully |
255 | |
256 | =back |
257 | |
258 | =head1 LICENSE |
259 | |
260 | This package is free software and is provided "as is" without express |
261 | or implied warranty. You can redistribute it and/or modify it under |
262 | the same terms as Perl itself. |
263 | |
264 | =head1 AUTHOR |
265 | |
266 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |