Commit | Line | Data |
e0f6836a |
1 | package Text::Tradition::Language::Base; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Encode qw/ encode_utf8 decode_utf8 /; |
6 | use Exporter 'import'; |
7 | use vars qw/ @EXPORT_OK /; |
8 | use IPC::Run qw/ run /; |
e0f6836a |
9 | use Module::Load; |
10 | use Text::Tradition::Collation::Reading::Lexeme; |
11 | use Text::Tradition::Collation::Reading::WordForm; |
12 | use TryCatch; |
13 | |
f8862b58 |
14 | @EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; |
e0f6836a |
15 | |
16 | =head1 NAME |
17 | |
18 | Text::Tradition::Language::Base - Base subroutines for lemmatization of words |
19 | |
20 | =head1 DESCRIPTION |
21 | |
22 | Common routines for applying morphological tagging to a Text::Tradition. Used |
23 | with callbacks from the named language packages. |
24 | |
25 | =head1 SUBROUTINES |
26 | |
27 | =head2 lemmatize_treetagger( $tradition ) |
28 | |
29 | Evaluates the tradition with the given options, and returns the results. |
30 | |
31 | =cut |
32 | |
33 | sub lemmatize_treetagger { |
34 | my( $tradition, %opts ) = @_; |
35 | |
36 | # Given a tradition, lemmatize it witness by witness and see what we get. |
37 | my $c = $tradition->collation; |
38 | # First, clear out all existing lexemes from the readings. |
39 | my %witness_paths = _clear_reading_lexemes( $tradition ); |
40 | |
41 | foreach my $sig ( keys %witness_paths ) { |
42 | # Get the text as a sequence of readings and as a string |
e0f6836a |
43 | my %witopts = ( |
44 | 'path' => $witness_paths{$sig}, |
45 | %opts |
46 | ); |
47 | _lemmatize_treetagger_sequence( %witopts ); |
48 | } |
49 | } |
50 | |
51 | sub _clear_reading_lexemes { |
52 | my $tradition = shift; |
53 | my $c = $tradition->collation; |
54 | # Clear out all existing lexemes from the readings. Save the path as long |
55 | # as we went to the trouble of generating it. |
56 | my %witness_paths; |
57 | foreach my $wit ( $tradition->witnesses ) { |
58 | my @sigla = ( $wit->sigil ); |
59 | push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; |
60 | foreach my $sig ( @sigla ) { |
61 | my @path = grep { !$_->is_meta } |
62 | $c->reading_sequence( $c->start, $c->end, $sig ); |
63 | map { $_->clear_lexemes } @path; |
64 | $witness_paths{$sig} = \@path; |
65 | } |
66 | } |
67 | return %witness_paths; |
68 | } |
69 | |
70 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
71 | |
72 | Looks up one or more readings using the Flemm package, and returns the |
73 | possible results. This uses the same logic as L<lemmatize> above for the |
74 | entire tradition, but can also be used to (re-)analyze individual readings. |
75 | |
76 | =cut |
77 | |
78 | sub reading_lookup_treetagger { |
79 | my %opts = @_; |
80 | $opts{'replace'} = 1; |
81 | return _lemmatize_treetagger_sequence( %opts ); |
82 | } |
83 | |
84 | sub _lemmatize_treetagger_sequence { |
85 | my %opts = @_; |
86 | my @path = @{$opts{'path'}}; |
87 | my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} ); |
88 | if( $tagresult ) { |
89 | # Map the tagged words onto the original readings, splitting |
90 | # them up into lexemes where necessary. |
91 | # NOTE we can have multiple lexemes in a reading, but not |
92 | # multiple readings to a lexeme. |
93 | my @tags = split( /\n/, $tagresult ); |
94 | my @lexemes; |
95 | my $curr_rdg = shift @path; |
96 | my @curr_lexemes; |
97 | my $unused_rdg_part; |
98 | foreach my $tag ( @tags ) { |
99 | # Get the original word |
100 | my( $lexeme, @rest ) = split( /\t/, $tag ); |
101 | # Lemmatize the whole |
102 | # TODO error trap this |
103 | my @forms = $opts{'callback'}( $tag ); |
104 | |
105 | my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( |
106 | 'string' => $lexeme, 'language' => $opts{'language'}, |
107 | 'wordform_matchlist' => \@forms ); |
108 | # Find the next non-meta reading |
109 | while( $curr_rdg && $curr_rdg->is_meta ) { |
110 | $curr_rdg = shift @path; |
111 | } |
112 | unless( $curr_rdg ) { |
113 | warn "Ran out of readings in sequence at $lexeme"; |
114 | last; |
115 | } |
367e901b |
116 | my $curr_rdg_text = $curr_rdg->normal_form; |
e0f6836a |
117 | if( $unused_rdg_part && |
118 | $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
119 | # Nth part of curr_rdg |
120 | $unused_rdg_part = $2; |
121 | push( @curr_lexemes, $lexobj ); |
122 | } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { |
123 | # Flag an error if there is already an unused reading part. |
124 | warn "Skipped over unused text $unused_rdg_part at $curr_rdg" |
125 | if $unused_rdg_part; |
126 | $unused_rdg_part = $2; # will be empty if the whole reading matched |
127 | push( @curr_lexemes, $lexobj ); |
128 | } else { |
129 | # We do not cope with the idea of a lexeme being |
130 | # spread across multiple readings. |
131 | warn "Word sequence changed unexpectedly in text"; |
132 | # See if we can find a matching reading |
133 | my @lookahead; |
134 | my $matched; |
135 | while( my $nr = shift @path ) { |
367e901b |
136 | my $nrtext = $nr->normal_form; |
e0f6836a |
137 | if( $nrtext =~ /^\Q$lexeme\E/ ) { |
138 | $curr_rdg = $lookahead[-1] if @lookahead; |
139 | $matched = 1; |
140 | last; |
141 | } else { |
142 | push( @lookahead, $nr ); |
143 | } |
144 | } |
145 | # No match? Restore the state we had |
146 | unless( $matched ) { |
147 | unshift( @path, @lookahead ); |
148 | } |
149 | # Trigger a move |
150 | $unused_rdg_part = ''; |
151 | } |
152 | |
153 | unless( $unused_rdg_part ) { |
154 | # Record the lexemes for the given reading. |
155 | #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", |
156 | # join( ' ', map { $_->string } @curr_lexemes ), |
157 | # $curr_rdg->id, $curr_rdg->text ); |
158 | _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes ); |
159 | $curr_rdg = shift @path; |
160 | @curr_lexemes = (); |
161 | } |
162 | } |
163 | } |
164 | } |
165 | |
166 | sub _update_reading_lexemes { |
167 | my( $replace, $reading, @lexemes ) = @_; |
168 | if( $reading->has_lexemes && !$replace ) { |
169 | # We need to merge what is in @lexemes with what we have already. |
170 | my @oldlex = $reading->lexemes; |
171 | my $cmp1 = join( '||', map { $_->string } @oldlex ); |
172 | my $cmp2 = join( '||', map { $_->string } @lexemes ); |
173 | if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { |
174 | # The lexeme strings are the same, so merge the possible |
175 | # word forms from new to old. |
176 | foreach my $i ( 0 .. $#lexemes ) { |
177 | my $ol = $oldlex[$i]; |
178 | my $nl = $lexemes[$i]; |
179 | my %ofw; |
180 | map { $ofw{$_->to_string} = 1 } $ol->matching_forms; |
181 | foreach my $form ( $nl->matching_forms ) { |
182 | unless( $ofw{$form->to_string} ) { |
a3ef385d |
183 | # print STDERR "Adding form " . $form->to_string . |
184 | # " to lexeme " . $nl->string . " at $reading\n"; |
e0f6836a |
185 | $ol->add_matching_form( $form ); |
186 | $ol->is_disambiguated(0); |
187 | } |
188 | } |
189 | } |
190 | } else { |
191 | warn "Lexeme layout for $reading changed; replacing the lot"; |
192 | $reading->clear_lexemes; |
193 | $reading->add_lexeme( @lexemes ); |
194 | } |
195 | } else { |
196 | $reading->clear_lexemes if $replace; |
197 | $reading->add_lexeme( @lexemes ); |
198 | } |
199 | } |
200 | |
201 | # Utility function so that we can cheat and use it when we need both the path |
202 | # and its text. |
203 | sub _text_from_path { |
204 | my( $normalize, @path ) = @_; |
205 | my $pathtext = ''; |
206 | my $last; |
207 | foreach my $r ( @path ) { |
208 | unless ( $r->join_prior || !$last || $last->join_next ) { |
209 | $pathtext .= ' '; |
210 | } |
367e901b |
211 | $pathtext .= $normalize ? $r->normal_form : $r->text; |
e0f6836a |
212 | $last = $r; |
213 | } |
214 | return $pathtext; |
215 | } |
216 | |
217 | # Utility function that actually calls the tree tagger. |
218 | sub _treetag_string { |
219 | my( $text, $lang ) = @_; |
220 | my $wittext = encode_utf8( $text ); |
221 | # Then see if we have TreeTagger |
222 | try { |
223 | load( 'Lingua::TreeTagger' ); |
224 | } catch { |
225 | warn "Cannot run TreeTagger without Lingua::TreeTagger module"; |
226 | return ''; |
227 | } |
228 | # OK, we can run it then. |
229 | # First upgrade to UTF8 for necessary languages. |
ad2a2c47 |
230 | my @utf8_supported = qw/ French Latin Greek /; |
a3ef385d |
231 | my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] ); |
e0f6836a |
232 | if( grep { $_ eq $lang } @utf8_supported ) { |
a3ef385d |
233 | $ttopts{'use_utf8'} = 1; |
e0f6836a |
234 | } |
235 | # Now instantiate and run the tagger. |
a3ef385d |
236 | my $tagger = Lingua::TreeTagger->new( %ttopts ); |
e0f6836a |
237 | my $tagresult = $tagger->tag_text( \$text ); |
238 | |
239 | # TODO maybe send the tokens back rather than the interpreted string... |
e0f6836a |
240 | return $tagresult->as_text(); |
241 | } |
242 | |
75ae2b25 |
243 | =head2 lfs_morph_tags |
244 | |
245 | Return a data structure describing the available parts of speech and their attributes |
246 | from the Lingua::Features::Structure class currently defined. |
247 | |
248 | =cut |
249 | |
250 | sub lfs_morph_tags { |
251 | load('Lingua::Features::StructureType'); |
252 | my $tagset = { 'structures' => [], 'features' => {} }; |
253 | foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) { |
f8862b58 |
254 | my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] }; |
75ae2b25 |
255 | foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) { |
256 | my $ftype = $lfs->feature_type( $ftid ); |
f8862b58 |
257 | if( !$ftype && $lfs->base ) { |
258 | $ftype = $lfs->base->feature_type( $ftid ); |
259 | } |
260 | if( $ftype ) { |
261 | push( @{$tsstruct->{'use_features'}}, $ftid ); |
262 | if( $ftid eq 'type' ) { |
263 | # Type values change according to category |
264 | $ftid .= " (" . $lfs->id . ")"; |
265 | } |
266 | my $tfstruct = { 'id' => $ftid, 'values' => [] }; |
267 | foreach my $fval( $ftype->values ) { |
268 | push( @{$tfstruct->{'values'}}, |
269 | { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } ); |
270 | } |
271 | $tagset->{'features'}->{$ftid} = $tfstruct; |
75ae2b25 |
272 | } |
75ae2b25 |
273 | } |
274 | push( @{$tagset->{'structures'}}, $tsstruct ); |
275 | } |
276 | return $tagset; |
277 | } |
278 | |
279 | sub _by_structid { |
280 | my( $a, $b ) = @_; |
281 | return -1 if $a eq 'cat'; |
282 | return 1 if $b eq 'cat'; |
283 | return $a cmp $b; |
284 | } |
285 | |
e0f6836a |
286 | 1; |
287 | |
288 | =head2 TODO |
289 | |
290 | =over |
291 | |
292 | =item * Handle package dependencies more gracefully |
293 | |
294 | =back |
295 | |