introspect for morphology values; include these in help; make sure Perseus results...
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Base.pm
CommitLineData
e0f6836a 1package Text::Tradition::Language::Base;
2
3use strict;
4use warnings;
5use Encode qw/ encode_utf8 decode_utf8 /;
6use Exporter 'import';
7use vars qw/ @EXPORT_OK /;
8use IPC::Run qw/ run /;
9use Lingua::TagSet::Multext;
10use Lingua::TagSet::TreeTagger;
11use Module::Load;
12use Text::Tradition::Collation::Reading::Lexeme;
13use Text::Tradition::Collation::Reading::WordForm;
14use TryCatch;
15
5271a011 16@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct
75ae2b25 17 multext_struct lfs_morph_tags /;
e0f6836a 18
19=head1 NAME
20
21Text::Tradition::Language::Base - Base subroutines for lemmatization of words
22
23=head1 DESCRIPTION
24
25Common routines for applying morphological tagging to a Text::Tradition. Used
26with callbacks from the named language packages.
27
28=head1 SUBROUTINES
29
30=head2 lemmatize_treetagger( $tradition )
31
32Evaluates the tradition with the given options, and returns the results.
33
34=cut
35
36sub lemmatize_treetagger {
37 my( $tradition, %opts ) = @_;
38
39 # Given a tradition, lemmatize it witness by witness and see what we get.
40 my $c = $tradition->collation;
41 # First, clear out all existing lexemes from the readings.
42 my %witness_paths = _clear_reading_lexemes( $tradition );
43
44 foreach my $sig ( keys %witness_paths ) {
45 # Get the text as a sequence of readings and as a string
e0f6836a 46 my %witopts = (
47 'path' => $witness_paths{$sig},
48 %opts
49 );
50 _lemmatize_treetagger_sequence( %witopts );
51 }
52}
53
54sub _clear_reading_lexemes {
55 my $tradition = shift;
56 my $c = $tradition->collation;
57 # Clear out all existing lexemes from the readings. Save the path as long
58 # as we went to the trouble of generating it.
59 my %witness_paths;
60 foreach my $wit ( $tradition->witnesses ) {
61 my @sigla = ( $wit->sigil );
62 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
63 foreach my $sig ( @sigla ) {
64 my @path = grep { !$_->is_meta }
65 $c->reading_sequence( $c->start, $c->end, $sig );
66 map { $_->clear_lexemes } @path;
67 $witness_paths{$sig} = \@path;
68 }
69 }
70 return %witness_paths;
71}
72
73=head2 reading_lookup( $rdg[, $rdg, ...] )
74
75Looks up one or more readings using the Flemm package, and returns the
76possible results. This uses the same logic as L<lemmatize> above for the
77entire tradition, but can also be used to (re-)analyze individual readings.
78
79=cut
80
81sub reading_lookup_treetagger {
82 my %opts = @_;
83 $opts{'replace'} = 1;
84 return _lemmatize_treetagger_sequence( %opts );
85}
86
87sub _lemmatize_treetagger_sequence {
88 my %opts = @_;
89 my @path = @{$opts{'path'}};
90 my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
91 if( $tagresult ) {
92 # Map the tagged words onto the original readings, splitting
93 # them up into lexemes where necessary.
94 # NOTE we can have multiple lexemes in a reading, but not
95 # multiple readings to a lexeme.
96 my @tags = split( /\n/, $tagresult );
97 my @lexemes;
98 my $curr_rdg = shift @path;
99 my @curr_lexemes;
100 my $unused_rdg_part;
101 foreach my $tag ( @tags ) {
102 # Get the original word
103 my( $lexeme, @rest ) = split( /\t/, $tag );
104 # Lemmatize the whole
105 # TODO error trap this
106 my @forms = $opts{'callback'}( $tag );
107
108 my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
109 'string' => $lexeme, 'language' => $opts{'language'},
110 'wordform_matchlist' => \@forms );
111 # Find the next non-meta reading
112 while( $curr_rdg && $curr_rdg->is_meta ) {
113 $curr_rdg = shift @path;
114 }
115 unless( $curr_rdg ) {
116 warn "Ran out of readings in sequence at $lexeme";
117 last;
118 }
119 my $curr_rdg_text = $curr_rdg->has_normal_form
120 ? $curr_rdg->normal_form : $curr_rdg->text;
121 if( $unused_rdg_part &&
122 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
123 # Nth part of curr_rdg
124 $unused_rdg_part = $2;
125 push( @curr_lexemes, $lexobj );
126 } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
127 # Flag an error if there is already an unused reading part.
128 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
129 if $unused_rdg_part;
130 $unused_rdg_part = $2; # will be empty if the whole reading matched
131 push( @curr_lexemes, $lexobj );
132 } else {
133 # We do not cope with the idea of a lexeme being
134 # spread across multiple readings.
135 warn "Word sequence changed unexpectedly in text";
136 # See if we can find a matching reading
137 my @lookahead;
138 my $matched;
139 while( my $nr = shift @path ) {
140 my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
141 if( $nrtext =~ /^\Q$lexeme\E/ ) {
142 $curr_rdg = $lookahead[-1] if @lookahead;
143 $matched = 1;
144 last;
145 } else {
146 push( @lookahead, $nr );
147 }
148 }
149 # No match? Restore the state we had
150 unless( $matched ) {
151 unshift( @path, @lookahead );
152 }
153 # Trigger a move
154 $unused_rdg_part = '';
155 }
156
157 unless( $unused_rdg_part ) {
158 # Record the lexemes for the given reading.
159 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
160 # join( ' ', map { $_->string } @curr_lexemes ),
161 # $curr_rdg->id, $curr_rdg->text );
162 _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
163 $curr_rdg = shift @path;
164 @curr_lexemes = ();
165 }
166 }
167 }
168}
169
170sub _update_reading_lexemes {
171 my( $replace, $reading, @lexemes ) = @_;
172 if( $reading->has_lexemes && !$replace ) {
173 # We need to merge what is in @lexemes with what we have already.
174 my @oldlex = $reading->lexemes;
175 my $cmp1 = join( '||', map { $_->string } @oldlex );
176 my $cmp2 = join( '||', map { $_->string } @lexemes );
177 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
178 # The lexeme strings are the same, so merge the possible
179 # word forms from new to old.
180 foreach my $i ( 0 .. $#lexemes ) {
181 my $ol = $oldlex[$i];
182 my $nl = $lexemes[$i];
183 my %ofw;
184 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
185 foreach my $form ( $nl->matching_forms ) {
186 unless( $ofw{$form->to_string} ) {
a3ef385d 187 # print STDERR "Adding form " . $form->to_string .
188 # " to lexeme " . $nl->string . " at $reading\n";
e0f6836a 189 $ol->add_matching_form( $form );
190 $ol->is_disambiguated(0);
191 }
192 }
193 }
194 } else {
195 warn "Lexeme layout for $reading changed; replacing the lot";
196 $reading->clear_lexemes;
197 $reading->add_lexeme( @lexemes );
198 }
199 } else {
200 $reading->clear_lexemes if $replace;
201 $reading->add_lexeme( @lexemes );
202 }
203}
204
205# Utility function so that we can cheat and use it when we need both the path
206# and its text.
207sub _text_from_path {
208 my( $normalize, @path ) = @_;
209 my $pathtext = '';
210 my $last;
211 foreach my $r ( @path ) {
212 unless ( $r->join_prior || !$last || $last->join_next ) {
213 $pathtext .= ' ';
214 }
215 $pathtext .= ( $normalize && $r->has_normal_form )
216 ? $r->normal_form : $r->text;
217 $last = $r;
218 }
219 return $pathtext;
220}
221
222# Utility function that actually calls the tree tagger.
223sub _treetag_string {
224 my( $text, $lang ) = @_;
225 my $wittext = encode_utf8( $text );
226 # Then see if we have TreeTagger
227 try {
228 load( 'Lingua::TreeTagger' );
229 } catch {
230 warn "Cannot run TreeTagger without Lingua::TreeTagger module";
231 return '';
232 }
233 # OK, we can run it then.
234 # First upgrade to UTF8 for necessary languages.
a3ef385d 235 my @utf8_supported = qw/ French /;
236 my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
e0f6836a 237 if( grep { $_ eq $lang } @utf8_supported ) {
a3ef385d 238 $ttopts{'use_utf8'} = 1;
e0f6836a 239 }
240 # Now instantiate and run the tagger.
a3ef385d 241 my $tagger = Lingua::TreeTagger->new( %ttopts );
e0f6836a 242 my $tagresult = $tagger->tag_text( \$text );
243
244 # TODO maybe send the tokens back rather than the interpreted string...
e0f6836a 245 return $tagresult->as_text();
246}
247
5271a011 248## HACK function to correct for TagSet::TreeTagger brokenness
249sub treetagger_struct {
250 my $pos = shift;
251 $pos =~ s/PREP/PRP/;
252 return Lingua::TagSet::TreeTagger->tag2structure( $pos );
253}
254
255sub multext_struct {
256 my $pos = shift;
257 # No known hacks needed
258 return Lingua::TagSet::Multext->tag2structure( $pos );
259}
e0f6836a 260
75ae2b25 261=head2 lfs_morph_tags
262
263Return a data structure describing the available parts of speech and their attributes
264from the Lingua::Features::Structure class currently defined.
265
266=cut
267
268sub lfs_morph_tags {
269 load('Lingua::Features::StructureType');
270 my $tagset = { 'structures' => [], 'features' => {} };
271 foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
272 my $tsstruct = { 'id' => $lfs->id, 'use_features' => [] };
273 foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
274 my $ftype = $lfs->feature_type( $ftid );
275 my $tfstruct = { 'id' => $ftid, 'values' => [] };
276 foreach my $fval( $ftype->values ) {
277 push( @{$tfstruct->{'values'}},
278 { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
279 }
280 push( @{$tsstruct->{'use_features'}}, $ftid );
281 $tagset->{'features'}->{$ftid} = $tfstruct;
282 }
283 push( @{$tagset->{'structures'}}, $tsstruct );
284 }
285 return $tagset;
286}
287
288sub _by_structid {
289 my( $a, $b ) = @_;
290 return -1 if $a eq 'cat';
291 return 1 if $b eq 'cat';
292 return $a cmp $b;
293}
294
e0f6836a 2951;
296
297=head2 TODO
298
299=over
300
301=item * Handle package dependencies more gracefully
302
303=back
304
305=head1 LICENSE
306
307This package is free software and is provided "as is" without express
308or implied warranty. You can redistribute it and/or modify it under
309the same terms as Perl itself.
310
311=head1 AUTHOR
312
313Tara L Andrews E<lt>aurum@cpan.orgE<gt>