use modified Lingua::TreeTagger
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Base.pm
CommitLineData
e0f6836a 1package Text::Tradition::Language::Base;
2
3use strict;
4use warnings;
5use Encode qw/ encode_utf8 decode_utf8 /;
6use Exporter 'import';
7use vars qw/ @EXPORT_OK /;
8use IPC::Run qw/ run /;
9use Lingua::TagSet::Multext;
10use Lingua::TagSet::TreeTagger;
11use Module::Load;
12use Text::Tradition::Collation::Reading::Lexeme;
13use Text::Tradition::Collation::Reading::WordForm;
14use TryCatch;
15
16@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger /;
17
18=head1 NAME
19
20Text::Tradition::Language::Base - Base subroutines for lemmatization of words
21
22=head1 DESCRIPTION
23
24Common routines for applying morphological tagging to a Text::Tradition. Used
25with callbacks from the named language packages.
26
27=head1 SUBROUTINES
28
29=head2 lemmatize_treetagger( $tradition )
30
31Evaluates the tradition with the given options, and returns the results.
32
33=cut
34
35sub lemmatize_treetagger {
36 my( $tradition, %opts ) = @_;
37
38 # Given a tradition, lemmatize it witness by witness and see what we get.
39 my $c = $tradition->collation;
40 # First, clear out all existing lexemes from the readings.
41 my %witness_paths = _clear_reading_lexemes( $tradition );
42
43 foreach my $sig ( keys %witness_paths ) {
44 # Get the text as a sequence of readings and as a string
e0f6836a 45 my %witopts = (
46 'path' => $witness_paths{$sig},
47 %opts
48 );
49 _lemmatize_treetagger_sequence( %witopts );
50 }
51}
52
53sub _clear_reading_lexemes {
54 my $tradition = shift;
55 my $c = $tradition->collation;
56 # Clear out all existing lexemes from the readings. Save the path as long
57 # as we went to the trouble of generating it.
58 my %witness_paths;
59 foreach my $wit ( $tradition->witnesses ) {
60 my @sigla = ( $wit->sigil );
61 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
62 foreach my $sig ( @sigla ) {
63 my @path = grep { !$_->is_meta }
64 $c->reading_sequence( $c->start, $c->end, $sig );
65 map { $_->clear_lexemes } @path;
66 $witness_paths{$sig} = \@path;
67 }
68 }
69 return %witness_paths;
70}
71
72=head2 reading_lookup( $rdg[, $rdg, ...] )
73
74Looks up one or more readings using the Flemm package, and returns the
75possible results. This uses the same logic as L<lemmatize> above for the
76entire tradition, but can also be used to (re-)analyze individual readings.
77
78=cut
79
80sub reading_lookup_treetagger {
81 my %opts = @_;
82 $opts{'replace'} = 1;
83 return _lemmatize_treetagger_sequence( %opts );
84}
85
86sub _lemmatize_treetagger_sequence {
87 my %opts = @_;
88 my @path = @{$opts{'path'}};
89 my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
90 if( $tagresult ) {
91 # Map the tagged words onto the original readings, splitting
92 # them up into lexemes where necessary.
93 # NOTE we can have multiple lexemes in a reading, but not
94 # multiple readings to a lexeme.
95 my @tags = split( /\n/, $tagresult );
96 my @lexemes;
97 my $curr_rdg = shift @path;
98 my @curr_lexemes;
99 my $unused_rdg_part;
100 foreach my $tag ( @tags ) {
101 # Get the original word
102 my( $lexeme, @rest ) = split( /\t/, $tag );
103 # Lemmatize the whole
104 # TODO error trap this
105 my @forms = $opts{'callback'}( $tag );
106
107 my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
108 'string' => $lexeme, 'language' => $opts{'language'},
109 'wordform_matchlist' => \@forms );
110 # Find the next non-meta reading
111 while( $curr_rdg && $curr_rdg->is_meta ) {
112 $curr_rdg = shift @path;
113 }
114 unless( $curr_rdg ) {
115 warn "Ran out of readings in sequence at $lexeme";
116 last;
117 }
118 my $curr_rdg_text = $curr_rdg->has_normal_form
119 ? $curr_rdg->normal_form : $curr_rdg->text;
120 if( $unused_rdg_part &&
121 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
122 # Nth part of curr_rdg
123 $unused_rdg_part = $2;
124 push( @curr_lexemes, $lexobj );
125 } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
126 # Flag an error if there is already an unused reading part.
127 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
128 if $unused_rdg_part;
129 $unused_rdg_part = $2; # will be empty if the whole reading matched
130 push( @curr_lexemes, $lexobj );
131 } else {
132 # We do not cope with the idea of a lexeme being
133 # spread across multiple readings.
134 warn "Word sequence changed unexpectedly in text";
135 # See if we can find a matching reading
136 my @lookahead;
137 my $matched;
138 while( my $nr = shift @path ) {
139 my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
140 if( $nrtext =~ /^\Q$lexeme\E/ ) {
141 $curr_rdg = $lookahead[-1] if @lookahead;
142 $matched = 1;
143 last;
144 } else {
145 push( @lookahead, $nr );
146 }
147 }
148 # No match? Restore the state we had
149 unless( $matched ) {
150 unshift( @path, @lookahead );
151 }
152 # Trigger a move
153 $unused_rdg_part = '';
154 }
155
156 unless( $unused_rdg_part ) {
157 # Record the lexemes for the given reading.
158 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
159 # join( ' ', map { $_->string } @curr_lexemes ),
160 # $curr_rdg->id, $curr_rdg->text );
161 _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
162 $curr_rdg = shift @path;
163 @curr_lexemes = ();
164 }
165 }
166 }
167}
168
169sub _update_reading_lexemes {
170 my( $replace, $reading, @lexemes ) = @_;
171 if( $reading->has_lexemes && !$replace ) {
172 # We need to merge what is in @lexemes with what we have already.
173 my @oldlex = $reading->lexemes;
174 my $cmp1 = join( '||', map { $_->string } @oldlex );
175 my $cmp2 = join( '||', map { $_->string } @lexemes );
176 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
177 # The lexeme strings are the same, so merge the possible
178 # word forms from new to old.
179 foreach my $i ( 0 .. $#lexemes ) {
180 my $ol = $oldlex[$i];
181 my $nl = $lexemes[$i];
182 my %ofw;
183 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
184 foreach my $form ( $nl->matching_forms ) {
185 unless( $ofw{$form->to_string} ) {
a3ef385d 186 # print STDERR "Adding form " . $form->to_string .
187 # " to lexeme " . $nl->string . " at $reading\n";
e0f6836a 188 $ol->add_matching_form( $form );
189 $ol->is_disambiguated(0);
190 }
191 }
192 }
193 } else {
194 warn "Lexeme layout for $reading changed; replacing the lot";
195 $reading->clear_lexemes;
196 $reading->add_lexeme( @lexemes );
197 }
198 } else {
199 $reading->clear_lexemes if $replace;
200 $reading->add_lexeme( @lexemes );
201 }
202}
203
204# Utility function so that we can cheat and use it when we need both the path
205# and its text.
206sub _text_from_path {
207 my( $normalize, @path ) = @_;
208 my $pathtext = '';
209 my $last;
210 foreach my $r ( @path ) {
211 unless ( $r->join_prior || !$last || $last->join_next ) {
212 $pathtext .= ' ';
213 }
214 $pathtext .= ( $normalize && $r->has_normal_form )
215 ? $r->normal_form : $r->text;
216 $last = $r;
217 }
218 return $pathtext;
219}
220
221# Utility function that actually calls the tree tagger.
222sub _treetag_string {
223 my( $text, $lang ) = @_;
224 my $wittext = encode_utf8( $text );
225 # Then see if we have TreeTagger
226 try {
227 load( 'Lingua::TreeTagger' );
228 } catch {
229 warn "Cannot run TreeTagger without Lingua::TreeTagger module";
230 return '';
231 }
232 # OK, we can run it then.
233 # First upgrade to UTF8 for necessary languages.
a3ef385d 234 my @utf8_supported = qw/ French /;
235 my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
e0f6836a 236 if( grep { $_ eq $lang } @utf8_supported ) {
a3ef385d 237 $ttopts{'use_utf8'} = 1;
e0f6836a 238 }
239 # Now instantiate and run the tagger.
a3ef385d 240 my $tagger = Lingua::TreeTagger->new( %ttopts );
e0f6836a 241 my $tagresult = $tagger->tag_text( \$text );
242
243 # TODO maybe send the tokens back rather than the interpreted string...
e0f6836a 244 return $tagresult->as_text();
245}
246
247
2481;
249
250=head2 TODO
251
252=over
253
254=item * Handle package dependencies more gracefully
255
256=back
257
258=head1 LICENSE
259
260This package is free software and is provided "as is" without express
261or implied warranty. You can redistribute it and/or modify it under
262the same terms as Perl itself.
263
264=head1 AUTHOR
265
266Tara L Andrews E<lt>aurum@cpan.orgE<gt>