incorporate user auth functionality
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Base.pm
CommitLineData
e0f6836a 1package Text::Tradition::Language::Base;
2
3use strict;
4use warnings;
5use Encode qw/ encode_utf8 decode_utf8 /;
6use Exporter 'import';
7use vars qw/ @EXPORT_OK /;
8use IPC::Run qw/ run /;
e0f6836a 9use Module::Load;
10use Text::Tradition::Collation::Reading::Lexeme;
11use Text::Tradition::Collation::Reading::WordForm;
12use TryCatch;
13
f8862b58 14@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
e0f6836a 15
16=head1 NAME
17
18Text::Tradition::Language::Base - Base subroutines for lemmatization of words
19
20=head1 DESCRIPTION
21
22Common routines for applying morphological tagging to a Text::Tradition. Used
23with callbacks from the named language packages.
24
25=head1 SUBROUTINES
26
27=head2 lemmatize_treetagger( $tradition )
28
29Evaluates the tradition with the given options, and returns the results.
30
31=cut
32
33sub lemmatize_treetagger {
34 my( $tradition, %opts ) = @_;
35
36 # Given a tradition, lemmatize it witness by witness and see what we get.
37 my $c = $tradition->collation;
38 # First, clear out all existing lexemes from the readings.
39 my %witness_paths = _clear_reading_lexemes( $tradition );
40
41 foreach my $sig ( keys %witness_paths ) {
42 # Get the text as a sequence of readings and as a string
e0f6836a 43 my %witopts = (
44 'path' => $witness_paths{$sig},
45 %opts
46 );
47 _lemmatize_treetagger_sequence( %witopts );
48 }
49}
50
51sub _clear_reading_lexemes {
52 my $tradition = shift;
53 my $c = $tradition->collation;
54 # Clear out all existing lexemes from the readings. Save the path as long
55 # as we went to the trouble of generating it.
56 my %witness_paths;
57 foreach my $wit ( $tradition->witnesses ) {
58 my @sigla = ( $wit->sigil );
59 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
60 foreach my $sig ( @sigla ) {
61 my @path = grep { !$_->is_meta }
62 $c->reading_sequence( $c->start, $c->end, $sig );
63 map { $_->clear_lexemes } @path;
64 $witness_paths{$sig} = \@path;
65 }
66 }
67 return %witness_paths;
68}
69
70=head2 reading_lookup( $rdg[, $rdg, ...] )
71
72Looks up one or more readings using the Flemm package, and returns the
73possible results. This uses the same logic as L<lemmatize> above for the
74entire tradition, but can also be used to (re-)analyze individual readings.
75
76=cut
77
78sub reading_lookup_treetagger {
79 my %opts = @_;
80 $opts{'replace'} = 1;
81 return _lemmatize_treetagger_sequence( %opts );
82}
83
84sub _lemmatize_treetagger_sequence {
85 my %opts = @_;
86 my @path = @{$opts{'path'}};
87 my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
88 if( $tagresult ) {
89 # Map the tagged words onto the original readings, splitting
90 # them up into lexemes where necessary.
91 # NOTE we can have multiple lexemes in a reading, but not
92 # multiple readings to a lexeme.
93 my @tags = split( /\n/, $tagresult );
94 my @lexemes;
95 my $curr_rdg = shift @path;
96 my @curr_lexemes;
97 my $unused_rdg_part;
98 foreach my $tag ( @tags ) {
99 # Get the original word
100 my( $lexeme, @rest ) = split( /\t/, $tag );
101 # Lemmatize the whole
102 # TODO error trap this
103 my @forms = $opts{'callback'}( $tag );
104
105 my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
106 'string' => $lexeme, 'language' => $opts{'language'},
107 'wordform_matchlist' => \@forms );
108 # Find the next non-meta reading
109 while( $curr_rdg && $curr_rdg->is_meta ) {
110 $curr_rdg = shift @path;
111 }
112 unless( $curr_rdg ) {
113 warn "Ran out of readings in sequence at $lexeme";
114 last;
115 }
367e901b 116 my $curr_rdg_text = $curr_rdg->normal_form;
e0f6836a 117 if( $unused_rdg_part &&
118 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
119 # Nth part of curr_rdg
120 $unused_rdg_part = $2;
121 push( @curr_lexemes, $lexobj );
122 } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
123 # Flag an error if there is already an unused reading part.
124 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
125 if $unused_rdg_part;
126 $unused_rdg_part = $2; # will be empty if the whole reading matched
127 push( @curr_lexemes, $lexobj );
128 } else {
129 # We do not cope with the idea of a lexeme being
130 # spread across multiple readings.
131 warn "Word sequence changed unexpectedly in text";
132 # See if we can find a matching reading
133 my @lookahead;
134 my $matched;
135 while( my $nr = shift @path ) {
367e901b 136 my $nrtext = $nr->normal_form;
e0f6836a 137 if( $nrtext =~ /^\Q$lexeme\E/ ) {
138 $curr_rdg = $lookahead[-1] if @lookahead;
139 $matched = 1;
140 last;
141 } else {
142 push( @lookahead, $nr );
143 }
144 }
145 # No match? Restore the state we had
146 unless( $matched ) {
147 unshift( @path, @lookahead );
148 }
149 # Trigger a move
150 $unused_rdg_part = '';
151 }
152
153 unless( $unused_rdg_part ) {
154 # Record the lexemes for the given reading.
155 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
156 # join( ' ', map { $_->string } @curr_lexemes ),
157 # $curr_rdg->id, $curr_rdg->text );
158 _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
159 $curr_rdg = shift @path;
160 @curr_lexemes = ();
161 }
162 }
163 }
164}
165
166sub _update_reading_lexemes {
167 my( $replace, $reading, @lexemes ) = @_;
168 if( $reading->has_lexemes && !$replace ) {
169 # We need to merge what is in @lexemes with what we have already.
170 my @oldlex = $reading->lexemes;
171 my $cmp1 = join( '||', map { $_->string } @oldlex );
172 my $cmp2 = join( '||', map { $_->string } @lexemes );
173 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
174 # The lexeme strings are the same, so merge the possible
175 # word forms from new to old.
176 foreach my $i ( 0 .. $#lexemes ) {
177 my $ol = $oldlex[$i];
178 my $nl = $lexemes[$i];
179 my %ofw;
180 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
181 foreach my $form ( $nl->matching_forms ) {
182 unless( $ofw{$form->to_string} ) {
a3ef385d 183 # print STDERR "Adding form " . $form->to_string .
184 # " to lexeme " . $nl->string . " at $reading\n";
e0f6836a 185 $ol->add_matching_form( $form );
186 $ol->is_disambiguated(0);
187 }
188 }
189 }
190 } else {
191 warn "Lexeme layout for $reading changed; replacing the lot";
192 $reading->clear_lexemes;
193 $reading->add_lexeme( @lexemes );
194 }
195 } else {
196 $reading->clear_lexemes if $replace;
197 $reading->add_lexeme( @lexemes );
198 }
199}
200
201# Utility function so that we can cheat and use it when we need both the path
202# and its text.
203sub _text_from_path {
204 my( $normalize, @path ) = @_;
205 my $pathtext = '';
206 my $last;
207 foreach my $r ( @path ) {
208 unless ( $r->join_prior || !$last || $last->join_next ) {
209 $pathtext .= ' ';
210 }
367e901b 211 $pathtext .= $normalize ? $r->normal_form : $r->text;
e0f6836a 212 $last = $r;
213 }
214 return $pathtext;
215}
216
217# Utility function that actually calls the tree tagger.
218sub _treetag_string {
219 my( $text, $lang ) = @_;
220 my $wittext = encode_utf8( $text );
221 # Then see if we have TreeTagger
222 try {
223 load( 'Lingua::TreeTagger' );
224 } catch {
225 warn "Cannot run TreeTagger without Lingua::TreeTagger module";
226 return '';
227 }
228 # OK, we can run it then.
229 # First upgrade to UTF8 for necessary languages.
ad2a2c47 230 my @utf8_supported = qw/ French Latin Greek /;
a3ef385d 231 my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
e0f6836a 232 if( grep { $_ eq $lang } @utf8_supported ) {
a3ef385d 233 $ttopts{'use_utf8'} = 1;
e0f6836a 234 }
235 # Now instantiate and run the tagger.
a3ef385d 236 my $tagger = Lingua::TreeTagger->new( %ttopts );
e0f6836a 237 my $tagresult = $tagger->tag_text( \$text );
238
239 # TODO maybe send the tokens back rather than the interpreted string...
e0f6836a 240 return $tagresult->as_text();
241}
242
75ae2b25 243=head2 lfs_morph_tags
244
245Return a data structure describing the available parts of speech and their attributes
246from the Lingua::Features::Structure class currently defined.
247
248=cut
249
250sub lfs_morph_tags {
251 load('Lingua::Features::StructureType');
252 my $tagset = { 'structures' => [], 'features' => {} };
253 foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
f8862b58 254 my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] };
75ae2b25 255 foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
256 my $ftype = $lfs->feature_type( $ftid );
f8862b58 257 if( !$ftype && $lfs->base ) {
258 $ftype = $lfs->base->feature_type( $ftid );
259 }
260 if( $ftype ) {
261 push( @{$tsstruct->{'use_features'}}, $ftid );
262 if( $ftid eq 'type' ) {
263 # Type values change according to category
264 $ftid .= " (" . $lfs->id . ")";
265 }
266 my $tfstruct = { 'id' => $ftid, 'values' => [] };
267 foreach my $fval( $ftype->values ) {
268 push( @{$tfstruct->{'values'}},
269 { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
270 }
271 $tagset->{'features'}->{$ftid} = $tfstruct;
75ae2b25 272 }
75ae2b25 273 }
274 push( @{$tagset->{'structures'}}, $tsstruct );
275 }
276 return $tagset;
277}
278
279sub _by_structid {
280 my( $a, $b ) = @_;
281 return -1 if $a eq 'cat';
282 return 1 if $b eq 'cat';
283 return $a cmp $b;
284}
285
e0f6836a 2861;
287
288=head2 TODO
289
290=over
291
292=item * Handle package dependencies more gracefully
293
294=back
295
296=head1 LICENSE
297
298This package is free software and is provided "as is" without express
299or implied warranty. You can redistribute it and/or modify it under
300the same terms as Perl itself.
301
302=head1 AUTHOR
303
304Tara L Andrews E<lt>aurum@cpan.orgE<gt>