lib/Text/Tradition/Language/Base.pm

   1 package Text::Tradition::Language::Base;
   2
   3 use strict;
   4 use warnings;
   5 use Encode qw/ encode_utf8 decode_utf8 /;
   6 use Exporter 'import';
   7 use vars qw/ @EXPORT_OK /;
   8 use IPC::Run qw/ run /;
   9 use Lingua::TagSet::Multext;
  10 use Lingua::TagSet::TreeTagger;
  11 use Module::Load;
  12 use Text::Tradition::Collation::Reading::Lexeme;
  13 use Text::Tradition::Collation::Reading::WordForm;
  14 use TryCatch;
  15
  16 @EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger /;
  17
  18 =head1 NAME
  19
  20 Text::Tradition::Language::Base - Base subroutines for lemmatization of words
  21
  22 =head1 DESCRIPTION
  23
  24 Common routines for applying morphological tagging to a Text::Tradition. Used
  25 with callbacks from the named language packages.
  26
  27 =head1 SUBROUTINES
  28
  29 =head2 lemmatize_treetagger( $tradition )
  30
  31 Evaluates the tradition with the given options, and returns the results.
  32
  33 =cut
  34
  35 sub lemmatize_treetagger {
  36         my( $tradition, %opts ) = @_;
  37
  38         # Given a tradition, lemmatize it witness by witness and see what we get.
  39         my $c = $tradition->collation;
  40         # First, clear out all existing lexemes from the readings.
  41         my %witness_paths = _clear_reading_lexemes( $tradition );
  42
  43         foreach my $sig ( keys %witness_paths ) {
  44                 # Get the text as a sequence of readings and as a string
  45                 print STDERR "Morphologizing witness $sig\n";
  46                 my %witopts = (
  47                         'path' => $witness_paths{$sig},
  48                         %opts
  49                         );
  50                 _lemmatize_treetagger_sequence( %witopts );
  51         }
  52 }
  53
  54 sub _clear_reading_lexemes {
  55         my $tradition = shift;
  56                 my $c = $tradition->collation;
  57         # Clear out all existing lexemes from the readings. Save the path as long
  58         # as we went to the trouble of generating it.
  59         my %witness_paths;
  60         foreach my $wit ( $tradition->witnesses ) {
  61                 my @sigla = ( $wit->sigil );
  62                 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
  63                 foreach my $sig ( @sigla ) {
  64                         my @path = grep { !$_->is_meta }
  65                                 $c->reading_sequence( $c->start, $c->end, $sig );
  66                         map { $_->clear_lexemes } @path;
  67                         $witness_paths{$sig} = \@path;
  68                 }
  69         }
  70         return %witness_paths;
  71 }
  72
  73 =head2 reading_lookup( $rdg[, $rdg, ...] )
  74
  75 Looks up one or more readings using the Flemm package, and returns the
  76 possible results.  This uses the same logic as L<lemmatize> above for the
  77 entire tradition, but can also be used to (re-)analyze individual readings.
  78
  79 =cut
  80
  81 sub reading_lookup_treetagger {
  82         my %opts = @_;
  83         $opts{'replace'} = 1;
  84         return _lemmatize_treetagger_sequence( %opts );
  85 }
  86
  87 sub _lemmatize_treetagger_sequence {
  88         my %opts = @_;
  89         my @path = @{$opts{'path'}};
  90         my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
  91         if( $tagresult ) {
  92                 # Map the tagged words onto the original readings, splitting
  93                 # them up into lexemes where necessary.
  94                 # NOTE we can have multiple lexemes in a reading, but not
  95                 # multiple readings to a lexeme.
  96                 my @tags = split( /\n/, $tagresult );
  97                 my @lexemes;
  98                 my $curr_rdg = shift @path;
  99                 my @curr_lexemes;
 100                 my $unused_rdg_part;
 101                 foreach my $tag ( @tags ) {
 102                         # Get the original word
 103                         my( $lexeme, @rest ) = split( /\t/, $tag );
 104                         # Lemmatize the whole
 105                         # TODO error trap this
 106                         my @forms = $opts{'callback'}( $tag );
 107
 108                         my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
 109                                 'string' => $lexeme, 'language' => $opts{'language'},
 110                                 'wordform_matchlist' => \@forms );
 111                         # Find the next non-meta reading
 112                         while( $curr_rdg && $curr_rdg->is_meta ) {
 113                                 $curr_rdg = shift @path;
 114                         }
 115                         unless( $curr_rdg ) {
 116                                 warn "Ran out of readings in sequence at $lexeme";
 117                                 last;
 118                         }
 119                         my $curr_rdg_text = $curr_rdg->has_normal_form
 120                                 ? $curr_rdg->normal_form : $curr_rdg->text;
 121                         if( $unused_rdg_part &&
 122                                 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 123                                 # Nth part of curr_rdg
 124                                 $unused_rdg_part = $2;
 125                                 push( @curr_lexemes, $lexobj );
 126                         } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 127                                 # Flag an error if there is already an unused reading part.
 128                                 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
 129                                         if $unused_rdg_part;
 130                                 $unused_rdg_part = $2; # will be empty if the whole reading matched
 131                                 push( @curr_lexemes, $lexobj );
 132                         } else {
 133                                 # We do not cope with the idea of a lexeme being
 134                                 # spread across multiple readings.
 135                                 warn "Word sequence changed unexpectedly in text";
 136                                 # See if we can find a matching reading
 137                                 my @lookahead;
 138                                 my $matched;
 139                                 while( my $nr = shift @path ) {
 140                                         my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
 141                                         if( $nrtext =~ /^\Q$lexeme\E/ ) {
 142                                                 $curr_rdg = $lookahead[-1] if @lookahead;
 143                                                 $matched = 1;
 144                                                 last;
 145                                         } else {
 146                                                 push( @lookahead, $nr );
 147                                         }
 148                                 }
 149                                 # No match? Restore the state we had
 150                                 unless( $matched ) {
 151                                         unshift( @path, @lookahead );
 152                                 }
 153                                 # Trigger a move
 154                                 $unused_rdg_part = '';
 155                         }
 156
 157                         unless( $unused_rdg_part ) {
 158                                 # Record the lexemes for the given reading.
 159                                 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
 160                                 #       join( ' ', map { $_->string } @curr_lexemes ),
 161                                 #       $curr_rdg->id, $curr_rdg->text );
 162                                 _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
 163                                 $curr_rdg = shift @path;
 164                                 @curr_lexemes = ();
 165                         }
 166                 }
 167         }
 168 }
 169
 170 sub _update_reading_lexemes {
 171         my( $replace, $reading, @lexemes ) = @_;
 172         if( $reading->has_lexemes && !$replace ) {
 173                 # We need to merge what is in @lexemes with what we have already.
 174                 my @oldlex = $reading->lexemes;
 175                 my $cmp1 = join( '||', map { $_->string } @oldlex );
 176                 my $cmp2 = join( '||', map { $_->string } @lexemes );
 177                 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
 178                         # The lexeme strings are the same, so merge the possible
 179                         # word forms from new to old.
 180                         foreach my $i ( 0 .. $#lexemes ) {
 181                                 my $ol = $oldlex[$i];
 182                                 my $nl = $lexemes[$i];
 183                                 my %ofw;
 184                                 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
 185                                 foreach my $form ( $nl->matching_forms ) {
 186                                         unless( $ofw{$form->to_string} ) {
 187                                                 print STDERR "Adding form " . $form->to_string .
 188                                                         " to lexeme " . $nl->string . " at $reading\n";
 189                                                 $ol->add_matching_form( $form );
 190                                                 $ol->is_disambiguated(0);
 191                                         }
 192                                 }
 193                         }
 194                 } else {
 195                         warn "Lexeme layout for $reading changed; replacing the lot";
 196                         $reading->clear_lexemes;
 197                         $reading->add_lexeme( @lexemes );
 198                 }
 199         } else {
 200                 $reading->clear_lexemes if $replace;
 201                 $reading->add_lexeme( @lexemes );
 202         }
 203 }
 204
 205 # Utility function so that we can cheat and use it when we need both the path
 206 # and its text.
 207 sub _text_from_path {
 208         my( $normalize, @path ) = @_;
 209         my $pathtext = '';
 210         my $last;
 211         foreach my $r ( @path ) {
 212                 unless ( $r->join_prior || !$last || $last->join_next ) {
 213                         $pathtext .= ' ';
 214                 }
 215                 $pathtext .= ( $normalize && $r->has_normal_form )
 216                         ? $r->normal_form : $r->text;
 217                 $last = $r;
 218         }
 219         return $pathtext;
 220 }
 221
 222 # Utility function that actually calls the tree tagger.
 223 sub _treetag_string {
 224         my( $text, $lang ) = @_;
 225         my $wittext = encode_utf8( $text );
 226         # Then see if we have TreeTagger
 227         try {
 228                 load( 'Lingua::TreeTagger' );
 229         } catch {
 230                 warn "Cannot run TreeTagger without Lingua::TreeTagger module";
 231                 return '';
 232         }
 233         # OK, we can run it then.
 234         # First upgrade to UTF8 for necessary languages.
 235         $lang = lc( $lang );
 236         my $use_utf8;
 237         my @utf8_supported = qw/ french /;
 238         if( grep { $_ eq $lang } @utf8_supported ) {
 239                 $lang .= '-utf8';
 240                 $use_utf8 = 1;
 241         }
 242         # Now instantiate and run the tagger.
 243         my $tagger = Lingua::TreeTagger->new(
 244                 'language' => $lang,
 245                 'options' => [ qw/ -token -lemma / ],
 246                 );
 247         if( $use_utf8 ) {
 248                 $text = encode_utf8( $text );
 249         }
 250         my $tagresult = $tagger->tag_text( \$text );
 251
 252         # TODO maybe send the tokens back rather than the interpreted string...
 253         if( $use_utf8 ) {
 254                 return decode_utf8( $tagresult->as_text() );
 255         }
 256         return $tagresult->as_text();
 257 }
 258
 259
 260 1;
 261
 262 =head2 TODO
 263
 264 =over
 265
 266 =item * Handle package dependencies more gracefully
 267
 268 =back
 269
 270 =head1 LICENSE
 271
 272 This package is free software and is provided "as is" without express
 273 or implied warranty.  You can redistribute it and/or modify it under
 274 the same terms as Perl itself.
 275
 276 =head1 AUTHOR
 277
 278 Tara L Andrews E<lt>aurum@cpan.orgE<gt>