lib/Text/Tradition/Language/English.pm

   1 package Text::Tradition::Language::English;
   2
   3 use strict;
   4 use warnings;
   5 use Encode qw/ encode_utf8 decode_utf8 /;
   6 use IPC::Run qw/ run /;
   7 use Lingua::TagSet::TreeTagger;
   8 use Module::Load;
   9 use Text::Tradition::Collation::Reading::Lexeme;
  10 use Text::Tradition::Collation::Reading::WordForm;
  11 use TryCatch;
  12
  13 my $MORPHDIR = '/Users/tla/Projects/morphology';
  14
  15 =head1 NAME
  16
  17 Text::Tradition::Language::English - language-specific module for English
  18
  19 =head1 DESCRIPTION
  20
  21 Implements morphology lookup for English words in context.  This module
  22 depends on the TreeTagger software
  23 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
  24 (for now) expected to be installed in $MORPHDIR/TreeTagger.
  25
  26 =head1 SUBROUTINES
  27
  28 =head2 lemmatize( $text )
  29
  30 Evaluates the string using the TreeTagger, and returns the results.
  31
  32 =begin testing
  33
  34 binmode STDOUT, ':utf8';
  35 use Text::Tradition;
  36 use_ok( 'Text::Tradition::Language::English' );
  37
  38 =end testing
  39
  40 =cut
  41
  42 sub lemmatize {
  43         my $tradition = shift;
  44
  45         # Given a tradition, lemmatize it witness by witness and see what we get.
  46         my $workdir = File::Temp->newdir();
  47         my $c = $tradition->collation;
  48         # First, clear out all existing lexemes from the readings. Save the
  49         # path as long as we went to the trouble of generating it.
  50         my %witness_paths;
  51         foreach my $wit ( $tradition->witnesses ) {
  52                 my @sigla = ( $wit->sigil );
  53                 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
  54                 foreach my $sig ( @sigla ) {
  55                         my @path = grep { !$_->is_meta }
  56                                 $c->reading_sequence( $c->start, $c->end, $sig );
  57                         map { $_->clear_lexemes } @path;
  58                         $witness_paths{$sig} = \@path;
  59                 }
  60         }
  61
  62         foreach my $sig ( keys %witness_paths ) {
  63                 # Get the text as a sequence of readings and as a string
  64                 print STDERR "Morphologizing witness $sig\n";
  65                 _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
  66         }
  67 }
  68
  69 sub _update_reading_lexemes {
  70         my( $replace, $reading, @lexemes ) = @_;
  71         if( $reading->has_lexemes && !$replace ) {
  72                 # We need to merge what is in @lexemes with what we have already.
  73                 my @oldlex = $reading->lexemes;
  74                 my $cmp1 = join( '||', map { $_->string } @oldlex );
  75                 my $cmp2 = join( '||', map { $_->string } @lexemes );
  76                 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
  77                         # The lexeme strings are the same, so merge the possible
  78                         # word forms from new to old.
  79                         foreach my $i ( 0 .. $#lexemes ) {
  80                                 my $ol = $oldlex[$i];
  81                                 my $nl = $lexemes[$i];
  82                                 my %ofw;
  83                                 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
  84                                 foreach my $form ( $nl->matching_forms ) {
  85                                         unless( $ofw{$form->to_string} ) {
  86                                                 print STDERR "Adding form " . $form->to_string .
  87                                                         " to lexeme " . $nl->string . " at $reading\n";
  88                                                 $ol->add_matching_form( $form );
  89                                                 $ol->is_disambiguated(0);
  90                                         }
  91                                 }
  92                         }
  93                 } else {
  94                         warn "Lexeme layout for $reading changed; replacing the lot";
  95                         $reading->clear_lexemes;
  96                         $reading->add_lexeme( @lexemes );
  97                 }
  98         } else {
  99                 $reading->clear_lexemes if $replace;
 100                 $reading->add_lexeme( @lexemes );
 101         }
 102 }
 103
 104 =head2 reading_lookup( $rdg[, $rdg, ...] )
 105
 106 Looks up one or more readings using the Flemm package, and returns the
 107 possible results.  This uses the same logic as L<lemmatize> above for the
 108 entire tradition, but can also be used to (re-)analyze individual readings.
 109
 110 =cut
 111
 112 sub reading_lookup {
 113         return _lemmatize_sequence( 1, @_ );
 114 }
 115
 116 sub _lemmatize_sequence {
 117         my( $replace, @path ) = @_;
 118         my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
 119         if( $tagresult ) {
 120                 # Map the tagged words onto the original readings, splitting
 121                 # them up into lexemes where necessary.
 122                 # NOTE we can have multiple lexemes in a reading, but not
 123                 # multiple readings to a lexeme.
 124                 my @tags = split( /\n/, $tagresult );
 125                 my @lexemes;
 126                 my $curr_rdg = shift @path;
 127                 my @curr_lexemes;
 128                 my $unused_rdg_part;
 129                 foreach my $tag ( @tags ) {
 130                         # Get the original word
 131                         my( $lexeme, @rest ) = split( /\t/, $tag );
 132                         # Lemmatize the whole
 133                         my @forms = _parse_wordform( $tag );
 134                         my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
 135                                 'string' => $lexeme, 'language' => 'English',
 136                                 'wordform_matchlist' => \@forms );
 137                         # Find the next non-meta reading
 138                         while( $curr_rdg && $curr_rdg->is_meta ) {
 139                                 $curr_rdg = shift @path;
 140                         }
 141                         unless( $curr_rdg ) {
 142                                 warn "Ran out of readings in sequence at $lexeme";
 143                                 last;
 144                         }
 145                         my $curr_rdg_text = $curr_rdg->has_normal_form
 146                                 ? $curr_rdg->normal_form : $curr_rdg->text;
 147                         if( $unused_rdg_part &&
 148                                 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 149                                 # Nth part of curr_rdg
 150                                 $unused_rdg_part = $2;
 151                                 push( @curr_lexemes, $lexobj );
 152                         } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 153                                 # Flag an error if there is already an unused reading part.
 154                                 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
 155                                         if $unused_rdg_part;
 156                                 $unused_rdg_part = $2; # will be empty if the whole reading matched
 157                                 push( @curr_lexemes, $lexobj );
 158                         } else {
 159                                 # We do not cope with the idea of a lexeme being
 160                                 # spread across multiple readings.
 161                                 warn "Word sequence changed unexpectedly in text";
 162                                 # See if we can find a matching reading
 163                                 my @lookahead;
 164                                 my $matched;
 165                                 while( my $nr = shift @path ) {
 166                                         my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
 167                                         if( $nrtext =~ /^\Q$lexeme\E/ ) {
 168                                                 $curr_rdg = $lookahead[-1] if @lookahead;
 169                                                 $matched = 1;
 170                                                 last;
 171                                         } else {
 172                                                 push( @lookahead, $nr );
 173                                         }
 174                                 }
 175                                 # No match? Restore the state we had
 176                                 unless( $matched ) {
 177                                         unshift( @path, @lookahead );
 178                                 }
 179                                 # Trigger a move
 180                                 $unused_rdg_part = '';
 181                         }
 182
 183                         unless( $unused_rdg_part ) {
 184                                 # Record the lexemes for the given reading.
 185                                 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
 186                                 #       join( ' ', map { $_->string } @curr_lexemes ),
 187                                 #       $curr_rdg->id, $curr_rdg->text );
 188                                 _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
 189                                 $curr_rdg = shift @path;
 190                                 @curr_lexemes = ();
 191                         }
 192                 }
 193         }
 194 }
 195
 196 # Utility function so that we can cheat and use it when we need both the path
 197 # and its text.
 198 sub _text_from_path {
 199         my( $normalize, @path ) = @_;
 200         my $pathtext = '';
 201         my $last;
 202         foreach my $r ( @path ) {
 203                 unless ( $r->join_prior || !$last || $last->join_next ) {
 204                         $pathtext .= ' ';
 205                 }
 206                 $pathtext .= ( $normalize && $r->has_normal_form )
 207                         ? $r->normal_form : $r->text;
 208                 $last = $r;
 209         }
 210         return $pathtext;
 211 }
 212
 213 # Utility function that actually calls the tree tagger.
 214 sub _treetag_string {
 215         my( $text ) = @_;
 216         my $wittext = encode_utf8( $text );
 217         # Then see if we have TreeTagger
 218         my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english";
 219         unless( -f $taggercmd ) {
 220                 warn "Cannot do English word lemmatization without TreeTagger";
 221                 return;
 222         }
 223         # OK, we can run it then.
 224         my @cmd = ( $taggercmd );
 225         my( $tagresult, $err ); # Capture the output and error
 226         run( \@cmd, \$wittext, \$tagresult, \$err );
 227         # TODO check for error
 228         return decode_utf8( $tagresult );
 229 }
 230
 231 # Utility function to turn a TreeTagger result into a WordForm
 232 sub _parse_wordform {
 233         my $tagresult = shift;
 234         my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
 235         my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag );
 236         if( $morphobj ) {
 237                 return Text::Tradition::Collation::Reading::WordForm->new(
 238                         'language' => 'English',
 239                         'lemma' => $lemma,
 240                         'morphology' => $morphobj,
 241                         );
 242         } else {
 243                 warn "No morphology found for word: $_";
 244         }
 245 }
 246
 247 1;
 248
 249 =head2 TODO
 250
 251 =over
 252
 253 =item * Handle package dependencies more gracefully
 254
 255 =item * Refactor English/French use of TreeTagger into its own util package
 256
 257 =back
 258
 259 =head1 LICENSE
 260
 261 This package is free software and is provided "as is" without express
 262 or implied warranty.  You can redistribute it and/or modify it under
 263 the same terms as Perl itself.
 264
 265 =head1 AUTHOR
 266
 267 Tara L Andrews E<lt>aurum@cpan.orgE<gt>