lib/Text/Tradition/Language/French.pm

   1 package Text::Tradition::Language::French;
   2
   3 use Encode qw/ encode_utf8 decode_utf8 /;
   4 use IPC::Run qw/ run binary /;
   5 use Module::Load;
   6 use Text::Tradition::Collation::Reading::Lexeme;
   7 use Text::Tradition::Collation::Reading::WordForm;
   8 use TryCatch;
   9
  10 my $MORPHDIR = '/Users/tla/Projects/morphology';
  11
  12 =head1 NAME
  13
  14 Text::Tradition::Language::French - language-specific modules for French
  15
  16 =head1 DESCRIPTION
  17
  18 Implements morphology lookup for French words in context.
  19
  20 =head1 SUBROUTINES
  21
  22 =head2 lemmatize( $text )
  23
  24 Evaluates the string using the Flemm package, and returns the results.
  25
  26 =cut
  27
  28 sub lemmatize {
  29         my $tradition = shift;
  30
  31         # Given a tradition, lemmatize it witness by witness and see what we get.
  32         my $workdir = File::Temp->newdir();
  33         my $c = $tradition->collation;
  34         # First, clear out all existing lexemes from the readings. Save the
  35         # path as long as we went to the trouble of generating it.
  36         my %witness_paths;
  37         foreach my $wit ( $tradition->witnesses ) {
  38                 my @sigla = ( $wit->sigil );
  39                 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
  40                 foreach my $sig ( @sigla ) {
  41                         my @path = grep { !$_->is_meta }
  42                                 $c->reading_sequence( $c->start, $c->end, $sig );
  43                         map { $_->clear_lexemes } @path;
  44                         $witness_paths{$sig} = \@path;
  45                 }
  46         }
  47
  48         foreach my $sig ( keys %witness_paths ) {
  49                 # Get the text as a sequence of readings and as a string
  50                 print STDERR "Morphologizing witness $sig\n";
  51                 my @path = @{$witness_paths{$sig}};
  52                 my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
  53                 if( $tagresult ) {
  54                         # Map the tagged words onto the original readings, splitting
  55                         # them up into lexemes where necessary.
  56                         # NOTE we can have multiple lexemes in a reading, but not
  57                         # multiple readings to a lexeme.
  58                         my @tags = split( /\n/, $tagresult );
  59                         my @lexemes;
  60                         my $curr_rdg = shift @path;
  61                         my @curr_lexemes;
  62                         my $unused_rdg_part;
  63                         foreach my $tag ( @tags ) {
  64                                 # Get the original word
  65                                 my( $lexeme, @rest ) = split( /\t/, $tag );
  66                                 # Lemmatize the whole
  67                                 my @forms = _parse_wordform( _flemm_lookup( $tag ) );
  68                                 my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
  69                                         'string' => $lexeme, 'language' => 'French',
  70                                         'wordform_matchlist' => \@forms );
  71                                 # Find the next non-meta reading
  72                                 while( $curr_rdg->is_meta ) {
  73                                         $curr_rdg = shift @path;
  74                                 }
  75                                 unless( $curr_rdg ) {
  76                                         warn "Ran out of readings in sequence for " . $wit->sigil
  77                                                 . " at $lexeme";
  78                                         last;
  79                                 }
  80                                 if( $unused_rdg_part &&
  81                                         $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
  82                                         # Nth part of curr_rdg
  83                                         $unused_rdg_part = $2;
  84                                         push( @curr_lexemes, $lexobj );
  85                                 } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
  86                                         # Flag an error if there is already an unused reading part.
  87                                         warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
  88                                                 if $unused_rdg_part;
  89                                         $unused_rdg_part = $2; # will be empty if the whole reading matched
  90                                         push( @curr_lexemes, $lexobj );
  91                                 } else {
  92                                         # We do not cope with the idea of a lexeme being
  93                                         # spread across multiple readings.
  94                                         warn "Word sequence changed unexpectedly in text";
  95                                         # See if we can find a matching reading
  96                                         my @lookahead;
  97                                         my $matched;
  98                                         while( my $nr = shift @path ) {
  99                                                 if( $nr->text =~ /^\Q$lexeme\E/ ) {
 100                                                         $curr_rdg = $lookahead[-1] if @lookahead;
 101                                                         $matched = 1;
 102                                                         last;
 103                                                 } else {
 104                                                         push( @lookahead, $nr );
 105                                                 }
 106                                         }
 107                                         # No match? Restore the state we had
 108                                         unless( $matched ) {
 109                                                 unshift( @path, @lookahead );
 110                                         }
 111                                         # Trigger a move
 112                                         $unused_rdg_part = '';
 113                                 }
 114
 115                                 unless( $unused_rdg_part ) {
 116                                         # Record the lexemes for the given reading.
 117                                         #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
 118                                         #       join( ' ', map { $_->string } @curr_lexemes ),
 119                                         #       $curr_rdg->id, $curr_rdg->text );
 120                                         _update_reading_lexemes( $curr_rdg, @curr_lexemes );
 121                                         $curr_rdg = shift @path;
 122                                         @curr_lexemes = ();
 123                                 }
 124                         }
 125                 }
 126         }
 127 }
 128
 129 sub _update_reading_lexemes {
 130         my( $reading, @lexemes ) = @_;
 131         if( $reading->has_lexemes ) {
 132                 # We need to merge what is in @lexemes with what we have already.
 133                 my @oldlex = $reading->lexemes;
 134                 my $cmp1 = join( '||', map { $_->string } @oldlex );
 135                 my $cmp2 = join( '||', map { $_->string } @lexemes );
 136                 if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) {
 137                         # The lexeme strings are the same, so merge the possible
 138                         # word forms from new to old.
 139                         foreach my $i ( 0 .. $#lexemes ) {
 140                                 my $ol = $oldlex[$i];
 141                                 my $nl = $lexemes[$i];
 142                                 my %ofw;
 143                                 map { $ofw{$_->_stringify} = 1 } $ol->matching_forms;
 144                                 foreach my $form ( $nl->matching_forms ) {
 145                                         unless( $ofw{$form->_stringify} ) {
 146                                                 print STDERR "Adding form " . $form->_stringify .
 147                                                         " to lexeme " . $nl->string . " at $reading\n";
 148                                                 $ol->add_matching_form( $form );
 149                                                 $ol->is_disambiguated(0);
 150                                         }
 151                                 }
 152                         }
 153                 } else {
 154                         $DB::single = 1;
 155                         warn "Lexeme layout for $reading changed; replacing the lot";
 156                         $reading->clear_lexemes;
 157                         $reading->add_lexeme( @lexemes );
 158                 }
 159         } else {
 160                 $reading->add_lexeme( @lexemes );
 161         }
 162 }
 163
 164 =head2 word_lookup( $word )
 165
 166 Looks up a word using the Flemm package, and returns the possible results.
 167 It is better to use L<lemmatize> for context sensitivity.
 168
 169 =cut
 170
 171 sub word_lookup {
 172         my $word = shift;
 173         my $tagresult = _treetag_string( $word );
 174         my $lemmatizer;
 175         try {
 176                 load 'Flemm';
 177                 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
 178         } catch {
 179                 warn "Cannot do French word lemmatization without Flemm: @_";
 180                 return;
 181         }
 182         return _parse_wordform( _flemm_lookup( $tagresult ) );
 183 }
 184
 185 # Utility function that actually calls the tree tagger.
 186 sub _treetag_string {
 187         my( $text ) = @_;
 188         my $wittext = encode_utf8( $text );
 189         # Then see if we have TreeTagger
 190         my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
 191         unless( -f $taggercmd ) {
 192                 warn "Cannot do French word lemmatization without TreeTagger";
 193                 return;
 194         }
 195         # OK, we can run it then.
 196         my @cmd = ( $taggercmd );
 197         my( $tagresult, $err ); # Capture the output and error
 198         run( \@cmd, \$wittext, \$tagresult, \$err );
 199         # TODO check for error
 200         return decode_utf8( $tagresult );
 201 }
 202
 203 # Closure and utility function for the package lemmatizer
 204 {
 205         my $lemmatizer;
 206
 207         sub _flemm_lookup {
 208                 # First try to load Flemm
 209                 unless( $lemmatizer ) {
 210                         try {
 211                                 load 'Flemm';
 212                                 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
 213                         } catch {
 214                                 warn "Cannot do French word lemmatization without Flemm: @_";
 215                                 return;
 216                         }
 217                 }
 218                 return $lemmatizer->lemmatize( @_ )
 219         }
 220
 221 }
 222
 223 # Utility function to turn a Flemm result into a WordForm
 224 sub _parse_wordform {
 225         my $flemmobj = shift;
 226         # For now just parse the string, until we make sense of the documentation.
 227         my @results = split( / \|\| /, $flemmobj->getResult );
 228         my @forms;
 229         foreach ( @results ) {
 230                 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
 231                 my( $pos, $morph ) = split( /:/, $tag );
 232                 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
 233                         'language' => 'French',
 234                         'lemma' => $lemma,
 235                         'morphology' => [ split( //, $morph ) ],
 236                         );
 237                 push( @forms, $wf );
 238         }
 239         return @forms;
 240 }
 241
 242 1;
 243
 244 =head1 LICENSE
 245
 246 This package is free software and is provided "as is" without express
 247 or implied warranty.  You can redistribute it and/or modify it under
 248 the same terms as Perl itself.
 249
 250 =head1 AUTHOR
 251
 252 Tara L Andrews E<lt>aurum@cpan.orgE<gt>