lib/Text/Tradition/Language/French.pm

   1 package Text::Tradition::Language::French;
   2
   3 use Encode qw/ encode_utf8 decode_utf8 /;
   4 use IPC::Run qw/ run /;
   5 use Lingua::TagSet::Multext;
   6 use Lingua::TagSet::TreeTagger;
   7 use Module::Load;
   8 use Text::Tradition::Collation::Reading::Lexeme;
   9 use Text::Tradition::Collation::Reading::WordForm;
  10 use TryCatch;
  11
  12 my $MORPHDIR = '/Users/tla/Projects/morphology';
  13
  14 =head1 NAME
  15
  16 Text::Tradition::Language::French - language-specific module for French
  17
  18 =head1 DESCRIPTION
  19
  20 Implements morphology lookup for French words in context.  This module
  21 depends on the Flemm module for French lemmatization
  22 (L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
  23 the TreeTagger software
  24 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
  25 (for now) expected to be installed in $MORPHDIR/TreeTagger.
  26
  27 =head1 SUBROUTINES
  28
  29 =head2 lemmatize( $text )
  30
  31 Evaluates the string using the Flemm package, and returns the results.
  32
  33 =begin testing
  34
  35 binmode STDOUT, ':utf8';
  36 use Text::Tradition;
  37 use_ok( 'Text::Tradition::Language::French' );
  38
  39 eval "use Flemm";
  40 my $err = $@;
  41
  42 SKIP: {
  43         skip "Package Flemm not found" if $err;
  44         my $tf = Text::Tradition->new(
  45                 'input' => 'Self',
  46                 'file' => 't/data/besoin.xml',
  47                 'language' => 'French' );
  48
  49         is( $tf->language, 'French', "Set language okay" );
  50         $tf->lemmatize();
  51         # Test the lemmatization. How many readings now have morphological info?
  52         # Do the lexemes match the reading?
  53         my $ambig = 0;
  54         foreach my $r ( $tf->collation->readings ) {
  55                 next if $r->is_meta;
  56                 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
  57                 my @lex = $r->lexemes;
  58                 my $lexstr = join( '', map { $_->string } @lex );
  59                 my $textstr = $r->text;
  60                 $textstr =~ s/\s+//g;
  61                 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
  62                 foreach my $l ( @lex ) {
  63                         next if $l->is_disambiguated;
  64         #               printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
  65         #                       join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
  66                         $ambig++;
  67                 }
  68         }
  69         is( $ambig, 102, "Found 102 ambiguous forms as expected" );
  70
  71         # Try setting the normal form of a reading and re-analyzing
  72         my $mr = $tf->collation->reading('99,2');
  73         is( $mr->text, 'minspire', "Picked correct test reading" );
  74         is( $mr->language, 'French', "Reading has correct language setting" );
  75         $mr->normal_form( "m'inspire" );
  76         $mr->lemmatize;
  77         is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
  78 }
  79
  80 =end testing
  81
  82 =cut
  83
  84 sub lemmatize {
  85         my $tradition = shift;
  86
  87         # Given a tradition, lemmatize it witness by witness and see what we get.
  88         my $workdir = File::Temp->newdir();
  89         my $c = $tradition->collation;
  90         # First, clear out all existing lexemes from the readings. Save the
  91         # path as long as we went to the trouble of generating it.
  92         my %witness_paths;
  93         foreach my $wit ( $tradition->witnesses ) {
  94                 my @sigla = ( $wit->sigil );
  95                 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
  96                 foreach my $sig ( @sigla ) {
  97                         my @path = grep { !$_->is_meta }
  98                                 $c->reading_sequence( $c->start, $c->end, $sig );
  99                         map { $_->clear_lexemes } @path;
 100                         $witness_paths{$sig} = \@path;
 101                 }
 102         }
 103
 104         foreach my $sig ( keys %witness_paths ) {
 105                 # Get the text as a sequence of readings and as a string
 106                 print STDERR "Morphologizing witness $sig\n";
 107                 _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
 108         }
 109 }
 110
 111 sub _update_reading_lexemes {
 112         my( $replace, $reading, @lexemes ) = @_;
 113         if( $reading->has_lexemes && !$replace ) {
 114                 # We need to merge what is in @lexemes with what we have already.
 115                 my @oldlex = $reading->lexemes;
 116                 my $cmp1 = join( '||', map { $_->string } @oldlex );
 117                 my $cmp2 = join( '||', map { $_->string } @lexemes );
 118                 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
 119                         # The lexeme strings are the same, so merge the possible
 120                         # word forms from new to old.
 121                         foreach my $i ( 0 .. $#lexemes ) {
 122                                 my $ol = $oldlex[$i];
 123                                 my $nl = $lexemes[$i];
 124                                 my %ofw;
 125                                 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
 126                                 foreach my $form ( $nl->matching_forms ) {
 127                                         unless( $ofw{$form->to_string} ) {
 128                                                 print STDERR "Adding form " . $form->to_string .
 129                                                         " to lexeme " . $nl->string . " at $reading\n";
 130                                                 $ol->add_matching_form( $form );
 131                                                 $ol->is_disambiguated(0);
 132                                         }
 133                                 }
 134                         }
 135                 } else {
 136                         warn "Lexeme layout for $reading changed; replacing the lot";
 137                         $reading->clear_lexemes;
 138                         $reading->add_lexeme( @lexemes );
 139                 }
 140         } else {
 141                 $reading->clear_lexemes if $replace;
 142                 $reading->add_lexeme( @lexemes );
 143         }
 144 }
 145
 146 =head2 reading_lookup( $rdg[, $rdg, ...] )
 147
 148 Looks up one or more readings using the Flemm package, and returns the
 149 possible results.  This uses the same logic as L<lemmatize> above for the
 150 entire tradition, but can also be used to (re-)analyze individual readings.
 151
 152 =cut
 153
 154 sub reading_lookup {
 155         return _lemmatize_sequence( 1, @_ );
 156 }
 157
 158 sub _lemmatize_sequence {
 159         my( $replace, @path ) = @_;
 160         $DB::single = 1 if $replace;
 161         my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
 162         if( $tagresult ) {
 163                 # Map the tagged words onto the original readings, splitting
 164                 # them up into lexemes where necessary.
 165                 # NOTE we can have multiple lexemes in a reading, but not
 166                 # multiple readings to a lexeme.
 167                 my @tags = split( /\n/, $tagresult );
 168                 my @lexemes;
 169                 my $curr_rdg = shift @path;
 170                 my @curr_lexemes;
 171                 my $unused_rdg_part;
 172                 foreach my $tag ( @tags ) {
 173                         # Get the original word
 174                         my( $lexeme, @rest ) = split( /\t/, $tag );
 175                         # Lemmatize the whole
 176                         my @forms = _parse_wordform( _flemm_lookup( $tag ) );
 177                         my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
 178                                 'string' => $lexeme, 'language' => 'French',
 179                                 'wordform_matchlist' => \@forms );
 180                         # Find the next non-meta reading
 181                         while( $curr_rdg && $curr_rdg->is_meta ) {
 182                                 $curr_rdg = shift @path;
 183                         }
 184                         unless( $curr_rdg ) {
 185                                 warn "Ran out of readings in sequence for " . $wit->sigil
 186                                         . " at $lexeme";
 187                                 last;
 188                         }
 189                         my $curr_rdg_text = $curr_rdg->has_normal_form
 190                                 ? $curr_rdg->normal_form : $curr_rdg->text;
 191                         if( $unused_rdg_part &&
 192                                 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 193                                 # Nth part of curr_rdg
 194                                 $unused_rdg_part = $2;
 195                                 push( @curr_lexemes, $lexobj );
 196                         } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 197                                 # Flag an error if there is already an unused reading part.
 198                                 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
 199                                         if $unused_rdg_part;
 200                                 $unused_rdg_part = $2; # will be empty if the whole reading matched
 201                                 push( @curr_lexemes, $lexobj );
 202                         } else {
 203                                 # We do not cope with the idea of a lexeme being
 204                                 # spread across multiple readings.
 205                                 warn "Word sequence changed unexpectedly in text";
 206                                 # See if we can find a matching reading
 207                                 my @lookahead;
 208                                 my $matched;
 209                                 while( my $nr = shift @path ) {
 210                                         my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
 211                                         if( $nrtext =~ /^\Q$lexeme\E/ ) {
 212                                                 $curr_rdg = $lookahead[-1] if @lookahead;
 213                                                 $matched = 1;
 214                                                 last;
 215                                         } else {
 216                                                 push( @lookahead, $nr );
 217                                         }
 218                                 }
 219                                 # No match? Restore the state we had
 220                                 unless( $matched ) {
 221                                         unshift( @path, @lookahead );
 222                                 }
 223                                 # Trigger a move
 224                                 $unused_rdg_part = '';
 225                         }
 226
 227                         unless( $unused_rdg_part ) {
 228                                 # Record the lexemes for the given reading.
 229                                 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
 230                                 #       join( ' ', map { $_->string } @curr_lexemes ),
 231                                 #       $curr_rdg->id, $curr_rdg->text );
 232                                 _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
 233                                 $curr_rdg = shift @path;
 234                                 @curr_lexemes = ();
 235                         }
 236                 }
 237         }
 238 }
 239
 240 # Utility function so that we can cheat and use it when we need both the path
 241 # and its text.
 242 sub _text_from_path {
 243         my( $normalize, @path ) = @_;
 244         my $pathtext = '';
 245         my $last;
 246         foreach my $r ( @path ) {
 247                 unless ( $r->join_prior || !$last || $last->join_next ) {
 248                         $pathtext .= ' ';
 249                 }
 250                 $pathtext .= ( $normalize && $r->has_normal_form )
 251                         ? $r->normal_form : $r->text;
 252                 $last = $r;
 253         }
 254         return $pathtext;
 255 }
 256
 257 # Utility function that actually calls the tree tagger.
 258 sub _treetag_string {
 259         my( $text ) = @_;
 260         my $wittext = encode_utf8( $text );
 261         # Then see if we have TreeTagger
 262         my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
 263         unless( -f $taggercmd ) {
 264                 warn "Cannot do French word lemmatization without TreeTagger";
 265                 return;
 266         }
 267         # OK, we can run it then.
 268         my @cmd = ( $taggercmd );
 269         my( $tagresult, $err ); # Capture the output and error
 270         run( \@cmd, \$wittext, \$tagresult, \$err );
 271         # TODO check for error
 272         return decode_utf8( $tagresult );
 273 }
 274
 275 # Closure and utility function for the package lemmatizer
 276 {
 277         my $lemmatizer;
 278
 279         sub _flemm_lookup {
 280                 # First try to load Flemm
 281                 unless( $lemmatizer ) {
 282                         try {
 283                                 load 'Flemm';
 284                                 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
 285                         } catch {
 286                                 warn "Cannot do French word lemmatization without Flemm: @_";
 287                                 return;
 288                         }
 289                 }
 290                 return $lemmatizer->lemmatize( @_ )
 291         }
 292
 293 }
 294
 295 # Utility function to turn a Flemm result into a WordForm
 296 sub _parse_wordform {
 297         my $flemmobj = shift;
 298         # For now just parse the string, until we make sense of the documentation.
 299         my @results = split( / \|\| /, $flemmobj->getResult );
 300         my @forms;
 301         foreach ( @results ) {
 302                 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
 303                 my( $pos, $morph ) = split( /:/, $tag );
 304                 my $morphobj;
 305                 if( $morph ) {
 306                         $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
 307                 } else {
 308                         # Use the TreeTagger info if there is no Flemm morphology.
 309                         $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
 310                 }
 311                 if( $morphobj ) {
 312                         my $wf = Text::Tradition::Collation::Reading::WordForm->new(
 313                                 'language' => 'French',
 314                                 'lemma' => $lemma,
 315                                 'morphology' => $morphobj,
 316                                 );
 317                         push( @forms, $wf );
 318                 } else {
 319                         warn "No morphology found for word: $_";
 320                 }
 321         }
 322         return @forms;
 323 }
 324
 325 1;
 326
 327 =head2 TODO
 328
 329 =over
 330
 331 =item * Handle package dependencies more gracefully
 332
 333 =back
 334
 335 =head1 LICENSE
 336
 337 This package is free software and is provided "as is" without express
 338 or implied warranty.  You can redistribute it and/or modify it under
 339 the same terms as Perl itself.
 340
 341 =head1 AUTHOR
 342
 343 Tara L Andrews E<lt>aurum@cpan.orgE<gt>