add English morph analysis; test for lurking debug statements
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm
index 8028c98..6847354 100644 (file)
@@ -1,7 +1,11 @@
 package Text::Tradition::Language::French;
 
+use strict;
+use warnings;
 use Encode qw/ encode_utf8 decode_utf8 /;
-use IPC::Run qw/ run binary /;
+use IPC::Run qw/ run /;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger;
 use Module::Load;
 use Text::Tradition::Collation::Reading::Lexeme;
 use Text::Tradition::Collation::Reading::WordForm;
@@ -11,11 +15,16 @@ my $MORPHDIR = '/Users/tla/Projects/morphology';
 
 =head1 NAME
 
-Text::Tradition::Language::French - language-specific modules for French
+Text::Tradition::Language::French - language-specific module for French
 
 =head1 DESCRIPTION
 
-Implements morphology lookup for French words in context.
+Implements morphology lookup for French words in context.  This module
+depends on the Flemm module for French lemmatization
+(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
+the TreeTagger software
+(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
+(for now) expected to be installed in $MORPHDIR/TreeTagger.
 
 =head1 SUBROUTINES
 
@@ -23,6 +32,55 @@ Implements morphology lookup for French words in context.
 
 Evaluates the string using the Flemm package, and returns the results.
 
+=begin testing
+
+binmode STDOUT, ':utf8';
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::French' );
+
+eval "use Flemm";
+my $err = $@;
+
+SKIP: {
+       skip "Package Flemm not found" if $err;
+       my $tf = Text::Tradition->new(
+               'input' => 'Self',
+               'file' => 't/data/besoin.xml',
+               'language' => 'French' );
+               
+       is( $tf->language, 'French', "Set language okay" );
+       $tf->lemmatize();
+       # Test the lemmatization. How many readings now have morphological info?
+       # Do the lexemes match the reading?
+       my $ambig = 0;
+       foreach my $r ( $tf->collation->readings ) {
+               next if $r->is_meta;
+               ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+               my @lex = $r->lexemes;
+               my $lexstr = join( '', map { $_->string } @lex );
+               my $textstr = $r->text;
+               $textstr =~ s/\s+//g;
+               is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+               foreach my $l ( @lex ) {
+                       next if $l->is_disambiguated;
+       #               printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+       #                       join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+                       $ambig++;
+               }
+       }
+       is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+       
+       # Try setting the normal form of a reading and re-analyzing
+       my $mr = $tf->collation->reading('99,2');
+       is( $mr->text, 'minspire', "Picked correct test reading" );
+       is( $mr->language, 'French', "Reading has correct language setting" );
+       $mr->normal_form( "m'inspire" );
+       $mr->lemmatize;
+       is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+}
+
+=end testing
+
 =cut
 
 sub lemmatize {
@@ -48,102 +106,28 @@ sub lemmatize {
        foreach my $sig ( keys %witness_paths ) {
                # Get the text as a sequence of readings and as a string
                print STDERR "Morphologizing witness $sig\n";
-               my @path = @{$witness_paths{$sig}};
-               my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
-               if( $tagresult ) {
-                       # Map the tagged words onto the original readings, splitting 
-                       # them up into lexemes where necessary.
-                       # NOTE we can have multiple lexemes in a reading, but not
-                       # multiple readings to a lexeme.
-                       my @tags = split( /\n/, $tagresult );
-                       my @lexemes;
-                       my $curr_rdg = shift @path;
-                       my @curr_lexemes;
-                       my $unused_rdg_part;
-                       foreach my $tag ( @tags ) {
-                               # Get the original word
-                               my( $lexeme, @rest ) = split( /\t/, $tag );
-                               # Lemmatize the whole
-                               my @forms = _parse_wordform( _flemm_lookup( $tag ) );
-                               my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
-                                       'string' => $lexeme, 'language' => 'French',
-                                       'wordform_matchlist' => \@forms );
-                               # Find the next non-meta reading
-                               while( $curr_rdg->is_meta ) {
-                                       $curr_rdg = shift @path;
-                               }
-                               unless( $curr_rdg ) {
-                                       warn "Ran out of readings in sequence for " . $wit->sigil
-                                               . " at $lexeme";
-                                       last;
-                               }
-                               if( $unused_rdg_part &&
-                                       $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-                                       # Nth part of curr_rdg
-                                       $unused_rdg_part = $2;
-                                       push( @curr_lexemes, $lexobj );
-                               } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-                                       # Flag an error if there is already an unused reading part.
-                                       warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
-                                               if $unused_rdg_part;
-                                       $unused_rdg_part = $2; # will be empty if the whole reading matched
-                                       push( @curr_lexemes, $lexobj );
-                               } else {
-                                       # We do not cope with the idea of a lexeme being 
-                                       # spread across multiple readings.
-                                       warn "Word sequence changed unexpectedly in text";
-                                       # See if we can find a matching reading
-                                       my @lookahead;
-                                       my $matched;
-                                       while( my $nr = shift @path ) {
-                                               if( $nr->text =~ /^\Q$lexeme\E/ ) {
-                                                       $curr_rdg = $lookahead[-1] if @lookahead;
-                                                       $matched = 1;
-                                                       last;
-                                               } else {
-                                                       push( @lookahead, $nr );
-                                               }
-                                       }
-                                       # No match? Restore the state we had
-                                       unless( $matched ) {
-                                               unshift( @path, @lookahead );
-                                       }
-                                       # Trigger a move
-                                       $unused_rdg_part = '';
-                               }
-                               
-                               unless( $unused_rdg_part ) {
-                                       # Record the lexemes for the given reading.
-                                       #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
-                                       #       join( ' ', map { $_->string } @curr_lexemes ),
-                                       #       $curr_rdg->id, $curr_rdg->text );
-                                       _update_reading_lexemes( $curr_rdg, @curr_lexemes );
-                                       $curr_rdg = shift @path;
-                                       @curr_lexemes = ();
-                               }
-                       }
-               }
+               _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
        }
 }
 
 sub _update_reading_lexemes {
-       my( $reading, @lexemes ) = @_;
-       if( $reading->has_lexemes ) {
+       my( $replace, $reading, @lexemes ) = @_;
+       if( $reading->has_lexemes && !$replace ) {
                # We need to merge what is in @lexemes with what we have already.
                my @oldlex = $reading->lexemes;
                my $cmp1 = join( '||', map { $_->string } @oldlex );
                my $cmp2 = join( '||', map { $_->string } @lexemes );
-               if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) {
+               if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
                        # The lexeme strings are the same, so merge the possible
                        # word forms from new to old.
                        foreach my $i ( 0 .. $#lexemes ) {
                                my $ol = $oldlex[$i];
                                my $nl = $lexemes[$i];
                                my %ofw;
-                               map { $ofw{$_->_stringify} = 1 } $ol->matching_forms;
+                               map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
                                foreach my $form ( $nl->matching_forms ) {
-                                       unless( $ofw{$form->_stringify} ) {
-                                               print STDERR "Adding form " . $form->_stringify . 
+                                       unless( $ofw{$form->to_string} ) {
+                                               print STDERR "Adding form " . $form->to_string . 
                                                        " to lexeme " . $nl->string . " at $reading\n";
                                                $ol->add_matching_form( $form );
                                                $ol->is_disambiguated(0);
@@ -151,35 +135,123 @@ sub _update_reading_lexemes {
                                }
                        }
                } else {
-                       $DB::single = 1;
                        warn "Lexeme layout for $reading changed; replacing the lot";
                        $reading->clear_lexemes;
                        $reading->add_lexeme( @lexemes );
                }
        } else {
+               $reading->clear_lexemes if $replace;
                $reading->add_lexeme( @lexemes );
        }
 }
 
-=head2 word_lookup( $word )
+=head2 reading_lookup( $rdg[, $rdg, ...] )
 
-Looks up a word using the Flemm package, and returns the possible results.
-It is better to use L<lemmatize> for context sensitivity.
+Looks up one or more readings using the Flemm package, and returns the
+possible results.  This uses the same logic as L<lemmatize> above for the
+entire tradition, but can also be used to (re-)analyze individual readings.
 
 =cut
 
-sub word_lookup {
-       my $word = shift;
-       my $tagresult = _treetag_string( $word );
-       my $lemmatizer;
-       try {
-               load 'Flemm';
-               $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
-       } catch {
-               warn "Cannot do French word lemmatization without Flemm: @_";
-               return;
+sub reading_lookup {
+       return _lemmatize_sequence( 1, @_ );
+}
+
+sub _lemmatize_sequence {
+       my( $replace, @path ) = @_;
+       my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
+       if( $tagresult ) {
+               # Map the tagged words onto the original readings, splitting 
+               # them up into lexemes where necessary.
+               # NOTE we can have multiple lexemes in a reading, but not
+               # multiple readings to a lexeme.
+               my @tags = split( /\n/, $tagresult );
+               my @lexemes;
+               my $curr_rdg = shift @path;
+               my @curr_lexemes;
+               my $unused_rdg_part;
+               foreach my $tag ( @tags ) {
+                       # Get the original word
+                       my( $lexeme, @rest ) = split( /\t/, $tag );
+                       # Lemmatize the whole
+                       my @forms = _parse_wordform( _flemm_lookup( $tag ) );
+                       my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
+                               'string' => $lexeme, 'language' => 'French',
+                               'wordform_matchlist' => \@forms );
+                       # Find the next non-meta reading
+                       while( $curr_rdg && $curr_rdg->is_meta ) {
+                               $curr_rdg = shift @path;
+                       }
+                       unless( $curr_rdg ) {
+                               warn "Ran out of readings in sequence at $lexeme";
+                               last;
+                       }
+                       my $curr_rdg_text = $curr_rdg->has_normal_form 
+                               ? $curr_rdg->normal_form : $curr_rdg->text;
+                       if( $unused_rdg_part &&
+                               $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+                               # Nth part of curr_rdg
+                               $unused_rdg_part = $2;
+                               push( @curr_lexemes, $lexobj );
+                       } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+                               # Flag an error if there is already an unused reading part.
+                               warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
+                                       if $unused_rdg_part;
+                               $unused_rdg_part = $2; # will be empty if the whole reading matched
+                               push( @curr_lexemes, $lexobj );
+                       } else {
+                               # We do not cope with the idea of a lexeme being 
+                               # spread across multiple readings.
+                               warn "Word sequence changed unexpectedly in text";
+                               # See if we can find a matching reading
+                               my @lookahead;
+                               my $matched;
+                               while( my $nr = shift @path ) {
+                                       my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
+                                       if( $nrtext =~ /^\Q$lexeme\E/ ) {
+                                               $curr_rdg = $lookahead[-1] if @lookahead;
+                                               $matched = 1;
+                                               last;
+                                       } else {
+                                               push( @lookahead, $nr );
+                                       }
+                               }
+                               # No match? Restore the state we had
+                               unless( $matched ) {
+                                       unshift( @path, @lookahead );
+                               }
+                               # Trigger a move
+                               $unused_rdg_part = '';
+                       }
+                       
+                       unless( $unused_rdg_part ) {
+                               # Record the lexemes for the given reading.
+                               #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
+                               #       join( ' ', map { $_->string } @curr_lexemes ),
+                               #       $curr_rdg->id, $curr_rdg->text );
+                               _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
+                               $curr_rdg = shift @path;
+                               @curr_lexemes = ();
+                       }
+               }
+       }
+}
+
+# Utility function so that we can cheat and use it when we need both the path
+# and its text.
+sub _text_from_path {
+       my( $normalize, @path ) = @_;
+       my $pathtext = '';
+       my $last;
+       foreach my $r ( @path ) {
+               unless ( $r->join_prior || !$last || $last->join_next ) {
+                       $pathtext .= ' ';
+               } 
+               $pathtext .= ( $normalize && $r->has_normal_form ) 
+                       ? $r->normal_form : $r->text;
+               $last = $r;
        }
-       return _parse_wordform( _flemm_lookup( $tagresult ) );
+       return $pathtext;
 }
 
 # Utility function that actually calls the tree tagger.
@@ -229,18 +301,37 @@ sub _parse_wordform {
        foreach ( @results ) {
                my( $orig, $tag, $lemma ) = split( /\t/, $_ );
                my( $pos, $morph ) = split( /:/, $tag );
-               my $wf = Text::Tradition::Collation::Reading::WordForm->new(
-                       'language' => 'French',
-                       'lemma' => $lemma,
-                       'morphology' => [ split( //, $morph ) ],
-                       );
-               push( @forms, $wf );
+               my $morphobj;
+               if( $morph ) {
+                       $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
+               } else {
+                       # Use the TreeTagger info if there is no Flemm morphology.
+                       $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
+               }
+               if( $morphobj ) {
+                       my $wf = Text::Tradition::Collation::Reading::WordForm->new(
+                               'language' => 'French',
+                               'lemma' => $lemma,
+                               'morphology' => $morphobj,
+                               );
+                       push( @forms, $wf );
+               } else {
+                       warn "No morphology found for word: $_";
+               }
        }
        return @forms;
 }
 
 1;
 
+=head2 TODO
+
+=over
+
+=item * Handle package dependencies more gracefully
+
+=back
+
 =head1 LICENSE
 
 This package is free software and is provided "as is" without express