package Text::Tradition::Language::French;
-use Encode qw/ encode_utf8 decode_utf8 /;
-use IPC::Run qw/ run binary /;
-use Module::Load;
-use Text::Tradition::Collation::Reading::Lexeme;
-use Text::Tradition::Collation::Reading::WordForm;
+use strict;
+use warnings;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger::French;
+use Module::Load qw/ load /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
use TryCatch;
-my $MORPHDIR = '/Users/tla/Projects/morphology';
-
=head1 NAME
-Text::Tradition::Language::French - language-specific modules for French
+Text::Tradition::Language::French - language-specific module for French
=head1 DESCRIPTION
-Implements morphology lookup for French words in context.
+Implements morphology lookup for French words in context. This module
+depends on the Flemm module for French lemmatization
+(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
+the TreeTagger software
+(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
+(for now) expected to be installed in $MORPHDIR/TreeTagger.
=head1 SUBROUTINES
Evaluates the string using the Flemm package, and returns the results.
-=cut
-
-sub lemmatize {
- my $tradition = shift;
-
- # Given a tradition, lemmatize it witness by witness and see what we get.
- my $workdir = File::Temp->newdir();
- my $c = $tradition->collation;
- # First, clear out all existing lexemes from the readings. Save the
- # path as long as we went to the trouble of generating it.
- my %witness_paths;
- foreach my $wit ( $tradition->witnesses ) {
- my @sigla = ( $wit->sigil );
- push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
- foreach my $sig ( @sigla ) {
- my @path = grep { !$_->is_meta }
- $c->reading_sequence( $c->start, $c->end, $sig );
- map { $_->clear_lexemes } @path;
- $witness_paths{$sig} = \@path;
- }
- }
-
- foreach my $sig ( keys %witness_paths ) {
- # Get the text as a sequence of readings and as a string
- print STDERR "Morphologizing witness $sig\n";
- my @path = @{$witness_paths{$sig}};
- my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
- if( $tagresult ) {
- # Map the tagged words onto the original readings, splitting
- # them up into lexemes where necessary.
- # NOTE we can have multiple lexemes in a reading, but not
- # multiple readings to a lexeme.
- my @tags = split( /\n/, $tagresult );
- my @lexemes;
- my $curr_rdg = shift @path;
- my @curr_lexemes;
- my $unused_rdg_part;
- foreach my $tag ( @tags ) {
- # Get the original word
- my( $lexeme, @rest ) = split( /\t/, $tag );
- # Lemmatize the whole
- my @forms = _parse_wordform( _flemm_lookup( $tag ) );
- my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
- 'string' => $lexeme, 'language' => 'French',
- 'wordform_matchlist' => \@forms );
- # Find the next non-meta reading
- while( $curr_rdg->is_meta ) {
- $curr_rdg = shift @path;
- }
- unless( $curr_rdg ) {
- warn "Ran out of readings in sequence for " . $wit->sigil
- . " at $lexeme";
- last;
- }
- if( $unused_rdg_part &&
- $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
- # Nth part of curr_rdg
- $unused_rdg_part = $2;
- push( @curr_lexemes, $lexobj );
- } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
- # Flag an error if there is already an unused reading part.
- warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
- if $unused_rdg_part;
- $unused_rdg_part = $2; # will be empty if the whole reading matched
- push( @curr_lexemes, $lexobj );
- } else {
- # We do not cope with the idea of a lexeme being
- # spread across multiple readings.
- warn "Word sequence changed unexpectedly in text";
- # See if we can find a matching reading
- my @lookahead;
- my $matched;
- while( my $nr = shift @path ) {
- if( $nr->text =~ /^\Q$lexeme\E/ ) {
- $curr_rdg = $lookahead[-1] if @lookahead;
- $matched = 1;
- last;
- } else {
- push( @lookahead, $nr );
- }
- }
- # No match? Restore the state we had
- unless( $matched ) {
- unshift( @path, @lookahead );
- }
- # Trigger a move
- $unused_rdg_part = '';
- }
-
- unless( $unused_rdg_part ) {
- # Record the lexemes for the given reading.
- #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
- # join( ' ', map { $_->string } @curr_lexemes ),
- # $curr_rdg->id, $curr_rdg->text );
- _update_reading_lexemes( $curr_rdg, @curr_lexemes );
- $curr_rdg = shift @path;
- @curr_lexemes = ();
- }
+=begin testing
+
+binmode STDOUT, ':utf8';
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::French' );
+
+eval "use Flemm";
+my $err = $@;
+
+SKIP: {
+ skip "Package Flemm not found" if $err;
+ my $tf = Text::Tradition->new(
+ 'input' => 'Self',
+ 'file' => 't/data/besoin.xml',
+ 'language' => 'French' );
+
+ is( $tf->language, 'French', "Set language okay" );
+ $tf->lemmatize();
+ # Test the lemmatization. How many readings now have morphological info?
+ # Do the lexemes match the reading?
+ my $ambig = 0;
+ my $flemmed = 0;
+ foreach my $r ( $tf->collation->readings ) {
+ next if $r->is_meta;
+ ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+ my @lex = $r->lexemes;
+ my $lexstr = join( '', map { $_->string } @lex );
+ my $textstr = $r->text;
+ $textstr =~ s/\s+//g;
+ is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+ foreach my $l ( @lex ) {
+ # Check to see if Flemm actually ran
+ foreach my $wf ( $l->matching_forms ) {
+ $flemmed++ if $wf->morphology->get_feature('num');
}
+ next if $l->is_disambiguated;
+ $ambig++;
}
}
+ is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+ ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
+
+ # Try setting the normal form of a reading and re-analyzing
+ my $mr = $tf->collation->reading('r99.2');
+ is( $mr->text, 'minspire', "Picked correct test reading" );
+ is( $mr->language, 'French', "Reading has correct language setting" );
+ $mr->normal_form( "m'inspire" );
+ $mr->lemmatize;
+ my @l = $mr->lexemes;
+ is( @l, 2, "Got two lexemes for new m'inspire reading" );
+ is( $l[0]->form->to_string,
+ '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
+ "New reading has correct first lexeme" );
}
-sub _update_reading_lexemes {
- my( $reading, @lexemes ) = @_;
- if( $reading->has_lexemes ) {
- # We need to merge what is in @lexemes with what we have already.
- my @oldlex = $reading->lexemes;
- my $cmp1 = join( '||', map { $_->string } @oldlex );
- my $cmp2 = join( '||', map { $_->string } @lexemes );
- if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) {
- # The lexeme strings are the same, so merge the possible
- # word forms from new to old.
- foreach my $i ( 0 .. $#lexemes ) {
- my $ol = $oldlex[$i];
- my $nl = $lexemes[$i];
- my %ofw;
- map { $ofw{$_->_stringify} = 1 } $ol->matching_forms;
- foreach my $form ( $nl->matching_forms ) {
- unless( $ofw{$form->_stringify} ) {
- print STDERR "Adding form " . $form->_stringify .
- " to lexeme " . $nl->string . " at $reading\n";
- $ol->add_matching_form( $form );
- $ol->is_disambiguated(0);
- }
- }
- }
- } else {
- $DB::single = 1;
- warn "Lexeme layout for $reading changed; replacing the lot";
- $reading->clear_lexemes;
- $reading->add_lexeme( @lexemes );
- }
- } else {
- $reading->add_lexeme( @lexemes );
- }
+=end testing
+
+=cut
+
+sub lemmatize {
+ my $tradition = shift;
+ my %opts = (
+ 'language' => 'French',
+ 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
+ );
+ return lemmatize_treetagger( $tradition, %opts );
}
-=head2 word_lookup( $word )
+=head2 reading_lookup( $rdg[, $rdg, ...] )
-Looks up a word using the Flemm package, and returns the possible results.
-It is better to use L<lemmatize> for context sensitivity.
+Looks up one or more readings using the Flemm package, and returns the
+possible results. This uses the same logic as L<lemmatize> above for the
+entire tradition, but can also be used to (re-)analyze individual readings.
=cut
-sub word_lookup {
- my $word = shift;
- my $tagresult = _treetag_string( $word );
- my $lemmatizer;
- try {
- load 'Flemm';
- $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
- } catch {
- warn "Cannot do French word lemmatization without Flemm: @_";
- return;
- }
- return _parse_wordform( _flemm_lookup( $tagresult ) );
+sub reading_lookup {
+ my( @path ) = @_;
+ my %opts = (
+ 'language' => 'French',
+ 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
+ 'path' => \@path,
+ );
+ return reading_lookup_treetagger( %opts );
}
-# Utility function that actually calls the tree tagger.
-sub _treetag_string {
- my( $text ) = @_;
- my $wittext = encode_utf8( $text );
- # Then see if we have TreeTagger
- my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
- unless( -f $taggercmd ) {
- warn "Cannot do French word lemmatization without TreeTagger";
- return;
- }
- # OK, we can run it then.
- my @cmd = ( $taggercmd );
- my( $tagresult, $err ); # Capture the output and error
- run( \@cmd, \$wittext, \$tagresult, \$err );
- # TODO check for error
- return decode_utf8( $tagresult );
+=head2 morphology_tags
+
+Return a data structure describing the available parts of speech and their attributes.
+
+=cut
+
+sub morphology_tags {
+ return lfs_morph_tags();
}
# Closure and utility function for the package lemmatizer
foreach ( @results ) {
my( $orig, $tag, $lemma ) = split( /\t/, $_ );
my( $pos, $morph ) = split( /:/, $tag );
- my $wf = Text::Tradition::Collation::Reading::WordForm->new(
- 'language' => 'French',
- 'lemma' => $lemma,
- 'morphology' => [ split( //, $morph ) ],
- );
- push( @forms, $wf );
+ my $morphobj;
+ if( $morph ) {
+ $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
+ } else {
+ # Use the TreeTagger info if there is no Flemm morphology.
+ $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos );
+ }
+ if( $morphobj ) {
+ my $wf = Text::Tradition::Collation::Reading::WordForm->new(
+ 'language' => 'French',
+ 'lemma' => $lemma,
+ 'morphology' => $morphobj,
+ );
+ push( @forms, $wf );
+ } else {
+ warn "No morphology found for word: $_";
+ }
}
return @forms;
}
1;
+=head2 TODO
+
+=over
+
+=item * Try to do more things with Perl objects in Flemm and TT
+
+=back
+
=head1 LICENSE
This package is free software and is provided "as is" without express