morphology/lib/Text/Tradition/Language/French.pm

   1 package Text::Tradition::Language::French;
   2
   3 use strict;
   4 use warnings;
   5 use Lingua::TagSet::Multext;
   6 use Lingua::TagSet::TreeTagger::French;
   7 use Module::Load qw/ load /;
   8 use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
   9         lfs_morph_tags unicode_regularize /;
  10 use TryCatch;
  11
  12 =head1 NAME
  13
  14 Text::Tradition::Language::French - language-specific module for French
  15
  16 =head1 DESCRIPTION
  17
  18 Implements morphology lookup for French words in context.  This module
  19 depends on the Flemm module for French lemmatization
  20 (L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
  21 the TreeTagger software
  22 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
  23 (for now) expected to be installed in $MORPHDIR/TreeTagger.
  24
  25 =head1 SUBROUTINES
  26
  27 =head2 lemmatize( $text )
  28
  29 Evaluates the string using the Flemm package, and returns the results.
  30
  31 =begin testing
  32
  33 binmode STDOUT, ':utf8';
  34 use Text::Tradition;
  35 use_ok( 'Text::Tradition::Language::French' );
  36
  37 eval "use Flemm";
  38 my $err = $@;
  39
  40 SKIP: {
  41         skip "Package Flemm not found" if $err;
  42         my $tf = Text::Tradition->new(
  43                 'input' => 'Self',
  44                 'file' => 't/data/besoin.xml',
  45                 'language' => 'French' );
  46
  47         is( $tf->language, 'French', "Set language okay" );
  48         $tf->lemmatize();
  49         # Test the lemmatization. How many readings now have morphological info?
  50         # Do the lexemes match the reading?
  51         my $ambig = 0;
  52         my $flemmed = 0;
  53         foreach my $r ( $tf->collation->readings ) {
  54                 next if $r->is_meta;
  55                 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
  56                 my @lex = $r->lexemes;
  57                 my $lexstr = join( '', map { $_->string } @lex );
  58                 my $textstr = $r->text;
  59                 $textstr =~ s/\s+//g;
  60                 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
  61                 foreach my $l ( @lex ) {
  62                         # Check to see if Flemm actually ran
  63                         foreach my $wf ( $l->matching_forms ) {
  64                                 $flemmed++ if $wf->morphology->get_feature('num');
  65                         }
  66                         next if $l->is_disambiguated;
  67                         $ambig++;
  68                 }
  69         }
  70         is( $ambig, 102, "Found 102 ambiguous forms as expected" );
  71         ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
  72
  73         # Try setting the normal form of a reading and re-analyzing
  74         my $mr = $tf->collation->reading('r99.2');
  75         is( $mr->text, 'minspire', "Picked correct test reading" );
  76         is( $mr->language, 'French', "Reading has correct language setting" );
  77         $mr->normal_form( "m'inspire" );
  78         $mr->lemmatize;
  79         my @l = $mr->lexemes;
  80         is( @l, 2, "Got two lexemes for new m'inspire reading" );
  81         is( $l[0]->form->to_string,
  82                 '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
  83                 "New reading has correct first lexeme" );
  84 }
  85
  86 =end testing
  87
  88 =cut
  89
  90 sub lemmatize {
  91         my $tradition = shift;
  92         my %opts = (
  93                 'language' => 'French',
  94                 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
  95                 );
  96         return lemmatize_treetagger( $tradition, %opts );
  97 }
  98
  99 =head2 reading_lookup( $rdg[, $rdg, ...] )
 100
 101 Looks up one or more readings using the Flemm package, and returns the
 102 possible results.  This uses the same logic as L<lemmatize> above for the
 103 entire tradition, but can also be used to (re-)analyze individual readings.
 104
 105 =cut
 106
 107 sub reading_lookup {
 108         my( @path ) = @_;
 109         my %opts = (
 110                 'language' => 'French',
 111                 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
 112                 'path' => \@path,
 113                 );
 114         return reading_lookup_treetagger( %opts );
 115 }
 116
 117 =head2 morphology_tags
 118
 119 Return a data structure describing the available parts of speech and their attributes.
 120
 121 =cut
 122
 123 sub morphology_tags {
 124         return lfs_morph_tags();
 125 }
 126
 127 # Closure and utility function for the package lemmatizer
 128 {
 129         my $lemmatizer;
 130
 131         sub _flemm_lookup {
 132                 # First try to load Flemm
 133                 unless( $lemmatizer ) {
 134                         try {
 135                                 load 'Flemm';
 136                                 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
 137                         } catch {
 138                                 warn "Cannot do French word lemmatization without Flemm: @_";
 139                                 return;
 140                         }
 141                 }
 142                 return $lemmatizer->lemmatize( @_ )
 143         }
 144
 145 }
 146
 147 # Utility function to turn a Flemm result into a WordForm
 148 sub _parse_wordform {
 149         my $flemmobj = shift;
 150         # For now just parse the string, until we make sense of the documentation.
 151         my @results = split( / \|\| /, $flemmobj->getResult );
 152         my @forms;
 153         foreach ( @results ) {
 154                 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
 155                 my( $pos, $morph ) = split( /:/, $tag );
 156                 my $morphobj;
 157                 if( $morph ) {
 158                         $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
 159                 } else {
 160                         # Use the TreeTagger info if there is no Flemm morphology.
 161                         $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos );
 162                 }
 163                 if( $morphobj ) {
 164                         my $wf = Text::Tradition::Collation::Reading::WordForm->new(
 165                                 'language' => 'French',
 166                                 'lemma' => $lemma,
 167                                 'morphology' => $morphobj,
 168                                 );
 169                         push( @forms, $wf );
 170                 } else {
 171                         warn "No morphology found for word: $_";
 172                 }
 173         }
 174         return @forms;
 175 }
 176
 177 =head2 regularize( $text )
 178
 179 Returns a regularized form of the reading for the purposes of collation.
 180
 181 =cut
 182
 183 sub regularize {
 184         return unicode_regularize( @_ );
 185 }
 186
 187 1;
 188
 189 =head2 TODO
 190
 191 =over
 192
 193 =item * Try to do more things with Perl objects in Flemm and TT
 194
 195 =back
 196