1 package Text::Tradition::Language::French;
5 use Lingua::TagSet::Multext;
6 use Lingua::TagSet::TreeTagger::French;
7 use Module::Load qw/ load /;
8 use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
13 Text::Tradition::Language::French - language-specific module for French
17 Implements morphology lookup for French words in context. This module
18 depends on the Flemm module for French lemmatization
19 (L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
20 the TreeTagger software
21 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
22 (for now) expected to be installed in $MORPHDIR/TreeTagger.
26 =head2 lemmatize( $text )
28 Evaluates the string using the Flemm package, and returns the results.
32 binmode STDOUT, ':utf8';
34 use_ok( 'Text::Tradition::Language::French' );
40 skip "Package Flemm not found" if $err;
41 my $tf = Text::Tradition->new(
43 'file' => 't/data/besoin.xml',
44 'language' => 'French' );
46 is( $tf->language, 'French', "Set language okay" );
48 # Test the lemmatization. How many readings now have morphological info?
49 # Do the lexemes match the reading?
52 foreach my $r ( $tf->collation->readings ) {
54 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
55 my @lex = $r->lexemes;
56 my $lexstr = join( '', map { $_->string } @lex );
57 my $textstr = $r->text;
59 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
60 foreach my $l ( @lex ) {
61 # Check to see if Flemm actually ran
62 foreach my $wf ( $l->matching_forms ) {
63 $flemmed++ if $wf->morphology->get_feature('num');
65 next if $l->is_disambiguated;
69 is( $ambig, 102, "Found 102 ambiguous forms as expected" );
70 ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
72 # Try setting the normal form of a reading and re-analyzing
73 my $mr = $tf->collation->reading('r99.2');
74 is( $mr->text, 'minspire', "Picked correct test reading" );
75 is( $mr->language, 'French', "Reading has correct language setting" );
76 $mr->normal_form( "m'inspire" );
79 is( @l, 2, "Got two lexemes for new m'inspire reading" );
80 is( $l[0]->form->to_string,
81 '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
82 "New reading has correct first lexeme" );
90 my $tradition = shift;
92 'language' => 'French',
93 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
95 return lemmatize_treetagger( $tradition, %opts );
98 =head2 reading_lookup( $rdg[, $rdg, ...] )
100 Looks up one or more readings using the Flemm package, and returns the
101 possible results. This uses the same logic as L<lemmatize> above for the
102 entire tradition, but can also be used to (re-)analyze individual readings.
109 'language' => 'French',
110 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
113 return reading_lookup_treetagger( %opts );
116 =head2 morphology_tags
118 Return a data structure describing the available parts of speech and their attributes.
122 sub morphology_tags {
123 return lfs_morph_tags();
126 # Closure and utility function for the package lemmatizer
131 # First try to load Flemm
132 unless( $lemmatizer ) {
135 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
137 warn "Cannot do French word lemmatization without Flemm: @_";
141 return $lemmatizer->lemmatize( @_ )
146 # Utility function to turn a Flemm result into a WordForm
147 sub _parse_wordform {
148 my $flemmobj = shift;
149 # For now just parse the string, until we make sense of the documentation.
150 my @results = split( / \|\| /, $flemmobj->getResult );
152 foreach ( @results ) {
153 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
154 my( $pos, $morph ) = split( /:/, $tag );
157 $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
159 # Use the TreeTagger info if there is no Flemm morphology.
160 $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos );
163 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
164 'language' => 'French',
166 'morphology' => $morphobj,
170 warn "No morphology found for word: $_";
182 =item * Try to do more things with Perl objects in Flemm and TT
188 This package is free software and is provided "as is" without express
189 or implied warranty. You can redistribute it and/or modify it under
190 the same terms as Perl itself.
194 Tara L Andrews E<lt>aurum@cpan.orgE<gt>