1 package Text::Tradition::Language::French;
5 use Module::Load qw/ load /;
6 use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct /;
11 Text::Tradition::Language::French - language-specific module for French
15 Implements morphology lookup for French words in context. This module
16 depends on the Flemm module for French lemmatization
17 (L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
18 the TreeTagger software
19 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
20 (for now) expected to be installed in $MORPHDIR/TreeTagger.
24 =head2 lemmatize( $text )
26 Evaluates the string using the Flemm package, and returns the results.
30 binmode STDOUT, ':utf8';
32 use_ok( 'Text::Tradition::Language::French' );
38 skip "Package Flemm not found" if $err;
39 my $tf = Text::Tradition->new(
41 'file' => 't/data/besoin.xml',
42 'language' => 'French' );
44 is( $tf->language, 'French', "Set language okay" );
46 # Test the lemmatization. How many readings now have morphological info?
47 # Do the lexemes match the reading?
49 foreach my $r ( $tf->collation->readings ) {
51 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
52 my @lex = $r->lexemes;
53 my $lexstr = join( '', map { $_->string } @lex );
54 my $textstr = $r->text;
56 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
57 foreach my $l ( @lex ) {
58 next if $l->is_disambiguated;
59 # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
60 # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
64 is( $ambig, 102, "Found 102 ambiguous forms as expected" );
66 # Try setting the normal form of a reading and re-analyzing
67 my $mr = $tf->collation->reading('r99.2');
68 is( $mr->text, 'minspire', "Picked correct test reading" );
69 is( $mr->language, 'French', "Reading has correct language setting" );
70 $mr->normal_form( "m'inspire" );
72 is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
80 my $tradition = shift;
82 'language' => 'French',
83 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
85 return lemmatize_treetagger( $tradition, %opts );
88 =head2 reading_lookup( $rdg[, $rdg, ...] )
90 Looks up one or more readings using the Flemm package, and returns the
91 possible results. This uses the same logic as L<lemmatize> above for the
92 entire tradition, but can also be used to (re-)analyze individual readings.
99 'language' => 'French',
100 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
103 return reading_lookup_treetagger( %opts );
106 # Closure and utility function for the package lemmatizer
111 # First try to load Flemm
112 unless( $lemmatizer ) {
115 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
117 warn "Cannot do French word lemmatization without Flemm: @_";
121 return $lemmatizer->lemmatize( @_ )
126 # Utility function to turn a Flemm result into a WordForm
127 sub _parse_wordform {
128 my $flemmobj = shift;
129 # For now just parse the string, until we make sense of the documentation.
130 my @results = split( / \|\| /, $flemmobj->getResult );
132 foreach ( @results ) {
133 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
134 my( $pos, $morph ) = split( /:/, $tag );
137 $morphobj = multext_struct( $morph );
139 # Use the TreeTagger info if there is no Flemm morphology.
140 $morphobj = treetagger_struct( $pos );
143 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
144 'language' => 'French',
146 'morphology' => $morphobj,
150 warn "No morphology found for word: $_";
162 =item * Try to do more things with Perl objects in Flemm and TT
168 This package is free software and is provided "as is" without express
169 or implied warranty. You can redistribute it and/or modify it under
170 the same terms as Perl itself.
174 Tara L Andrews E<lt>aurum@cpan.orgE<gt>