1 package Text::Tradition::Language::French;
5 use Module::Load qw/ load /;
6 use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct lfs_morph_tags /;
11 Text::Tradition::Language::French - language-specific module for French
15 Implements morphology lookup for French words in context. This module
16 depends on the Flemm module for French lemmatization
17 (L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
18 the TreeTagger software
19 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
20 (for now) expected to be installed in $MORPHDIR/TreeTagger.
24 =head2 lemmatize( $text )
26 Evaluates the string using the Flemm package, and returns the results.
30 binmode STDOUT, ':utf8';
32 use_ok( 'Text::Tradition::Language::French' );
38 skip "Package Flemm not found" if $err;
39 my $tf = Text::Tradition->new(
41 'file' => 't/data/besoin.xml',
42 'language' => 'French' );
44 is( $tf->language, 'French', "Set language okay" );
46 # Test the lemmatization. How many readings now have morphological info?
47 # Do the lexemes match the reading?
49 foreach my $r ( $tf->collation->readings ) {
51 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
52 my @lex = $r->lexemes;
53 my $lexstr = join( '', map { $_->string } @lex );
54 my $textstr = $r->text;
56 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
57 foreach my $l ( @lex ) {
58 next if $l->is_disambiguated;
59 # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
60 # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
64 is( $ambig, 102, "Found 102 ambiguous forms as expected" );
66 # Try setting the normal form of a reading and re-analyzing
67 my $mr = $tf->collation->reading('r99.2');
68 is( $mr->text, 'minspire', "Picked correct test reading" );
69 is( $mr->language, 'French', "Reading has correct language setting" );
70 $mr->normal_form( "m'inspire" );
72 is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
80 my $tradition = shift;
82 'language' => 'French',
83 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
85 return lemmatize_treetagger( $tradition, %opts );
88 =head2 reading_lookup( $rdg[, $rdg, ...] )
90 Looks up one or more readings using the Flemm package, and returns the
91 possible results. This uses the same logic as L<lemmatize> above for the
92 entire tradition, but can also be used to (re-)analyze individual readings.
99 'language' => 'French',
100 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
103 return reading_lookup_treetagger( %opts );
106 =head2 morphology_tags
108 Return a data structure describing the available parts of speech and their attributes.
112 sub morphology_tags {
113 return lfs_morph_tags();
116 # Closure and utility function for the package lemmatizer
121 # First try to load Flemm
122 unless( $lemmatizer ) {
125 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
127 warn "Cannot do French word lemmatization without Flemm: @_";
131 return $lemmatizer->lemmatize( @_ )
136 # Utility function to turn a Flemm result into a WordForm
137 sub _parse_wordform {
138 my $flemmobj = shift;
139 # For now just parse the string, until we make sense of the documentation.
140 my @results = split( / \|\| /, $flemmobj->getResult );
142 foreach ( @results ) {
143 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
144 my( $pos, $morph ) = split( /:/, $tag );
147 $morphobj = multext_struct( $morph );
149 # Use the TreeTagger info if there is no Flemm morphology.
150 $morphobj = treetagger_struct( $pos );
153 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
154 'language' => 'French',
156 'morphology' => $morphobj,
160 warn "No morphology found for word: $_";
172 =item * Try to do more things with Perl objects in Flemm and TT
178 This package is free software and is provided "as is" without express
179 or implied warranty. You can redistribute it and/or modify it under
180 the same terms as Perl itself.
184 Tara L Andrews E<lt>aurum@cpan.orgE<gt>