introspect for morphology values; include these in help; make sure Perseus results...
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm
CommitLineData
cca4f996 1package Text::Tradition::Language::French;
2
f4b6b4d0 3use strict;
4use warnings;
e0f6836a 5use Module::Load qw/ load /;
75ae2b25 6use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct lfs_morph_tags /;
d3e7842a 7use TryCatch;
8
cca4f996 9=head1 NAME
10
6ad2ce78 11Text::Tradition::Language::French - language-specific module for French
cca4f996 12
13=head1 DESCRIPTION
14
6ad2ce78 15Implements morphology lookup for French words in context. This module
16depends on the Flemm module for French lemmatization
17(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
18the TreeTagger software
19(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
20(for now) expected to be installed in $MORPHDIR/TreeTagger.
cca4f996 21
22=head1 SUBROUTINES
23
24=head2 lemmatize( $text )
25
26Evaluates the string using the Flemm package, and returns the results.
27
6ad2ce78 28=begin testing
29
30binmode STDOUT, ':utf8';
31use Text::Tradition;
32use_ok( 'Text::Tradition::Language::French' );
33
34eval "use Flemm";
35my $err = $@;
36
37SKIP: {
38 skip "Package Flemm not found" if $err;
39 my $tf = Text::Tradition->new(
40 'input' => 'Self',
41 'file' => 't/data/besoin.xml',
42 'language' => 'French' );
43
44 is( $tf->language, 'French', "Set language okay" );
45 $tf->lemmatize();
46 # Test the lemmatization. How many readings now have morphological info?
47 # Do the lexemes match the reading?
48 my $ambig = 0;
49 foreach my $r ( $tf->collation->readings ) {
50 next if $r->is_meta;
51 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
52 my @lex = $r->lexemes;
53 my $lexstr = join( '', map { $_->string } @lex );
54 my $textstr = $r->text;
55 $textstr =~ s/\s+//g;
56 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
57 foreach my $l ( @lex ) {
58 next if $l->is_disambiguated;
59 # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
60 # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
61 $ambig++;
62 }
63 }
64 is( $ambig, 102, "Found 102 ambiguous forms as expected" );
65
66 # Try setting the normal form of a reading and re-analyzing
e0f6836a 67 my $mr = $tf->collation->reading('r99.2');
6ad2ce78 68 is( $mr->text, 'minspire', "Picked correct test reading" );
69 is( $mr->language, 'French', "Reading has correct language setting" );
70 $mr->normal_form( "m'inspire" );
71 $mr->lemmatize;
72 is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
73}
74
75=end testing
76
cca4f996 77=cut
78
79sub lemmatize {
d3e7842a 80 my $tradition = shift;
e0f6836a 81 my %opts = (
82 'language' => 'French',
83 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
84 );
85 return lemmatize_treetagger( $tradition, %opts );
cca4f996 86}
87
6ad2ce78 88=head2 reading_lookup( $rdg[, $rdg, ...] )
cca4f996 89
6ad2ce78 90Looks up one or more readings using the Flemm package, and returns the
91possible results. This uses the same logic as L<lemmatize> above for the
92entire tradition, but can also be used to (re-)analyze individual readings.
cca4f996 93
94=cut
95
6ad2ce78 96sub reading_lookup {
e0f6836a 97 my( @path ) = @_;
98 my %opts = (
99 'language' => 'French',
100 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
101 'path' => \@path,
102 );
103 return reading_lookup_treetagger( %opts );
d3e7842a 104}
105
75ae2b25 106=head2 morphology_tags
107
108Return a data structure describing the available parts of speech and their attributes.
109
110=cut
111
112sub morphology_tags {
113 return lfs_morph_tags();
114}
115
d3e7842a 116# Closure and utility function for the package lemmatizer
117{
118 my $lemmatizer;
119
120 sub _flemm_lookup {
121 # First try to load Flemm
122 unless( $lemmatizer ) {
123 try {
124 load 'Flemm';
125 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
126 } catch {
127 warn "Cannot do French word lemmatization without Flemm: @_";
128 return;
129 }
130 }
131 return $lemmatizer->lemmatize( @_ )
132 }
cca4f996 133
134}
135
d3e7842a 136# Utility function to turn a Flemm result into a WordForm
137sub _parse_wordform {
138 my $flemmobj = shift;
139 # For now just parse the string, until we make sense of the documentation.
140 my @results = split( / \|\| /, $flemmobj->getResult );
141 my @forms;
142 foreach ( @results ) {
143 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
144 my( $pos, $morph ) = split( /:/, $tag );
6ad2ce78 145 my $morphobj;
146 if( $morph ) {
5271a011 147 $morphobj = multext_struct( $morph );
6ad2ce78 148 } else {
149 # Use the TreeTagger info if there is no Flemm morphology.
5271a011 150 $morphobj = treetagger_struct( $pos );
6ad2ce78 151 }
152 if( $morphobj ) {
153 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
154 'language' => 'French',
155 'lemma' => $lemma,
156 'morphology' => $morphobj,
157 );
158 push( @forms, $wf );
159 } else {
160 warn "No morphology found for word: $_";
161 }
d3e7842a 162 }
163 return @forms;
164}
165
1661;
167
6ad2ce78 168=head2 TODO
169
170=over
171
e0f6836a 172=item * Try to do more things with Perl objects in Flemm and TT
6ad2ce78 173
174=back
175
cca4f996 176=head1 LICENSE
177
178This package is free software and is provided "as is" without express
179or implied warranty. You can redistribute it and/or modify it under
180the same terms as Perl itself.
181
182=head1 AUTHOR
183
184Tara L Andrews E<lt>aurum@cpan.orgE<gt>