analysis script for upcoming presentation
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm
CommitLineData
cca4f996 1package Text::Tradition::Language::French;
2
f4b6b4d0 3use strict;
4use warnings;
f8862b58 5use Lingua::TagSet::Multext;
6use Lingua::TagSet::TreeTagger::French;
e0f6836a 7use Module::Load qw/ load /;
f8862b58 8use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
d3e7842a 9use TryCatch;
10
cca4f996 11=head1 NAME
12
6ad2ce78 13Text::Tradition::Language::French - language-specific module for French
cca4f996 14
15=head1 DESCRIPTION
16
6ad2ce78 17Implements morphology lookup for French words in context. This module
18depends on the Flemm module for French lemmatization
19(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
20the TreeTagger software
21(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
22(for now) expected to be installed in $MORPHDIR/TreeTagger.
cca4f996 23
24=head1 SUBROUTINES
25
26=head2 lemmatize( $text )
27
28Evaluates the string using the Flemm package, and returns the results.
29
6ad2ce78 30=begin testing
31
32binmode STDOUT, ':utf8';
33use Text::Tradition;
34use_ok( 'Text::Tradition::Language::French' );
35
36eval "use Flemm";
37my $err = $@;
38
39SKIP: {
40 skip "Package Flemm not found" if $err;
41 my $tf = Text::Tradition->new(
42 'input' => 'Self',
43 'file' => 't/data/besoin.xml',
44 'language' => 'French' );
45
46 is( $tf->language, 'French', "Set language okay" );
47 $tf->lemmatize();
48 # Test the lemmatization. How many readings now have morphological info?
49 # Do the lexemes match the reading?
50 my $ambig = 0;
f8862b58 51 my $flemmed = 0;
6ad2ce78 52 foreach my $r ( $tf->collation->readings ) {
53 next if $r->is_meta;
54 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
55 my @lex = $r->lexemes;
56 my $lexstr = join( '', map { $_->string } @lex );
57 my $textstr = $r->text;
58 $textstr =~ s/\s+//g;
59 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
60 foreach my $l ( @lex ) {
f8862b58 61 # Check to see if Flemm actually ran
62 foreach my $wf ( $l->matching_forms ) {
63 $flemmed++ if $wf->morphology->get_feature('num');
64 }
6ad2ce78 65 next if $l->is_disambiguated;
6ad2ce78 66 $ambig++;
67 }
68 }
69 is( $ambig, 102, "Found 102 ambiguous forms as expected" );
f8862b58 70 ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
6ad2ce78 71
72 # Try setting the normal form of a reading and re-analyzing
e0f6836a 73 my $mr = $tf->collation->reading('r99.2');
6ad2ce78 74 is( $mr->text, 'minspire', "Picked correct test reading" );
75 is( $mr->language, 'French', "Reading has correct language setting" );
76 $mr->normal_form( "m'inspire" );
77 $mr->lemmatize;
f8862b58 78 my @l = $mr->lexemes;
79 is( @l, 2, "Got two lexemes for new m'inspire reading" );
80 is( $l[0]->form->to_string,
81 '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
82 "New reading has correct first lexeme" );
6ad2ce78 83}
84
85=end testing
86
cca4f996 87=cut
88
89sub lemmatize {
d3e7842a 90 my $tradition = shift;
e0f6836a 91 my %opts = (
92 'language' => 'French',
93 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
94 );
95 return lemmatize_treetagger( $tradition, %opts );
cca4f996 96}
97
6ad2ce78 98=head2 reading_lookup( $rdg[, $rdg, ...] )
cca4f996 99
6ad2ce78 100Looks up one or more readings using the Flemm package, and returns the
101possible results. This uses the same logic as L<lemmatize> above for the
102entire tradition, but can also be used to (re-)analyze individual readings.
cca4f996 103
104=cut
105
6ad2ce78 106sub reading_lookup {
e0f6836a 107 my( @path ) = @_;
108 my %opts = (
109 'language' => 'French',
110 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
111 'path' => \@path,
112 );
113 return reading_lookup_treetagger( %opts );
d3e7842a 114}
115
75ae2b25 116=head2 morphology_tags
117
118Return a data structure describing the available parts of speech and their attributes.
119
120=cut
121
122sub morphology_tags {
123 return lfs_morph_tags();
124}
125
d3e7842a 126# Closure and utility function for the package lemmatizer
127{
128 my $lemmatizer;
129
130 sub _flemm_lookup {
131 # First try to load Flemm
132 unless( $lemmatizer ) {
133 try {
134 load 'Flemm';
135 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
136 } catch {
137 warn "Cannot do French word lemmatization without Flemm: @_";
138 return;
139 }
140 }
141 return $lemmatizer->lemmatize( @_ )
142 }
cca4f996 143
144}
145
d3e7842a 146# Utility function to turn a Flemm result into a WordForm
147sub _parse_wordform {
148 my $flemmobj = shift;
149 # For now just parse the string, until we make sense of the documentation.
150 my @results = split( / \|\| /, $flemmobj->getResult );
151 my @forms;
152 foreach ( @results ) {
153 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
154 my( $pos, $morph ) = split( /:/, $tag );
6ad2ce78 155 my $morphobj;
156 if( $morph ) {
f8862b58 157 $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
6ad2ce78 158 } else {
159 # Use the TreeTagger info if there is no Flemm morphology.
f8862b58 160 $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos );
6ad2ce78 161 }
162 if( $morphobj ) {
163 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
164 'language' => 'French',
165 'lemma' => $lemma,
166 'morphology' => $morphobj,
167 );
168 push( @forms, $wf );
169 } else {
170 warn "No morphology found for word: $_";
171 }
d3e7842a 172 }
173 return @forms;
174}
175
1761;
177
6ad2ce78 178=head2 TODO
179
180=over
181
e0f6836a 182=item * Try to do more things with Perl objects in Flemm and TT
6ad2ce78 183
184=back
185
cca4f996 186=head1 LICENSE
187
188This package is free software and is provided "as is" without express
189or implied warranty. You can redistribute it and/or modify it under
190the same terms as Perl itself.
191
192=head1 AUTHOR
193
194Tara L Andrews E<lt>aurum@cpan.orgE<gt>