refactor English/French shared TT logic into Base.pm
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm
CommitLineData
cca4f996 1package Text::Tradition::Language::French;
2
f4b6b4d0 3use strict;
4use warnings;
e0f6836a 5use Module::Load qw/ load /;
6use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /;
d3e7842a 7use TryCatch;
8
cca4f996 9=head1 NAME
10
6ad2ce78 11Text::Tradition::Language::French - language-specific module for French
cca4f996 12
13=head1 DESCRIPTION
14
6ad2ce78 15Implements morphology lookup for French words in context. This module
16depends on the Flemm module for French lemmatization
17(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
18the TreeTagger software
19(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
20(for now) expected to be installed in $MORPHDIR/TreeTagger.
cca4f996 21
22=head1 SUBROUTINES
23
24=head2 lemmatize( $text )
25
26Evaluates the string using the Flemm package, and returns the results.
27
6ad2ce78 28=begin testing
29
30binmode STDOUT, ':utf8';
31use Text::Tradition;
32use_ok( 'Text::Tradition::Language::French' );
33
34eval "use Flemm";
35my $err = $@;
36
37SKIP: {
38 skip "Package Flemm not found" if $err;
39 my $tf = Text::Tradition->new(
40 'input' => 'Self',
41 'file' => 't/data/besoin.xml',
42 'language' => 'French' );
43
44 is( $tf->language, 'French', "Set language okay" );
45 $tf->lemmatize();
46 # Test the lemmatization. How many readings now have morphological info?
47 # Do the lexemes match the reading?
48 my $ambig = 0;
49 foreach my $r ( $tf->collation->readings ) {
50 next if $r->is_meta;
51 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
52 my @lex = $r->lexemes;
53 my $lexstr = join( '', map { $_->string } @lex );
54 my $textstr = $r->text;
55 $textstr =~ s/\s+//g;
56 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
57 foreach my $l ( @lex ) {
58 next if $l->is_disambiguated;
59 # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
60 # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
61 $ambig++;
62 }
63 }
64 is( $ambig, 102, "Found 102 ambiguous forms as expected" );
65
66 # Try setting the normal form of a reading and re-analyzing
e0f6836a 67 my $mr = $tf->collation->reading('r99.2');
6ad2ce78 68 is( $mr->text, 'minspire', "Picked correct test reading" );
69 is( $mr->language, 'French', "Reading has correct language setting" );
70 $mr->normal_form( "m'inspire" );
71 $mr->lemmatize;
72 is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
73}
74
75=end testing
76
cca4f996 77=cut
78
79sub lemmatize {
d3e7842a 80 my $tradition = shift;
e0f6836a 81 my %opts = (
82 'language' => 'French',
83 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
84 );
85 return lemmatize_treetagger( $tradition, %opts );
cca4f996 86}
87
6ad2ce78 88=head2 reading_lookup( $rdg[, $rdg, ...] )
cca4f996 89
6ad2ce78 90Looks up one or more readings using the Flemm package, and returns the
91possible results. This uses the same logic as L<lemmatize> above for the
92entire tradition, but can also be used to (re-)analyze individual readings.
cca4f996 93
94=cut
95
6ad2ce78 96sub reading_lookup {
e0f6836a 97 my( @path ) = @_;
98 my %opts = (
99 'language' => 'French',
100 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
101 'path' => \@path,
102 );
103 return reading_lookup_treetagger( %opts );
d3e7842a 104}
105
106# Closure and utility function for the package lemmatizer
107{
108 my $lemmatizer;
109
110 sub _flemm_lookup {
111 # First try to load Flemm
112 unless( $lemmatizer ) {
113 try {
114 load 'Flemm';
115 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
116 } catch {
117 warn "Cannot do French word lemmatization without Flemm: @_";
118 return;
119 }
120 }
121 return $lemmatizer->lemmatize( @_ )
122 }
cca4f996 123
124}
125
d3e7842a 126# Utility function to turn a Flemm result into a WordForm
127sub _parse_wordform {
128 my $flemmobj = shift;
129 # For now just parse the string, until we make sense of the documentation.
130 my @results = split( / \|\| /, $flemmobj->getResult );
131 my @forms;
132 foreach ( @results ) {
133 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
134 my( $pos, $morph ) = split( /:/, $tag );
6ad2ce78 135 my $morphobj;
136 if( $morph ) {
137 $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
138 } else {
139 # Use the TreeTagger info if there is no Flemm morphology.
140 $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
141 }
142 if( $morphobj ) {
143 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
144 'language' => 'French',
145 'lemma' => $lemma,
146 'morphology' => $morphobj,
147 );
148 push( @forms, $wf );
149 } else {
150 warn "No morphology found for word: $_";
151 }
d3e7842a 152 }
153 return @forms;
154}
155
1561;
157
6ad2ce78 158=head2 TODO
159
160=over
161
e0f6836a 162=item * Try to do more things with Perl objects in Flemm and TT
6ad2ce78 163
164=back
165
cca4f996 166=head1 LICENSE
167
168This package is free software and is provided "as is" without express
169or implied warranty. You can redistribute it and/or modify it under
170the same terms as Perl itself.
171
172=head1 AUTHOR
173
174Tara L Andrews E<lt>aurum@cpan.orgE<gt>