add witness role for collation regularization
[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Language / French.pm
CommitLineData
cca4f996 1package Text::Tradition::Language::French;
2
f4b6b4d0 3use strict;
4use warnings;
f8862b58 5use Lingua::TagSet::Multext;
6use Lingua::TagSet::TreeTagger::French;
e0f6836a 7use Module::Load qw/ load /;
307d8db9 8use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
9 lfs_morph_tags unicode_regularize /;
d3e7842a 10use TryCatch;
11
cca4f996 12=head1 NAME
13
6ad2ce78 14Text::Tradition::Language::French - language-specific module for French
cca4f996 15
16=head1 DESCRIPTION
17
6ad2ce78 18Implements morphology lookup for French words in context. This module
19depends on the Flemm module for French lemmatization
20(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
21the TreeTagger software
22(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
23(for now) expected to be installed in $MORPHDIR/TreeTagger.
cca4f996 24
25=head1 SUBROUTINES
26
27=head2 lemmatize( $text )
28
29Evaluates the string using the Flemm package, and returns the results.
30
6ad2ce78 31=begin testing
32
33binmode STDOUT, ':utf8';
34use Text::Tradition;
35use_ok( 'Text::Tradition::Language::French' );
36
37eval "use Flemm";
38my $err = $@;
39
40SKIP: {
41 skip "Package Flemm not found" if $err;
42 my $tf = Text::Tradition->new(
43 'input' => 'Self',
44 'file' => 't/data/besoin.xml',
45 'language' => 'French' );
46
47 is( $tf->language, 'French', "Set language okay" );
48 $tf->lemmatize();
49 # Test the lemmatization. How many readings now have morphological info?
50 # Do the lexemes match the reading?
51 my $ambig = 0;
f8862b58 52 my $flemmed = 0;
6ad2ce78 53 foreach my $r ( $tf->collation->readings ) {
54 next if $r->is_meta;
55 ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
56 my @lex = $r->lexemes;
57 my $lexstr = join( '', map { $_->string } @lex );
58 my $textstr = $r->text;
59 $textstr =~ s/\s+//g;
60 is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
61 foreach my $l ( @lex ) {
f8862b58 62 # Check to see if Flemm actually ran
63 foreach my $wf ( $l->matching_forms ) {
64 $flemmed++ if $wf->morphology->get_feature('num');
65 }
6ad2ce78 66 next if $l->is_disambiguated;
6ad2ce78 67 $ambig++;
68 }
69 }
70 is( $ambig, 102, "Found 102 ambiguous forms as expected" );
f8862b58 71 ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
6ad2ce78 72
73 # Try setting the normal form of a reading and re-analyzing
e0f6836a 74 my $mr = $tf->collation->reading('r99.2');
6ad2ce78 75 is( $mr->text, 'minspire', "Picked correct test reading" );
76 is( $mr->language, 'French', "Reading has correct language setting" );
77 $mr->normal_form( "m'inspire" );
78 $mr->lemmatize;
f8862b58 79 my @l = $mr->lexemes;
80 is( @l, 2, "Got two lexemes for new m'inspire reading" );
81 is( $l[0]->form->to_string,
82 '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
83 "New reading has correct first lexeme" );
6ad2ce78 84}
85
86=end testing
87
cca4f996 88=cut
89
90sub lemmatize {
d3e7842a 91 my $tradition = shift;
e0f6836a 92 my %opts = (
93 'language' => 'French',
94 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }
95 );
96 return lemmatize_treetagger( $tradition, %opts );
cca4f996 97}
98
6ad2ce78 99=head2 reading_lookup( $rdg[, $rdg, ...] )
cca4f996 100
6ad2ce78 101Looks up one or more readings using the Flemm package, and returns the
102possible results. This uses the same logic as L<lemmatize> above for the
103entire tradition, but can also be used to (re-)analyze individual readings.
cca4f996 104
105=cut
106
6ad2ce78 107sub reading_lookup {
e0f6836a 108 my( @path ) = @_;
109 my %opts = (
110 'language' => 'French',
111 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) },
112 'path' => \@path,
113 );
114 return reading_lookup_treetagger( %opts );
d3e7842a 115}
116
75ae2b25 117=head2 morphology_tags
118
119Return a data structure describing the available parts of speech and their attributes.
120
121=cut
122
123sub morphology_tags {
124 return lfs_morph_tags();
125}
126
d3e7842a 127# Closure and utility function for the package lemmatizer
128{
129 my $lemmatizer;
130
131 sub _flemm_lookup {
132 # First try to load Flemm
133 unless( $lemmatizer ) {
134 try {
135 load 'Flemm';
136 $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
137 } catch {
138 warn "Cannot do French word lemmatization without Flemm: @_";
139 return;
140 }
141 }
142 return $lemmatizer->lemmatize( @_ )
143 }
cca4f996 144
145}
146
d3e7842a 147# Utility function to turn a Flemm result into a WordForm
148sub _parse_wordform {
149 my $flemmobj = shift;
150 # For now just parse the string, until we make sense of the documentation.
151 my @results = split( / \|\| /, $flemmobj->getResult );
152 my @forms;
153 foreach ( @results ) {
154 my( $orig, $tag, $lemma ) = split( /\t/, $_ );
155 my( $pos, $morph ) = split( /:/, $tag );
6ad2ce78 156 my $morphobj;
157 if( $morph ) {
f8862b58 158 $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
6ad2ce78 159 } else {
160 # Use the TreeTagger info if there is no Flemm morphology.
f8862b58 161 $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos );
6ad2ce78 162 }
163 if( $morphobj ) {
164 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
165 'language' => 'French',
166 'lemma' => $lemma,
167 'morphology' => $morphobj,
168 );
169 push( @forms, $wf );
170 } else {
171 warn "No morphology found for word: $_";
172 }
d3e7842a 173 }
174 return @forms;
175}
176
307d8db9 177=head2 regularize( $text )
178
179Returns a regularized form of the reading for the purposes of collation.
180
181=cut
182
183sub regularize {
184 return unicode_regularize( @_ );
185}
186
d3e7842a 1871;
188
6ad2ce78 189=head2 TODO
190
191=over
192
e0f6836a 193=item * Try to do more things with Perl objects in Flemm and TT
6ad2ce78 194
195=back
196