Commit | Line | Data |
cca4f996 |
1 | package Text::Tradition::Language::French; |
2 | |
f4b6b4d0 |
3 | use strict; |
4 | use warnings; |
f8862b58 |
5 | use Lingua::TagSet::Multext; |
6 | use Lingua::TagSet::TreeTagger::French; |
e0f6836a |
7 | use Module::Load qw/ load /; |
f8862b58 |
8 | use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; |
d3e7842a |
9 | use TryCatch; |
10 | |
cca4f996 |
11 | =head1 NAME |
12 | |
6ad2ce78 |
13 | Text::Tradition::Language::French - language-specific module for French |
cca4f996 |
14 | |
15 | =head1 DESCRIPTION |
16 | |
6ad2ce78 |
17 | Implements morphology lookup for French words in context. This module |
18 | depends on the Flemm module for French lemmatization |
19 | (L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with |
20 | the TreeTagger software |
21 | (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is |
22 | (for now) expected to be installed in $MORPHDIR/TreeTagger. |
cca4f996 |
23 | |
24 | =head1 SUBROUTINES |
25 | |
26 | =head2 lemmatize( $text ) |
27 | |
28 | Evaluates the string using the Flemm package, and returns the results. |
29 | |
6ad2ce78 |
30 | =begin testing |
31 | |
32 | binmode STDOUT, ':utf8'; |
33 | use Text::Tradition; |
34 | use_ok( 'Text::Tradition::Language::French' ); |
35 | |
36 | eval "use Flemm"; |
37 | my $err = $@; |
38 | |
39 | SKIP: { |
40 | skip "Package Flemm not found" if $err; |
41 | my $tf = Text::Tradition->new( |
42 | 'input' => 'Self', |
43 | 'file' => 't/data/besoin.xml', |
44 | 'language' => 'French' ); |
45 | |
46 | is( $tf->language, 'French', "Set language okay" ); |
47 | $tf->lemmatize(); |
48 | # Test the lemmatization. How many readings now have morphological info? |
49 | # Do the lexemes match the reading? |
50 | my $ambig = 0; |
f8862b58 |
51 | my $flemmed = 0; |
6ad2ce78 |
52 | foreach my $r ( $tf->collation->readings ) { |
53 | next if $r->is_meta; |
54 | ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); |
55 | my @lex = $r->lexemes; |
56 | my $lexstr = join( '', map { $_->string } @lex ); |
57 | my $textstr = $r->text; |
58 | $textstr =~ s/\s+//g; |
59 | is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); |
60 | foreach my $l ( @lex ) { |
f8862b58 |
61 | # Check to see if Flemm actually ran |
62 | foreach my $wf ( $l->matching_forms ) { |
63 | $flemmed++ if $wf->morphology->get_feature('num'); |
64 | } |
6ad2ce78 |
65 | next if $l->is_disambiguated; |
6ad2ce78 |
66 | $ambig++; |
67 | } |
68 | } |
69 | is( $ambig, 102, "Found 102 ambiguous forms as expected" ); |
f8862b58 |
70 | ok( $flemmed > 500, "Found enough Flemm info in wordforms" ); |
6ad2ce78 |
71 | |
72 | # Try setting the normal form of a reading and re-analyzing |
e0f6836a |
73 | my $mr = $tf->collation->reading('r99.2'); |
6ad2ce78 |
74 | is( $mr->text, 'minspire', "Picked correct test reading" ); |
75 | is( $mr->language, 'French', "Reading has correct language setting" ); |
76 | $mr->normal_form( "m'inspire" ); |
77 | $mr->lemmatize; |
f8862b58 |
78 | my @l = $mr->lexemes; |
79 | is( @l, 2, "Got two lexemes for new m'inspire reading" ); |
80 | is( $l[0]->form->to_string, |
81 | '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"', |
82 | "New reading has correct first lexeme" ); |
6ad2ce78 |
83 | } |
84 | |
85 | =end testing |
86 | |
cca4f996 |
87 | =cut |
88 | |
89 | sub lemmatize { |
d3e7842a |
90 | my $tradition = shift; |
e0f6836a |
91 | my %opts = ( |
92 | 'language' => 'French', |
93 | 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) } |
94 | ); |
95 | return lemmatize_treetagger( $tradition, %opts ); |
cca4f996 |
96 | } |
97 | |
6ad2ce78 |
98 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
cca4f996 |
99 | |
6ad2ce78 |
100 | Looks up one or more readings using the Flemm package, and returns the |
101 | possible results. This uses the same logic as L<lemmatize> above for the |
102 | entire tradition, but can also be used to (re-)analyze individual readings. |
cca4f996 |
103 | |
104 | =cut |
105 | |
6ad2ce78 |
106 | sub reading_lookup { |
e0f6836a |
107 | my( @path ) = @_; |
108 | my %opts = ( |
109 | 'language' => 'French', |
110 | 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }, |
111 | 'path' => \@path, |
112 | ); |
113 | return reading_lookup_treetagger( %opts ); |
d3e7842a |
114 | } |
115 | |
75ae2b25 |
116 | =head2 morphology_tags |
117 | |
118 | Return a data structure describing the available parts of speech and their attributes. |
119 | |
120 | =cut |
121 | |
122 | sub morphology_tags { |
123 | return lfs_morph_tags(); |
124 | } |
125 | |
d3e7842a |
126 | # Closure and utility function for the package lemmatizer |
127 | { |
128 | my $lemmatizer; |
129 | |
130 | sub _flemm_lookup { |
131 | # First try to load Flemm |
132 | unless( $lemmatizer ) { |
133 | try { |
134 | load 'Flemm'; |
135 | $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); |
136 | } catch { |
137 | warn "Cannot do French word lemmatization without Flemm: @_"; |
138 | return; |
139 | } |
140 | } |
141 | return $lemmatizer->lemmatize( @_ ) |
142 | } |
cca4f996 |
143 | |
144 | } |
145 | |
d3e7842a |
146 | # Utility function to turn a Flemm result into a WordForm |
147 | sub _parse_wordform { |
148 | my $flemmobj = shift; |
149 | # For now just parse the string, until we make sense of the documentation. |
150 | my @results = split( / \|\| /, $flemmobj->getResult ); |
151 | my @forms; |
152 | foreach ( @results ) { |
153 | my( $orig, $tag, $lemma ) = split( /\t/, $_ ); |
154 | my( $pos, $morph ) = split( /:/, $tag ); |
6ad2ce78 |
155 | my $morphobj; |
156 | if( $morph ) { |
f8862b58 |
157 | $morphobj = Lingua::TagSet::Multext->tag2structure( $morph ); |
6ad2ce78 |
158 | } else { |
159 | # Use the TreeTagger info if there is no Flemm morphology. |
f8862b58 |
160 | $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos ); |
6ad2ce78 |
161 | } |
162 | if( $morphobj ) { |
163 | my $wf = Text::Tradition::Collation::Reading::WordForm->new( |
164 | 'language' => 'French', |
165 | 'lemma' => $lemma, |
166 | 'morphology' => $morphobj, |
167 | ); |
168 | push( @forms, $wf ); |
169 | } else { |
170 | warn "No morphology found for word: $_"; |
171 | } |
d3e7842a |
172 | } |
173 | return @forms; |
174 | } |
175 | |
176 | 1; |
177 | |
6ad2ce78 |
178 | =head2 TODO |
179 | |
180 | =over |
181 | |
e0f6836a |
182 | =item * Try to do more things with Perl objects in Flemm and TT |
6ad2ce78 |
183 | |
184 | =back |
185 | |
cca4f996 |
186 | =head1 LICENSE |
187 | |
188 | This package is free software and is provided "as is" without express |
189 | or implied warranty. You can redistribute it and/or modify it under |
190 | the same terms as Perl itself. |
191 | |
192 | =head1 AUTHOR |
193 | |
194 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |