Commit | Line | Data |
cca4f996 |
1 | package Text::Tradition::Language::French; |
2 | |
f4b6b4d0 |
3 | use strict; |
4 | use warnings; |
e0f6836a |
5 | use Module::Load qw/ load /; |
5271a011 |
6 | use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct /; |
d3e7842a |
7 | use TryCatch; |
8 | |
cca4f996 |
9 | =head1 NAME |
10 | |
6ad2ce78 |
11 | Text::Tradition::Language::French - language-specific module for French |
cca4f996 |
12 | |
13 | =head1 DESCRIPTION |
14 | |
6ad2ce78 |
15 | Implements morphology lookup for French words in context. This module |
16 | depends on the Flemm module for French lemmatization |
17 | (L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with |
18 | the TreeTagger software |
19 | (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is |
20 | (for now) expected to be installed in $MORPHDIR/TreeTagger. |
cca4f996 |
21 | |
22 | =head1 SUBROUTINES |
23 | |
24 | =head2 lemmatize( $text ) |
25 | |
26 | Evaluates the string using the Flemm package, and returns the results. |
27 | |
6ad2ce78 |
28 | =begin testing |
29 | |
30 | binmode STDOUT, ':utf8'; |
31 | use Text::Tradition; |
32 | use_ok( 'Text::Tradition::Language::French' ); |
33 | |
34 | eval "use Flemm"; |
35 | my $err = $@; |
36 | |
37 | SKIP: { |
38 | skip "Package Flemm not found" if $err; |
39 | my $tf = Text::Tradition->new( |
40 | 'input' => 'Self', |
41 | 'file' => 't/data/besoin.xml', |
42 | 'language' => 'French' ); |
43 | |
44 | is( $tf->language, 'French', "Set language okay" ); |
45 | $tf->lemmatize(); |
46 | # Test the lemmatization. How many readings now have morphological info? |
47 | # Do the lexemes match the reading? |
48 | my $ambig = 0; |
49 | foreach my $r ( $tf->collation->readings ) { |
50 | next if $r->is_meta; |
51 | ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); |
52 | my @lex = $r->lexemes; |
53 | my $lexstr = join( '', map { $_->string } @lex ); |
54 | my $textstr = $r->text; |
55 | $textstr =~ s/\s+//g; |
56 | is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); |
57 | foreach my $l ( @lex ) { |
58 | next if $l->is_disambiguated; |
59 | # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, |
60 | # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); |
61 | $ambig++; |
62 | } |
63 | } |
64 | is( $ambig, 102, "Found 102 ambiguous forms as expected" ); |
65 | |
66 | # Try setting the normal form of a reading and re-analyzing |
e0f6836a |
67 | my $mr = $tf->collation->reading('r99.2'); |
6ad2ce78 |
68 | is( $mr->text, 'minspire', "Picked correct test reading" ); |
69 | is( $mr->language, 'French', "Reading has correct language setting" ); |
70 | $mr->normal_form( "m'inspire" ); |
71 | $mr->lemmatize; |
72 | is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" ); |
73 | } |
74 | |
75 | =end testing |
76 | |
cca4f996 |
77 | =cut |
78 | |
79 | sub lemmatize { |
d3e7842a |
80 | my $tradition = shift; |
e0f6836a |
81 | my %opts = ( |
82 | 'language' => 'French', |
83 | 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) } |
84 | ); |
85 | return lemmatize_treetagger( $tradition, %opts ); |
cca4f996 |
86 | } |
87 | |
6ad2ce78 |
88 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
cca4f996 |
89 | |
6ad2ce78 |
90 | Looks up one or more readings using the Flemm package, and returns the |
91 | possible results. This uses the same logic as L<lemmatize> above for the |
92 | entire tradition, but can also be used to (re-)analyze individual readings. |
cca4f996 |
93 | |
94 | =cut |
95 | |
6ad2ce78 |
96 | sub reading_lookup { |
e0f6836a |
97 | my( @path ) = @_; |
98 | my %opts = ( |
99 | 'language' => 'French', |
100 | 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }, |
101 | 'path' => \@path, |
102 | ); |
103 | return reading_lookup_treetagger( %opts ); |
d3e7842a |
104 | } |
105 | |
106 | # Closure and utility function for the package lemmatizer |
107 | { |
108 | my $lemmatizer; |
109 | |
110 | sub _flemm_lookup { |
111 | # First try to load Flemm |
112 | unless( $lemmatizer ) { |
113 | try { |
114 | load 'Flemm'; |
115 | $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); |
116 | } catch { |
117 | warn "Cannot do French word lemmatization without Flemm: @_"; |
118 | return; |
119 | } |
120 | } |
121 | return $lemmatizer->lemmatize( @_ ) |
122 | } |
cca4f996 |
123 | |
124 | } |
125 | |
d3e7842a |
126 | # Utility function to turn a Flemm result into a WordForm |
127 | sub _parse_wordform { |
128 | my $flemmobj = shift; |
129 | # For now just parse the string, until we make sense of the documentation. |
130 | my @results = split( / \|\| /, $flemmobj->getResult ); |
131 | my @forms; |
132 | foreach ( @results ) { |
133 | my( $orig, $tag, $lemma ) = split( /\t/, $_ ); |
134 | my( $pos, $morph ) = split( /:/, $tag ); |
6ad2ce78 |
135 | my $morphobj; |
136 | if( $morph ) { |
5271a011 |
137 | $morphobj = multext_struct( $morph ); |
6ad2ce78 |
138 | } else { |
139 | # Use the TreeTagger info if there is no Flemm morphology. |
5271a011 |
140 | $morphobj = treetagger_struct( $pos ); |
6ad2ce78 |
141 | } |
142 | if( $morphobj ) { |
143 | my $wf = Text::Tradition::Collation::Reading::WordForm->new( |
144 | 'language' => 'French', |
145 | 'lemma' => $lemma, |
146 | 'morphology' => $morphobj, |
147 | ); |
148 | push( @forms, $wf ); |
149 | } else { |
150 | warn "No morphology found for word: $_"; |
151 | } |
d3e7842a |
152 | } |
153 | return @forms; |
154 | } |
155 | |
156 | 1; |
157 | |
6ad2ce78 |
158 | =head2 TODO |
159 | |
160 | =over |
161 | |
e0f6836a |
162 | =item * Try to do more things with Perl objects in Flemm and TT |
6ad2ce78 |
163 | |
164 | =back |
165 | |
cca4f996 |
166 | =head1 LICENSE |
167 | |
168 | This package is free software and is provided "as is" without express |
169 | or implied warranty. You can redistribute it and/or modify it under |
170 | the same terms as Perl itself. |
171 | |
172 | =head1 AUTHOR |
173 | |
174 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |