Commit | Line | Data |
a445ce40 |
1 | package Text::Tradition::Morphology; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use JSON qw/ from_json /; |
6 | use Moose::Role; |
7 | use Module::Load; |
8 | use Text::Tradition::Collation::Reading::Lexeme; |
9 | |
483478bf |
10 | use vars qw/ $VERSION /; |
11 | $VERSION = "0.1"; |
12 | |
a445ce40 |
13 | =head1 NAME |
14 | |
8943ff68 |
15 | Text::Tradition::Morphology - morphology plugin for Text::Tradition |
16 | |
17 | =head1 DESCRIPTION |
18 | |
19 | The Text::Tradition::Morphology package enables lemma and part-of-speech |
20 | information for traditions and their Reading objects. This distribution |
21 | includes the L<Text::Tradition::Language> role for Traditions, the |
22 | L<Text::Tradition::Morphology> role (this package) for Readings, and a set |
23 | of Language::* modules for language-specific lemmatization. |
24 | |
25 | See L<Text::Tradition::Collation::Reading::Lexeme> for more about the |
26 | morphology object structure. |
a445ce40 |
27 | |
28 | =cut |
29 | |
332750fc |
30 | requires 'is_identical', 'is_combinable', '_combine'; |
31 | |
58f9c2b9 |
32 | has 'language' => ( |
33 | is => 'ro', |
34 | isa => 'Str', |
35 | predicate => 'has_language', |
36 | ); |
37 | |
a445ce40 |
38 | has 'grammar_invalid' => ( |
39 | is => 'rw', |
40 | isa => 'Bool', |
41 | default => undef, |
42 | ); |
43 | |
44 | has 'is_nonsense' => ( |
45 | is => 'rw', |
46 | isa => 'Bool', |
47 | default => undef, |
48 | ); |
49 | |
50 | has 'normal_form' => ( |
51 | is => 'rw', |
52 | isa => 'Str', |
53 | predicate => '_has_normal_form', |
54 | clearer => '_clear_normal_form', |
55 | ); |
56 | |
57 | # Holds the lexemes for the reading. |
58 | has 'reading_lexemes' => ( |
59 | traits => ['Array'], |
60 | isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]', |
61 | handles => { |
62 | lexeme => 'get', |
63 | lexemes => 'elements', |
64 | has_lexemes => 'count', |
65 | clear_lexemes => 'clear', |
66 | add_lexeme => 'push', |
67 | }, |
68 | default => sub { [] }, |
69 | ); |
70 | |
71 | |
72 | |
73 | # Make normal_form default to text, transparently. |
74 | around 'normal_form' => sub { |
75 | my $orig = shift; |
76 | my $self = shift; |
77 | my( $arg ) = @_; |
78 | if( $arg && $arg eq $self->text ) { |
79 | $self->_clear_normal_form; |
80 | return $arg; |
81 | } elsif( !$arg && !$self->_has_normal_form ) { |
82 | return $self->text; |
83 | } else { |
84 | $self->$orig( @_ ); |
85 | } |
86 | }; |
87 | |
8943ff68 |
88 | =head1 READING METHODS |
a445ce40 |
89 | |
90 | Methods for the morphological information (if any) attached to readings. |
91 | A reading may be made up of multiple lexemes; the concatenated lexeme |
92 | strings ought to match the reading's normalized form. |
93 | |
94 | See L<Text::Tradition::Collation::Reading::Lexeme> for more information |
95 | on Lexeme objects and their attributes. |
96 | |
97 | =head2 has_lexemes |
98 | |
99 | Returns a true value if the reading has any attached lexemes. |
100 | |
101 | =head2 lexemes |
102 | |
103 | Returns the Lexeme objects (if any) attached to the reading. |
104 | |
105 | =head2 clear_lexemes |
106 | |
107 | Wipes any associated Lexeme objects out of the reading. |
108 | |
109 | =head2 add_lexeme( $lexobj ) |
110 | |
111 | Adds the Lexeme in $lexobj to the list of lexemes. |
112 | |
113 | =head2 lemmatize |
114 | |
115 | If the language of the reading is set, this method will use the appropriate |
116 | Language model to determine the lexemes that belong to this reading. See |
8943ff68 |
117 | L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition. |
a445ce40 |
118 | |
119 | =cut |
120 | |
121 | sub lemmatize { |
122 | my $self = shift; |
123 | unless( $self->has_language ) { |
124 | warn "Please set a language to lemmatize a tradition"; |
125 | return; |
126 | } |
127 | my $mod = "Text::Tradition::Language::" . $self->language; |
128 | load( $mod ); |
129 | $mod->can( 'reading_lookup' )->( $self ); |
130 | |
131 | } |
132 | |
133 | # For graph serialization. Return a JSON representation of the associated |
134 | # reading lexemes. |
135 | sub _serialize_lexemes { |
136 | my $self = shift; |
137 | my $json = JSON->new->allow_blessed(1)->convert_blessed(1); |
138 | return $json->encode( [ $self->lexemes ] ); |
139 | } |
140 | |
141 | # Given a JSON representation of the lexemes, instantiate them and add |
142 | # them to the reading. |
143 | sub _deserialize_lexemes { |
144 | my( $self, $json ) = @_; |
145 | my $data = from_json( $json ); |
146 | return unless @$data; |
147 | |
148 | my @lexemes; |
149 | foreach my $lexhash ( @$data ) { |
150 | push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new( |
151 | 'JSON' => $lexhash ) ); |
152 | } |
153 | $self->clear_lexemes; |
154 | $self->add_lexeme( @lexemes ); |
155 | } |
156 | |
157 | sub disambiguated { |
158 | my $self = shift; |
159 | return 0 unless $self->has_lexemes; |
160 | return !grep { !$_->is_disambiguated } $self->lexemes; |
161 | } |
162 | |
163 | sub filter_lexemes { |
164 | my $self = shift; |
165 | # While we are here, get rid of any extra wordforms from a disambiguated |
166 | # reading. |
167 | if( $self->disambiguated ) { |
168 | foreach my $lex ( $self->lexemes ) { |
169 | $lex->clear_matching_forms(); |
170 | $lex->add_matching_form( $lex->form ); |
171 | } |
172 | } |
173 | } |
174 | |
58f9c2b9 |
175 | =head2 regularize |
176 | |
177 | Call the 'regularize' function of the appropriate language model on our |
178 | own reading text. This is a rules-based function distinct from 'normal_form', |
179 | which can be set to any arbitrary string. |
180 | |
181 | =cut |
182 | |
183 | # TODO Test this stuff |
184 | |
185 | sub regularize { |
186 | my $self = shift; |
187 | if( $self->has_language ) { |
188 | # If we do have a language, regularize the tokens in $answer. |
189 | my $mod = 'Text::Tradition::Language::' . $self->language; |
58f9c2b9 |
190 | eval { load( $mod ); }; |
191 | # If a module doesn't exist for our language, use the base routine |
48cb9a90 |
192 | if( $@ ) { |
193 | $mod = 'Text::Tradition::Language::Base'; |
194 | load( $mod ); |
195 | } |
58f9c2b9 |
196 | return $mod->can( 'regularize' )->( $self->text ); |
197 | } else { |
198 | return $self->text; |
199 | } |
200 | } |
201 | |
a445ce40 |
202 | around 'is_identical' => sub { |
203 | my $orig = shift; |
204 | my $self = shift; |
205 | my $other = shift; |
206 | # If the base class returns true, do an extra check to make sure the |
207 | # lexemes also match. |
208 | my $answer = $self->$orig( $other ); |
209 | if( $answer ) { |
210 | if( $self->disambiguated && $other->disambiguated ) { |
211 | my $rform = join( '//', map { $_->form->to_string } $self->lexemes ); |
212 | my $uform = join( '//', map { $_->form->to_string } $other->lexemes ); |
213 | $answer = undef unless $rform eq $uform; |
214 | } elsif( $self->disambiguated xor $other->disambiguated ) { |
215 | $answer = undef; |
216 | } |
217 | } |
218 | return $answer; |
219 | }; |
220 | |
221 | around 'is_combinable' => sub { |
222 | my $orig = shift; |
223 | my $self = shift; |
224 | # If the reading is marked with invalid grammar or as a nonsense reading, |
225 | # it is no longer combinable. |
226 | return undef if $self->grammar_invalid || $self->is_nonsense; |
227 | return $self->$orig(); |
228 | }; |
229 | |
230 | after '_combine' => sub { |
231 | my $self = shift; |
232 | my $other = shift; |
233 | my $joinstr = shift; |
234 | $self->normal_form( |
235 | join( $joinstr, $self->normal_form, $other->normal_form ) ); |
236 | # Combine the lexemes present in the readings |
237 | if( $self->has_lexemes && $other->has_lexemes ) { |
238 | $self->add_lexeme( $other->lexemes ); |
239 | } |
240 | }; |
241 | |
45095bee |
242 | =head2 relationship_added |
243 | |
244 | To be called when a relationship is set, to implement the consequences of |
245 | certain relationships. |
246 | |
247 | =begin testing |
248 | |
249 | # Test that normal form follows lemma setting. Draws on code both here and in |
250 | # the base module. |
251 | |
252 | use Text::Tradition; |
253 | |
254 | my $t = Text::Tradition->new( |
255 | input => 'Self', |
256 | file => 't/data/florilegium_graphml.xml' ); |
257 | my $c = $t->collation; |
258 | |
259 | # First try lemmatizing and then adding a relationship |
260 | my $r1 = $c->reading('w42'); |
261 | my $r2 = $c->reading('w44'); |
262 | $r1->normal_form('FOO'); |
263 | $r2->normal_form('BAR'); |
264 | |
265 | $r1->make_lemma( 1 ); |
266 | is( $r1->normal_form, 'FOO', "nothing changed yet" ); |
267 | is( $r2->normal_form, 'BAR', "nothing changed yet" ); |
268 | |
269 | $c->add_relationship( $r1, $r2, { type => 'spelling' } ); |
270 | is( $r2->normal_form, 'FOO', "Normal form followed lemma" ); |
271 | |
272 | # Now try setting relationships and then lemmatizing |
273 | my $r3 = $c->reading('w98'); |
274 | my $r4 = $c->reading('w100'); |
275 | my $r5 = $c->reading('w103'); |
276 | $r3->normal_form('YAN'); |
277 | $r4->normal_form('TAN'); |
278 | $r5->normal_form('TETHERA'); |
279 | |
280 | $c->add_relationship( $r3, $r4, { type => 'orthographic', propagate => 1 } ); |
281 | $c->add_relationship( $r3, $r5, { type => 'orthographic', propagate => 1 } ); |
282 | is( $r3->normal_form, 'YAN', "nothing changed yet" ); |
283 | is( $r4->normal_form, 'TAN', "nothing changed yet" ); |
284 | is( $r5->normal_form, 'TETHERA', "nothing changed yet" ); |
285 | |
286 | $r3->make_lemma( 1 ); |
287 | is( $r4->normal_form, 'YAN', "normal form propagated" ); |
288 | is( $r5->normal_form, 'YAN', "normal form propagated" ); |
289 | |
290 | # Finally, try a relationship that shouldn't propagate the normal form |
291 | my $r6 = $c->reading('w91'); |
292 | my $r7 = $c->reading('w92'); |
293 | $r6->normal_form('BAZ'); |
294 | $r7->normal_form('QUUX'); |
295 | $r6->make_lemma( 1 ); |
296 | |
297 | $c->add_relationship( $r6, $r7, { type => 'grammatical' } ); |
298 | is( $r7->normal_form, 'QUUX', "normal form on grammatical relationship unchanged" ); |
299 | |
300 | =end testing |
301 | |
302 | =cut |
303 | |
304 | sub relationship_added { |
305 | my( $rdg1, $rdg2, $rel ) = @_; |
306 | my $lemma = $rdg1->is_lemma ? $rdg1 : ( $rdg2->is_lemma ? $rdg2 : undef ); |
307 | if( $rel->type =~ /^(spelling|orthographic)$/ && $lemma ) { |
308 | my $other = $lemma->id eq $rdg1->id ? $rdg2 : $rdg1; |
309 | # Set the normal form on $other to match $lemma. |
310 | $other->normal_form( $lemma->normal_form ); |
311 | } |
312 | } |
313 | |
e92d4229 |
314 | 1; |
315 | |
316 | =head1 LICENSE |
317 | |
318 | This package is free software and is provided "as is" without express |
319 | or implied warranty. You can redistribute it and/or modify it under |
320 | the same terms as Perl itself. |
321 | |
322 | =head1 AUTHOR |
323 | |
324 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |