Add logic for normalized form of lemma to propagate on orth/spelling links
[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Morphology.pm
CommitLineData
a445ce40 1package Text::Tradition::Morphology;
2
3use strict;
4use warnings;
5use JSON qw/ from_json /;
6use Moose::Role;
7use Module::Load;
8use Text::Tradition::Collation::Reading::Lexeme;
9
483478bf 10use vars qw/ $VERSION /;
11$VERSION = "0.1";
12
a445ce40 13=head1 NAME
14
8943ff68 15Text::Tradition::Morphology - morphology plugin for Text::Tradition
16
17=head1 DESCRIPTION
18
19The Text::Tradition::Morphology package enables lemma and part-of-speech
20information for traditions and their Reading objects. This distribution
21includes the L<Text::Tradition::Language> role for Traditions, the
22L<Text::Tradition::Morphology> role (this package) for Readings, and a set
23of Language::* modules for language-specific lemmatization.
24
25See L<Text::Tradition::Collation::Reading::Lexeme> for more about the
26morphology object structure.
a445ce40 27
28=cut
29
332750fc 30requires 'is_identical', 'is_combinable', '_combine';
31
58f9c2b9 32has 'language' => (
33 is => 'ro',
34 isa => 'Str',
35 predicate => 'has_language',
36 );
37
a445ce40 38has 'grammar_invalid' => (
39 is => 'rw',
40 isa => 'Bool',
41 default => undef,
42 );
43
44has 'is_nonsense' => (
45 is => 'rw',
46 isa => 'Bool',
47 default => undef,
48 );
49
50has 'normal_form' => (
51 is => 'rw',
52 isa => 'Str',
53 predicate => '_has_normal_form',
54 clearer => '_clear_normal_form',
55 );
56
57# Holds the lexemes for the reading.
58has 'reading_lexemes' => (
59 traits => ['Array'],
60 isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
61 handles => {
62 lexeme => 'get',
63 lexemes => 'elements',
64 has_lexemes => 'count',
65 clear_lexemes => 'clear',
66 add_lexeme => 'push',
67 },
68 default => sub { [] },
69 );
70
71
72
73# Make normal_form default to text, transparently.
74around 'normal_form' => sub {
75 my $orig = shift;
76 my $self = shift;
77 my( $arg ) = @_;
78 if( $arg && $arg eq $self->text ) {
79 $self->_clear_normal_form;
80 return $arg;
81 } elsif( !$arg && !$self->_has_normal_form ) {
82 return $self->text;
83 } else {
84 $self->$orig( @_ );
85 }
86};
87
8943ff68 88=head1 READING METHODS
a445ce40 89
90Methods for the morphological information (if any) attached to readings.
91A reading may be made up of multiple lexemes; the concatenated lexeme
92strings ought to match the reading's normalized form.
93
94See L<Text::Tradition::Collation::Reading::Lexeme> for more information
95on Lexeme objects and their attributes.
96
97=head2 has_lexemes
98
99Returns a true value if the reading has any attached lexemes.
100
101=head2 lexemes
102
103Returns the Lexeme objects (if any) attached to the reading.
104
105=head2 clear_lexemes
106
107Wipes any associated Lexeme objects out of the reading.
108
109=head2 add_lexeme( $lexobj )
110
111Adds the Lexeme in $lexobj to the list of lexemes.
112
113=head2 lemmatize
114
115If the language of the reading is set, this method will use the appropriate
116Language model to determine the lexemes that belong to this reading. See
8943ff68 117L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.
a445ce40 118
119=cut
120
121sub lemmatize {
122 my $self = shift;
123 unless( $self->has_language ) {
124 warn "Please set a language to lemmatize a tradition";
125 return;
126 }
127 my $mod = "Text::Tradition::Language::" . $self->language;
128 load( $mod );
129 $mod->can( 'reading_lookup' )->( $self );
130
131}
132
133# For graph serialization. Return a JSON representation of the associated
134# reading lexemes.
135sub _serialize_lexemes {
136 my $self = shift;
137 my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
138 return $json->encode( [ $self->lexemes ] );
139}
140
141# Given a JSON representation of the lexemes, instantiate them and add
142# them to the reading.
143sub _deserialize_lexemes {
144 my( $self, $json ) = @_;
145 my $data = from_json( $json );
146 return unless @$data;
147
148 my @lexemes;
149 foreach my $lexhash ( @$data ) {
150 push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
151 'JSON' => $lexhash ) );
152 }
153 $self->clear_lexemes;
154 $self->add_lexeme( @lexemes );
155}
156
157sub disambiguated {
158 my $self = shift;
159 return 0 unless $self->has_lexemes;
160 return !grep { !$_->is_disambiguated } $self->lexemes;
161}
162
163sub filter_lexemes {
164 my $self = shift;
165 # While we are here, get rid of any extra wordforms from a disambiguated
166 # reading.
167 if( $self->disambiguated ) {
168 foreach my $lex ( $self->lexemes ) {
169 $lex->clear_matching_forms();
170 $lex->add_matching_form( $lex->form );
171 }
172 }
173}
174
58f9c2b9 175=head2 regularize
176
177Call the 'regularize' function of the appropriate language model on our
178own reading text. This is a rules-based function distinct from 'normal_form',
179which can be set to any arbitrary string.
180
181=cut
182
183# TODO Test this stuff
184
185sub regularize {
186 my $self = shift;
187 if( $self->has_language ) {
188 # If we do have a language, regularize the tokens in $answer.
189 my $mod = 'Text::Tradition::Language::' . $self->language;
58f9c2b9 190 eval { load( $mod ); };
191 # If a module doesn't exist for our language, use the base routine
48cb9a90 192 if( $@ ) {
193 $mod = 'Text::Tradition::Language::Base';
194 load( $mod );
195 }
58f9c2b9 196 return $mod->can( 'regularize' )->( $self->text );
197 } else {
198 return $self->text;
199 }
200}
201
a445ce40 202around 'is_identical' => sub {
203 my $orig = shift;
204 my $self = shift;
205 my $other = shift;
206 # If the base class returns true, do an extra check to make sure the
207 # lexemes also match.
208 my $answer = $self->$orig( $other );
209 if( $answer ) {
210 if( $self->disambiguated && $other->disambiguated ) {
211 my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
212 my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
213 $answer = undef unless $rform eq $uform;
214 } elsif( $self->disambiguated xor $other->disambiguated ) {
215 $answer = undef;
216 }
217 }
218 return $answer;
219};
220
221around 'is_combinable' => sub {
222 my $orig = shift;
223 my $self = shift;
224 # If the reading is marked with invalid grammar or as a nonsense reading,
225 # it is no longer combinable.
226 return undef if $self->grammar_invalid || $self->is_nonsense;
227 return $self->$orig();
228};
229
230after '_combine' => sub {
231 my $self = shift;
232 my $other = shift;
233 my $joinstr = shift;
234 $self->normal_form(
235 join( $joinstr, $self->normal_form, $other->normal_form ) );
236 # Combine the lexemes present in the readings
237 if( $self->has_lexemes && $other->has_lexemes ) {
238 $self->add_lexeme( $other->lexemes );
239 }
240};
241
45095bee 242=head2 relationship_added
243
244To be called when a relationship is set, to implement the consequences of
245certain relationships.
246
247=begin testing
248
249# Test that normal form follows lemma setting. Draws on code both here and in
250# the base module.
251
252use Text::Tradition;
253
254my $t = Text::Tradition->new(
255 input => 'Self',
256 file => 't/data/florilegium_graphml.xml' );
257my $c = $t->collation;
258
259# First try lemmatizing and then adding a relationship
260my $r1 = $c->reading('w42');
261my $r2 = $c->reading('w44');
262$r1->normal_form('FOO');
263$r2->normal_form('BAR');
264
265$r1->make_lemma( 1 );
266is( $r1->normal_form, 'FOO', "nothing changed yet" );
267is( $r2->normal_form, 'BAR', "nothing changed yet" );
268
269$c->add_relationship( $r1, $r2, { type => 'spelling' } );
270is( $r2->normal_form, 'FOO', "Normal form followed lemma" );
271
272# Now try setting relationships and then lemmatizing
273my $r3 = $c->reading('w98');
274my $r4 = $c->reading('w100');
275my $r5 = $c->reading('w103');
276$r3->normal_form('YAN');
277$r4->normal_form('TAN');
278$r5->normal_form('TETHERA');
279
280$c->add_relationship( $r3, $r4, { type => 'orthographic', propagate => 1 } );
281$c->add_relationship( $r3, $r5, { type => 'orthographic', propagate => 1 } );
282is( $r3->normal_form, 'YAN', "nothing changed yet" );
283is( $r4->normal_form, 'TAN', "nothing changed yet" );
284is( $r5->normal_form, 'TETHERA', "nothing changed yet" );
285
286$r3->make_lemma( 1 );
287is( $r4->normal_form, 'YAN', "normal form propagated" );
288is( $r5->normal_form, 'YAN', "normal form propagated" );
289
290# Finally, try a relationship that shouldn't propagate the normal form
291my $r6 = $c->reading('w91');
292my $r7 = $c->reading('w92');
293$r6->normal_form('BAZ');
294$r7->normal_form('QUUX');
295$r6->make_lemma( 1 );
296
297$c->add_relationship( $r6, $r7, { type => 'grammatical' } );
298is( $r7->normal_form, 'QUUX', "normal form on grammatical relationship unchanged" );
299
300=end testing
301
302=cut
303
304sub relationship_added {
305 my( $rdg1, $rdg2, $rel ) = @_;
306 my $lemma = $rdg1->is_lemma ? $rdg1 : ( $rdg2->is_lemma ? $rdg2 : undef );
307 if( $rel->type =~ /^(spelling|orthographic)$/ && $lemma ) {
308 my $other = $lemma->id eq $rdg1->id ? $rdg2 : $rdg1;
309 # Set the normal form on $other to match $lemma.
310 $other->normal_form( $lemma->normal_form );
311 }
312}
313
e92d4229 3141;
315
316=head1 LICENSE
317
318This package is free software and is provided "as is" without express
319or implied warranty. You can redistribute it and/or modify it under
320the same terms as Perl itself.
321
322=head1 AUTHOR
323
324Tara L Andrews E<lt>aurum@cpan.orgE<gt>