1 package Text::Tradition::Morphology;
5 use JSON qw/ from_json /;
8 use Text::Tradition::Collation::Reading::Lexeme;
10 use vars qw/ $VERSION /;
15 Text::Tradition::Morphology - morphology plugin for Text::Tradition
19 The Text::Tradition::Morphology package enables lemma and part-of-speech
20 information for traditions and their Reading objects. This distribution
21 includes the L<Text::Tradition::Language> role for Traditions, the
22 L<Text::Tradition::Morphology> role (this package) for Readings, and a set
23 of Language::* modules for language-specific lemmatization.
25 See L<Text::Tradition::Collation::Reading::Lexeme> for more about the
26 morphology object structure.
30 requires 'is_identical', 'is_combinable', '_combine';
35 predicate => 'has_language',
38 has 'grammar_invalid' => (
44 has 'is_nonsense' => (
50 has 'normal_form' => (
53 predicate => '_has_normal_form',
54 clearer => '_clear_normal_form',
57 # Holds the lexemes for the reading.
58 has 'reading_lexemes' => (
60 isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
63 lexemes => 'elements',
64 has_lexemes => 'count',
65 clear_lexemes => 'clear',
68 default => sub { [] },
73 # Make normal_form default to text, transparently.
74 around 'normal_form' => sub {
78 if( $arg && $arg eq $self->text ) {
79 $self->_clear_normal_form;
81 } elsif( !$arg && !$self->_has_normal_form ) {
88 =head1 READING METHODS
90 Methods for the morphological information (if any) attached to readings.
91 A reading may be made up of multiple lexemes; the concatenated lexeme
92 strings ought to match the reading's normalized form.
94 See L<Text::Tradition::Collation::Reading::Lexeme> for more information
95 on Lexeme objects and their attributes.
99 Returns a true value if the reading has any attached lexemes.
103 Returns the Lexeme objects (if any) attached to the reading.
107 Wipes any associated Lexeme objects out of the reading.
109 =head2 add_lexeme( $lexobj )
111 Adds the Lexeme in $lexobj to the list of lexemes.
115 If the language of the reading is set, this method will use the appropriate
116 Language model to determine the lexemes that belong to this reading. See
117 L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.
123 unless( $self->has_language ) {
124 warn "Please set a language to lemmatize a tradition";
127 my $mod = "Text::Tradition::Language::" . $self->language;
129 $mod->can( 'reading_lookup' )->( $self );
133 # For graph serialization. Return a JSON representation of the associated
135 sub _serialize_lexemes {
137 my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
138 return $json->encode( [ $self->lexemes ] );
141 # Given a JSON representation of the lexemes, instantiate them and add
142 # them to the reading.
143 sub _deserialize_lexemes {
144 my( $self, $json ) = @_;
145 my $data = from_json( $json );
146 return unless @$data;
149 foreach my $lexhash ( @$data ) {
150 push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
151 'JSON' => $lexhash ) );
153 $self->clear_lexemes;
154 $self->add_lexeme( @lexemes );
159 return 0 unless $self->has_lexemes;
160 return !grep { !$_->is_disambiguated } $self->lexemes;
165 # While we are here, get rid of any extra wordforms from a disambiguated
167 if( $self->disambiguated ) {
168 foreach my $lex ( $self->lexemes ) {
169 $lex->clear_matching_forms();
170 $lex->add_matching_form( $lex->form );
177 Call the 'regularize' function of the appropriate language model on our
178 own reading text. This is a rules-based function distinct from 'normal_form',
179 which can be set to any arbitrary string.
183 # TODO Test this stuff
187 if( $self->has_language ) {
188 # If we do have a language, regularize the tokens in $answer.
189 my $mod = 'Text::Tradition::Language::' . $self->language;
190 eval { load( $mod ); };
191 # If a module doesn't exist for our language, use the base routine
193 $mod = 'Text::Tradition::Language::Base';
196 return $mod->can( 'regularize' )->( $self->text );
202 around 'is_identical' => sub {
206 # If the base class returns true, do an extra check to make sure the
207 # lexemes also match.
208 my $answer = $self->$orig( $other );
210 if( $self->disambiguated && $other->disambiguated ) {
211 my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
212 my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
213 $answer = undef unless $rform eq $uform;
214 } elsif( $self->disambiguated xor $other->disambiguated ) {
221 around 'is_combinable' => sub {
224 # If the reading is marked with invalid grammar or as a nonsense reading,
225 # it is no longer combinable.
226 return undef if $self->grammar_invalid || $self->is_nonsense;
227 return $self->$orig();
230 after '_combine' => sub {
235 join( $joinstr, $self->normal_form, $other->normal_form ) );
236 # Combine the lexemes present in the readings
237 if( $self->has_lexemes && $other->has_lexemes ) {
238 $self->add_lexeme( $other->lexemes );
242 =head2 relationship_added
244 To be called when a relationship is set, to implement the consequences of
245 certain relationships.
249 # Test that normal form follows lemma setting. Draws on code both here and in
254 my $t = Text::Tradition->new(
256 file => 't/data/florilegium_graphml.xml' );
257 my $c = $t->collation;
259 # First try lemmatizing and then adding a relationship
260 my $r1 = $c->reading('w42');
261 my $r2 = $c->reading('w44');
262 $r1->normal_form('FOO');
263 $r2->normal_form('BAR');
265 $r1->make_lemma( 1 );
266 is( $r1->normal_form, 'FOO', "nothing changed yet" );
267 is( $r2->normal_form, 'BAR', "nothing changed yet" );
269 $c->add_relationship( $r1, $r2, { type => 'spelling' } );
270 is( $r2->normal_form, 'FOO', "Normal form followed lemma" );
272 # Now try setting relationships and then lemmatizing
273 my $r3 = $c->reading('w98');
274 my $r4 = $c->reading('w100');
275 my $r5 = $c->reading('w103');
276 $r3->normal_form('YAN');
277 $r4->normal_form('TAN');
278 $r5->normal_form('TETHERA');
280 $c->add_relationship( $r3, $r4, { type => 'orthographic', propagate => 1 } );
281 $c->add_relationship( $r3, $r5, { type => 'orthographic', propagate => 1 } );
282 is( $r3->normal_form, 'YAN', "nothing changed yet" );
283 is( $r4->normal_form, 'TAN', "nothing changed yet" );
284 is( $r5->normal_form, 'TETHERA', "nothing changed yet" );
286 $r3->make_lemma( 1 );
287 is( $r4->normal_form, 'YAN', "normal form propagated" );
288 is( $r5->normal_form, 'YAN', "normal form propagated" );
290 # Finally, try a relationship that shouldn't propagate the normal form
291 my $r6 = $c->reading('w91');
292 my $r7 = $c->reading('w92');
293 $r6->normal_form('BAZ');
294 $r7->normal_form('QUUX');
295 $r6->make_lemma( 1 );
297 $c->add_relationship( $r6, $r7, { type => 'grammatical' } );
298 is( $r7->normal_form, 'QUUX', "normal form on grammatical relationship unchanged" );
304 sub relationship_added {
305 my( $rdg1, $rdg2, $rel ) = @_;
306 my $lemma = $rdg1->is_lemma ? $rdg1 : ( $rdg2->is_lemma ? $rdg2 : undef );
307 if( $rel->type =~ /^(spelling|orthographic)$/ && $lemma ) {
308 my $other = $lemma->id eq $rdg1->id ? $rdg2 : $rdg1;
309 # Set the normal form on $other to match $lemma.
310 $other->normal_form( $lemma->normal_form );
318 This package is free software and is provided "as is" without express
319 or implied warranty. You can redistribute it and/or modify it under
320 the same terms as Perl itself.
324 Tara L Andrews E<lt>aurum@cpan.orgE<gt>