morphology/lib/Text/Tradition/Morphology.pm

   1 package Text::Tradition::Morphology;
   2
   3 use strict;
   4 use warnings;
   5 use JSON qw/ from_json /;
   6 use Moose::Role;
   7 use Module::Load;
   8 use Text::Tradition::Collation::Reading::Lexeme;
   9
  10 use vars qw/ $VERSION /;
  11 $VERSION = "0.1";
  12
  13 =head1 NAME
  14
  15 Text::Tradition::Morphology - morphology plugin for Text::Tradition
  16
  17 =head1 DESCRIPTION
  18
  19 The Text::Tradition::Morphology package enables lemma and part-of-speech
  20 information for traditions and their Reading objects. This distribution
  21 includes the L<Text::Tradition::Language> role for Traditions, the
  22 L<Text::Tradition::Morphology> role (this package) for Readings, and a set
  23 of Language::* modules for language-specific lemmatization.
  24
  25 See L<Text::Tradition::Collation::Reading::Lexeme> for more about the
  26 morphology object structure.
  27
  28 =cut
  29
  30 requires 'is_identical', 'is_combinable', '_combine';
  31
  32 has 'language' => (
  33         is => 'ro',
  34         isa => 'Str',
  35         predicate => 'has_language',
  36         );
  37
  38 has 'grammar_invalid' => (
  39         is => 'rw',
  40         isa => 'Bool',
  41         default => undef,
  42         );
  43
  44 has 'is_nonsense' => (
  45         is => 'rw',
  46         isa => 'Bool',
  47         default => undef,
  48         );
  49
  50 has 'normal_form' => (
  51         is => 'rw',
  52         isa => 'Str',
  53         predicate => '_has_normal_form',
  54         clearer => '_clear_normal_form',
  55         );
  56
  57 # Holds the lexemes for the reading.
  58 has 'reading_lexemes' => (
  59         traits => ['Array'],
  60         isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
  61         handles => {
  62                 lexeme => 'get',
  63                 lexemes => 'elements',
  64                 has_lexemes => 'count',
  65                 clear_lexemes => 'clear',
  66                 add_lexeme => 'push',
  67                 },
  68         default => sub { [] },
  69         );
  70
  71
  72
  73 # Make normal_form default to text, transparently.
  74 around 'normal_form' => sub {
  75         my $orig = shift;
  76         my $self = shift;
  77         my( $arg ) = @_;
  78         if( $arg && $arg eq $self->text ) {
  79                 $self->_clear_normal_form;
  80                 return $arg;
  81         } elsif( !$arg && !$self->_has_normal_form ) {
  82                 return $self->text;
  83         } else {
  84                 $self->$orig( @_ );
  85         }
  86 };
  87
  88 =head1 READING METHODS
  89
  90 Methods for the morphological information (if any) attached to readings.
  91 A reading may be made up of multiple lexemes; the concatenated lexeme
  92 strings ought to match the reading's normalized form.
  93
  94 See L<Text::Tradition::Collation::Reading::Lexeme> for more information
  95 on Lexeme objects and their attributes.
  96
  97 =head2 has_lexemes
  98
  99 Returns a true value if the reading has any attached lexemes.
 100
 101 =head2 lexemes
 102
 103 Returns the Lexeme objects (if any) attached to the reading.
 104
 105 =head2 clear_lexemes
 106
 107 Wipes any associated Lexeme objects out of the reading.
 108
 109 =head2 add_lexeme( $lexobj )
 110
 111 Adds the Lexeme in $lexobj to the list of lexemes.
 112
 113 =head2 lemmatize
 114
 115 If the language of the reading is set, this method will use the appropriate
 116 Language model to determine the lexemes that belong to this reading.  See
 117 L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.
 118
 119 =cut
 120
 121 sub lemmatize {
 122         my $self = shift;
 123         unless( $self->has_language ) {
 124                 warn "Please set a language to lemmatize a tradition";
 125                 return;
 126         }
 127         my $mod = "Text::Tradition::Language::" . $self->language;
 128         load( $mod );
 129         $mod->can( 'reading_lookup' )->( $self );
 130
 131 }
 132
 133 # For graph serialization. Return a JSON representation of the associated
 134 # reading lexemes.
 135 sub _serialize_lexemes {
 136         my $self = shift;
 137         my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
 138         return $json->encode( [ $self->lexemes ] );
 139 }
 140
 141 # Given a JSON representation of the lexemes, instantiate them and add
 142 # them to the reading.
 143 sub _deserialize_lexemes {
 144         my( $self, $json ) = @_;
 145         my $data = from_json( $json );
 146         return unless @$data;
 147
 148         my @lexemes;
 149         foreach my $lexhash ( @$data ) {
 150                 push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
 151                         'JSON' => $lexhash ) );
 152         }
 153         $self->clear_lexemes;
 154         $self->add_lexeme( @lexemes );
 155 }
 156
 157 sub disambiguated {
 158         my $self = shift;
 159         return 0 unless $self->has_lexemes;
 160         return !grep { !$_->is_disambiguated } $self->lexemes;
 161 }
 162
 163 sub filter_lexemes {
 164         my $self = shift;
 165         # While we are here, get rid of any extra wordforms from a disambiguated
 166         # reading.
 167         if( $self->disambiguated ) {
 168                 foreach my $lex ( $self->lexemes ) {
 169                         $lex->clear_matching_forms();
 170                         $lex->add_matching_form( $lex->form );
 171                 }
 172         }
 173 }
 174
 175 =head2 regularize
 176
 177 Call the 'regularize' function of the appropriate language model on our
 178 own reading text. This is a rules-based function distinct from 'normal_form',
 179 which can be set to any arbitrary string.
 180
 181 =cut
 182
 183 # TODO Test this stuff
 184
 185 sub regularize {
 186         my $self = shift;
 187         if( $self->has_language ) {
 188                 # If we do have a language, regularize the tokens in $answer.
 189                 my $mod = 'Text::Tradition::Language::' . $self->language;
 190                 eval { load( $mod ); };
 191                 # If a module doesn't exist for our language, use the base routine
 192                 if( $@ ) {
 193                         $mod = 'Text::Tradition::Language::Base';
 194                         load( $mod );
 195                 }
 196                 return $mod->can( 'regularize' )->( $self->text );
 197         } else {
 198                 return $self->text;
 199         }
 200 }
 201
 202 around 'is_identical' => sub {
 203         my $orig = shift;
 204         my $self = shift;
 205         my $other = shift;
 206         # If the base class returns true, do an extra check to make sure the
 207         # lexemes also match.
 208         my $answer = $self->$orig( $other );
 209         if( $answer ) {
 210                 if( $self->disambiguated && $other->disambiguated ) {
 211                         my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
 212                         my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
 213                         $answer = undef unless $rform eq $uform;
 214                 } elsif( $self->disambiguated xor $other->disambiguated ) {
 215                         $answer = undef;
 216                 }
 217         }
 218         return $answer;
 219 };
 220
 221 around 'is_combinable' => sub {
 222         my $orig = shift;
 223         my $self = shift;
 224         # If the reading is marked with invalid grammar or as a nonsense reading,
 225         # it is no longer combinable.
 226         return undef if $self->grammar_invalid || $self->is_nonsense;
 227         return $self->$orig();
 228 };
 229
 230 after '_combine' => sub {
 231         my $self = shift;
 232         my $other = shift;
 233         my $joinstr = shift;
 234         $self->normal_form(
 235                 join( $joinstr, $self->normal_form, $other->normal_form ) );
 236         # Combine the lexemes present in the readings
 237         if( $self->has_lexemes && $other->has_lexemes ) {
 238                 $self->add_lexeme( $other->lexemes );
 239         }
 240 };
 241
 242 =head2 relationship_added
 243
 244 To be called when a relationship is set, to implement the consequences of
 245 certain relationships.
 246
 247 =begin testing
 248
 249 # Test that normal form follows lemma setting. Draws on code both here and in
 250 # the base module.
 251
 252 use Text::Tradition;
 253
 254 my $t = Text::Tradition->new(
 255         input => 'Self',
 256         file => 't/data/florilegium_graphml.xml' );
 257 my $c = $t->collation;
 258
 259 # First try lemmatizing and then adding a relationship
 260 my $r1 = $c->reading('w42');
 261 my $r2 = $c->reading('w44');
 262 $r1->normal_form('FOO');
 263 $r2->normal_form('BAR');
 264
 265 $r1->make_lemma( 1 );
 266 is( $r1->normal_form, 'FOO', "nothing changed yet" );
 267 is( $r2->normal_form, 'BAR', "nothing changed yet" );
 268
 269 $c->add_relationship( $r1, $r2, { type => 'spelling' } );
 270 is( $r2->normal_form, 'FOO', "Normal form followed lemma" );
 271
 272 # Now try setting relationships and then lemmatizing
 273 my $r3 = $c->reading('w98');
 274 my $r4 = $c->reading('w100');
 275 my $r5 = $c->reading('w103');
 276 $r3->normal_form('YAN');
 277 $r4->normal_form('TAN');
 278 $r5->normal_form('TETHERA');
 279
 280 $c->add_relationship( $r3, $r4, { type => 'orthographic', propagate => 1 } );
 281 $c->add_relationship( $r3, $r5, { type => 'orthographic', propagate => 1 } );
 282 is( $r3->normal_form, 'YAN', "nothing changed yet" );
 283 is( $r4->normal_form, 'TAN', "nothing changed yet" );
 284 is( $r5->normal_form, 'TETHERA', "nothing changed yet" );
 285
 286 $r3->make_lemma( 1 );
 287 is( $r4->normal_form, 'YAN', "normal form propagated" );
 288 is( $r5->normal_form, 'YAN', "normal form propagated" );
 289
 290 # Finally, try a relationship that shouldn't propagate the normal form
 291 my $r6 = $c->reading('w91');
 292 my $r7 = $c->reading('w92');
 293 $r6->normal_form('BAZ');
 294 $r7->normal_form('QUUX');
 295 $r6->make_lemma( 1 );
 296
 297 $c->add_relationship( $r6, $r7, { type => 'grammatical' } );
 298 is( $r7->normal_form, 'QUUX', "normal form on grammatical relationship unchanged" );
 299
 300 =end testing
 301
 302 =cut
 303
 304 sub relationship_added {
 305         my( $rdg1, $rdg2, $rel ) = @_;
 306         my $lemma = $rdg1->is_lemma ? $rdg1 : ( $rdg2->is_lemma ? $rdg2 : undef );
 307         if( $rel->type =~ /^(spelling|orthographic)$/ && $lemma ) {
 308                 my $other = $lemma->id eq $rdg1->id ? $rdg2 : $rdg1;
 309                 # Set the normal form on $other to match $lemma.
 310                 $other->normal_form( $lemma->normal_form );
 311         }
 312 }
 313
 314 1;
 315
 316 =head1 LICENSE
 317
 318 This package is free software and is provided "as is" without express
 319 or implied warranty.  You can redistribute it and/or modify it under
 320 the same terms as Perl itself.
 321
 322 =head1 AUTHOR
 323
 324 Tara L Andrews E<lt>aurum@cpan.orgE<gt>