morphology/lib/Text/Tradition/Morphology.pm

   1 package Text::Tradition::Morphology;
   2
   3 use strict;
   4 use warnings;
   5 use JSON qw/ from_json /;
   6 use Moose::Role;
   7 use Module::Load;
   8 use Text::Tradition::Collation::Reading::Lexeme;
   9
  10 use vars qw/ $VERSION /;
  11 $VERSION = "0.1";
  12
  13 =head1 NAME
  14
  15 Text::Tradition::Morphology - morphology plugin for Text::Tradition
  16
  17 =head1 DESCRIPTION
  18
  19 The Text::Tradition::Morphology package enables lemma and part-of-speech
  20 information for traditions and their Reading objects. This distribution
  21 includes the L<Text::Tradition::Language> role for Traditions, the
  22 L<Text::Tradition::Morphology> role (this package) for Readings, and a set
  23 of Language::* modules for language-specific lemmatization.
  24
  25 See L<Text::Tradition::Collation::Reading::Lexeme> for more about the
  26 morphology object structure.
  27
  28 =cut
  29
  30 requires 'is_identical', 'is_combinable', '_combine';
  31
  32 has 'language' => (
  33         is => 'ro',
  34         isa => 'Str',
  35         predicate => 'has_language',
  36         );
  37
  38 has 'grammar_invalid' => (
  39         is => 'rw',
  40         isa => 'Bool',
  41         default => undef,
  42         );
  43
  44 has 'is_nonsense' => (
  45         is => 'rw',
  46         isa => 'Bool',
  47         default => undef,
  48         );
  49
  50 has 'normal_form' => (
  51         is => 'rw',
  52         isa => 'Str',
  53         predicate => '_has_normal_form',
  54         clearer => '_clear_normal_form',
  55         );
  56
  57 # Holds the lexemes for the reading.
  58 has 'reading_lexemes' => (
  59         traits => ['Array'],
  60         isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
  61         handles => {
  62                 lexeme => 'get',
  63                 lexemes => 'elements',
  64                 has_lexemes => 'count',
  65                 clear_lexemes => 'clear',
  66                 add_lexeme => 'push',
  67                 },
  68         default => sub { [] },
  69         );
  70
  71
  72
  73 # Make normal_form default to text, transparently.
  74 around 'normal_form' => sub {
  75         my $orig = shift;
  76         my $self = shift;
  77         my( $arg ) = @_;
  78         if( $arg && $arg eq $self->text ) {
  79                 $self->_clear_normal_form;
  80                 return $arg;
  81         } elsif( !$arg && !$self->_has_normal_form ) {
  82                 return $self->text;
  83         } else {
  84                 $self->$orig( @_ );
  85         }
  86 };
  87
  88 =head1 READING METHODS
  89
  90 Methods for the morphological information (if any) attached to readings.
  91 A reading may be made up of multiple lexemes; the concatenated lexeme
  92 strings ought to match the reading's normalized form.
  93
  94 See L<Text::Tradition::Collation::Reading::Lexeme> for more information
  95 on Lexeme objects and their attributes.
  96
  97 =head2 has_lexemes
  98
  99 Returns a true value if the reading has any attached lexemes.
 100
 101 =head2 lexemes
 102
 103 Returns the Lexeme objects (if any) attached to the reading.
 104
 105 =head2 clear_lexemes
 106
 107 Wipes any associated Lexeme objects out of the reading.
 108
 109 =head2 add_lexeme( $lexobj )
 110
 111 Adds the Lexeme in $lexobj to the list of lexemes.
 112
 113 =head2 lemmatize
 114
 115 If the language of the reading is set, this method will use the appropriate
 116 Language model to determine the lexemes that belong to this reading.  See
 117 L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.
 118
 119 =cut
 120
 121 sub lemmatize {
 122         my $self = shift;
 123         unless( $self->has_language ) {
 124                 warn "Please set a language to lemmatize a tradition";
 125                 return;
 126         }
 127         my $mod = "Text::Tradition::Language::" . $self->language;
 128         load( $mod );
 129         $mod->can( 'reading_lookup' )->( $self );
 130
 131 }
 132
 133 # For graph serialization. Return a JSON representation of the associated
 134 # reading lexemes.
 135 sub _serialize_lexemes {
 136         my $self = shift;
 137         my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
 138         return $json->encode( [ $self->lexemes ] );
 139 }
 140
 141 # Given a JSON representation of the lexemes, instantiate them and add
 142 # them to the reading.
 143 sub _deserialize_lexemes {
 144         my( $self, $json ) = @_;
 145         my $data = from_json( $json );
 146         return unless @$data;
 147
 148         my @lexemes;
 149         foreach my $lexhash ( @$data ) {
 150                 push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
 151                         'JSON' => $lexhash ) );
 152         }
 153         $self->clear_lexemes;
 154         $self->add_lexeme( @lexemes );
 155 }
 156
 157 sub disambiguated {
 158         my $self = shift;
 159         return 0 unless $self->has_lexemes;
 160         return !grep { !$_->is_disambiguated } $self->lexemes;
 161 }
 162
 163 sub filter_lexemes {
 164         my $self = shift;
 165         # While we are here, get rid of any extra wordforms from a disambiguated
 166         # reading.
 167         if( $self->disambiguated ) {
 168                 foreach my $lex ( $self->lexemes ) {
 169                         $lex->clear_matching_forms();
 170                         $lex->add_matching_form( $lex->form );
 171                 }
 172         }
 173 }
 174
 175 =head2 regularize
 176
 177 Call the 'regularize' function of the appropriate language model on our
 178 own reading text. This is a rules-based function distinct from 'normal_form',
 179 which can be set to any arbitrary string.
 180
 181 =cut
 182
 183 # TODO Test this stuff
 184
 185 sub regularize {
 186         my $self = shift;
 187         if( $self->has_language ) {
 188                 # If we do have a language, regularize the tokens in $answer.
 189                 my $mod = 'Text::Tradition::Language::' . $self->language;
 190                 eval { load( $mod ); };
 191                 # If a module doesn't exist for our language, use the base routine
 192                 $mod = 'Text::Tradition::Language::Base' if $@;
 193                 return $mod->can( 'regularize' )->( $self->text );
 194         } else {
 195                 return $self->text;
 196         }
 197 }
 198
 199 around 'is_identical' => sub {
 200         my $orig = shift;
 201         my $self = shift;
 202         my $other = shift;
 203         # If the base class returns true, do an extra check to make sure the
 204         # lexemes also match.
 205         my $answer = $self->$orig( $other );
 206         if( $answer ) {
 207                 if( $self->disambiguated && $other->disambiguated ) {
 208                         my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
 209                         my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
 210                         $answer = undef unless $rform eq $uform;
 211                 } elsif( $self->disambiguated xor $other->disambiguated ) {
 212                         $answer = undef;
 213                 }
 214         }
 215         return $answer;
 216 };
 217
 218 around 'is_combinable' => sub {
 219         my $orig = shift;
 220         my $self = shift;
 221         # If the reading is marked with invalid grammar or as a nonsense reading,
 222         # it is no longer combinable.
 223         return undef if $self->grammar_invalid || $self->is_nonsense;
 224         return $self->$orig();
 225 };
 226
 227 after '_combine' => sub {
 228         my $self = shift;
 229         my $other = shift;
 230         my $joinstr = shift;
 231         $self->normal_form(
 232                 join( $joinstr, $self->normal_form, $other->normal_form ) );
 233         # Combine the lexemes present in the readings
 234         if( $self->has_lexemes && $other->has_lexemes ) {
 235                 $self->add_lexeme( $other->lexemes );
 236         }
 237 };
 238
 239 1;
 240
 241 =head1 LICENSE
 242
 243 This package is free software and is provided "as is" without express
 244 or implied warranty.  You can redistribute it and/or modify it under
 245 the same terms as Perl itself.
 246
 247 =head1 AUTHOR
 248
 249 Tara L Andrews E<lt>aurum@cpan.orgE<gt>