morphology/lib/Text/Tradition/Morphology.pm

   1 package Text::Tradition::Morphology;
   2
   3 use strict;
   4 use warnings;
   5 use JSON qw/ from_json /;
   6 use Moose::Role;
   7 use Module::Load;
   8 use Text::Tradition::Collation::Reading::Lexeme;
   9
  10 use vars qw/ $VERSION /;
  11 $VERSION = "0.1";
  12
  13 =head1 NAME
  14
  15 Text::Tradition::Morphology - morphology plugin for Text::Tradition
  16
  17 =head1 DESCRIPTION
  18
  19 The Text::Tradition::Morphology package enables lemma and part-of-speech
  20 information for traditions and their Reading objects. This distribution
  21 includes the L<Text::Tradition::Language> role for Traditions, the
  22 L<Text::Tradition::Morphology> role (this package) for Readings, and a set
  23 of Language::* modules for language-specific lemmatization.
  24
  25 See L<Text::Tradition::Collation::Reading::Lexeme> for more about the
  26 morphology object structure.
  27
  28 =cut
  29
  30 requires 'is_identical', 'is_combinable', '_combine';
  31
  32 has 'language' => (
  33         is => 'ro',
  34         isa => 'Str',
  35         predicate => 'has_language',
  36         );
  37
  38 has 'grammar_invalid' => (
  39         is => 'rw',
  40         isa => 'Bool',
  41         default => undef,
  42         );
  43
  44 has 'is_nonsense' => (
  45         is => 'rw',
  46         isa => 'Bool',
  47         default => undef,
  48         );
  49
  50 has 'normal_form' => (
  51         is => 'rw',
  52         isa => 'Str',
  53         predicate => '_has_normal_form',
  54         clearer => '_clear_normal_form',
  55         );
  56
  57 # Holds the lexemes for the reading.
  58 has 'reading_lexemes' => (
  59         traits => ['Array'],
  60         isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
  61         handles => {
  62                 lexeme => 'get',
  63                 lexemes => 'elements',
  64                 has_lexemes => 'count',
  65                 clear_lexemes => 'clear',
  66                 add_lexeme => 'push',
  67                 },
  68         default => sub { [] },
  69         );
  70
  71
  72
  73 # Make normal_form default to text, transparently.
  74 around 'normal_form' => sub {
  75         my $orig = shift;
  76         my $self = shift;
  77         my( $arg ) = @_;
  78         if( $arg && $arg eq $self->text ) {
  79                 $self->_clear_normal_form;
  80                 return $arg;
  81         } elsif( !$arg && !$self->_has_normal_form ) {
  82                 return $self->text;
  83         } else {
  84                 $self->$orig( @_ );
  85         }
  86 };
  87
  88 =head1 READING METHODS
  89
  90 Methods for the morphological information (if any) attached to readings.
  91 A reading may be made up of multiple lexemes; the concatenated lexeme
  92 strings ought to match the reading's normalized form.
  93
  94 See L<Text::Tradition::Collation::Reading::Lexeme> for more information
  95 on Lexeme objects and their attributes.
  96
  97 =head2 has_lexemes
  98
  99 Returns a true value if the reading has any attached lexemes.
 100
 101 =head2 lexemes
 102
 103 Returns the Lexeme objects (if any) attached to the reading.
 104
 105 =head2 clear_lexemes
 106
 107 Wipes any associated Lexeme objects out of the reading.
 108
 109 =head2 add_lexeme( $lexobj )
 110
 111 Adds the Lexeme in $lexobj to the list of lexemes.
 112
 113 =head2 lemmatize
 114
 115 If the language of the reading is set, this method will use the appropriate
 116 Language model to determine the lexemes that belong to this reading.  See
 117 L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.
 118
 119 =cut
 120
 121 sub lemmatize {
 122         my $self = shift;
 123         unless( $self->has_language ) {
 124                 warn "Please set a language to lemmatize a tradition";
 125                 return;
 126         }
 127         my $mod = "Text::Tradition::Language::" . $self->language;
 128         load( $mod );
 129         $mod->can( 'reading_lookup' )->( $self );
 130
 131 }
 132
 133 # For graph serialization. Return a JSON representation of the associated
 134 # reading lexemes.
 135 sub _serialize_lexemes {
 136         my $self = shift;
 137         my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
 138         return $json->encode( [ $self->lexemes ] );
 139 }
 140
 141 # Given a JSON representation of the lexemes, instantiate them and add
 142 # them to the reading.
 143 sub _deserialize_lexemes {
 144         my( $self, $json ) = @_;
 145         my $data = from_json( $json );
 146         return unless @$data;
 147
 148         my @lexemes;
 149         foreach my $lexhash ( @$data ) {
 150                 push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
 151                         'JSON' => $lexhash ) );
 152         }
 153         $self->clear_lexemes;
 154         $self->add_lexeme( @lexemes );
 155 }
 156
 157 sub disambiguated {
 158         my $self = shift;
 159         return 0 unless $self->has_lexemes;
 160         return !grep { !$_->is_disambiguated } $self->lexemes;
 161 }
 162
 163 sub filter_lexemes {
 164         my $self = shift;
 165         # While we are here, get rid of any extra wordforms from a disambiguated
 166         # reading.
 167         if( $self->disambiguated ) {
 168                 foreach my $lex ( $self->lexemes ) {
 169                         $lex->clear_matching_forms();
 170                         $lex->add_matching_form( $lex->form );
 171                 }
 172         }
 173 }
 174
 175 =head2 regularize
 176
 177 Call the 'regularize' function of the appropriate language model on our
 178 own reading text. This is a rules-based function distinct from 'normal_form',
 179 which can be set to any arbitrary string.
 180
 181 =cut
 182
 183 # TODO Test this stuff
 184
 185 sub regularize {
 186         my $self = shift;
 187         if( $self->has_language ) {
 188                 # If we do have a language, regularize the tokens in $answer.
 189                 my $mod = 'Text::Tradition::Language::' . $self->language;
 190                 eval { load( $mod ); };
 191                 # If a module doesn't exist for our language, use the base routine
 192                 if( $@ ) {
 193                         $mod = 'Text::Tradition::Language::Base';
 194                         load( $mod );
 195                 }
 196                 return $mod->can( 'regularize' )->( $self->text );
 197         } else {
 198                 return $self->text;
 199         }
 200 }
 201
 202 around 'is_identical' => sub {
 203         my $orig = shift;
 204         my $self = shift;
 205         my $other = shift;
 206         # If the base class returns true, do an extra check to make sure the
 207         # lexemes also match.
 208         my $answer = $self->$orig( $other );
 209         if( $answer ) {
 210                 if( $self->disambiguated && $other->disambiguated ) {
 211                         my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
 212                         my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
 213                         $answer = undef unless $rform eq $uform;
 214                 } elsif( $self->disambiguated xor $other->disambiguated ) {
 215                         $answer = undef;
 216                 }
 217         }
 218         return $answer;
 219 };
 220
 221 around 'is_combinable' => sub {
 222         my $orig = shift;
 223         my $self = shift;
 224         # If the reading is marked with invalid grammar or as a nonsense reading,
 225         # it is no longer combinable.
 226         return undef if $self->grammar_invalid || $self->is_nonsense;
 227         return $self->$orig();
 228 };
 229
 230 after '_combine' => sub {
 231         my $self = shift;
 232         my $other = shift;
 233         my $joinstr = shift;
 234         $self->normal_form(
 235                 join( $joinstr, $self->normal_form, $other->normal_form ) );
 236         # Combine the lexemes present in the readings
 237         if( $self->has_lexemes && $other->has_lexemes ) {
 238                 $self->add_lexeme( $other->lexemes );
 239         }
 240 };
 241
 242 1;
 243
 244 =head1 LICENSE
 245
 246 This package is free software and is provided "as is" without express
 247 or implied warranty.  You can redistribute it and/or modify it under
 248 the same terms as Perl itself.
 249
 250 =head1 AUTHOR
 251
 252 Tara L Andrews E<lt>aurum@cpan.orgE<gt>