[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Morphology.pm

package Text::Tradition::Morphology;

use strict;
use warnings;
use JSON qw/ from_json /;
use Moose::Role;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;

use vars qw/ $VERSION /;
$VERSION = "0.1";

=head1 NAME

Text::Tradition::Morphology - morphology plugin for Text::Tradition

=head1 DESCRIPTION

The Text::Tradition::Morphology package enables lemma and part-of-speech
information for traditions and their Reading objects. This distribution
includes the L<Text::Tradition::Language> role for Traditions, the
L<Text::Tradition::Morphology> role (this package) for Readings, and a set
of Language::* modules for language-specific lemmatization.

See L<Text::Tradition::Collation::Reading::Lexeme> for more about the 
morphology object structure.

=cut

requires 'is_identical', 'is_combinable', '_combine';

has 'language' => (
	is => 'ro',
	isa => 'Str',
	predicate => 'has_language',
	);
	
has 'grammar_invalid' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);
	
has 'is_nonsense' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);

has 'normal_form' => (
	is => 'rw',
	isa => 'Str',
	predicate => '_has_normal_form',
	clearer => '_clear_normal_form',
	);

# Holds the lexemes for the reading.
has 'reading_lexemes' => (
	traits => ['Array'],
	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
	handles => {
		lexeme => 'get',
		lexemes => 'elements',
		has_lexemes => 'count',
		clear_lexemes => 'clear',
		add_lexeme => 'push',
		},
	default => sub { [] },
	);
	

# Make normal_form default to text, transparently.
around 'normal_form' => sub {
	my $orig = shift;
	my $self = shift;
	my( $arg ) = @_;
	if( $arg && $arg eq $self->text ) {
		$self->_clear_normal_form;
		return $arg;
	} elsif( !$arg && !$self->_has_normal_form ) {
		return $self->text;
	} else {
		$self->$orig( @_ );
	}
};

=head1 READING METHODS

Methods for the morphological information (if any) attached to readings.
A reading may be made up of multiple lexemes; the concatenated lexeme
strings ought to match the reading's normalized form.
 
See L<Text::Tradition::Collation::Reading::Lexeme> for more information
on Lexeme objects and their attributes.

=head2 has_lexemes

Returns a true value if the reading has any attached lexemes.

=head2 lexemes

Returns the Lexeme objects (if any) attached to the reading.

=head2 clear_lexemes

Wipes any associated Lexeme objects out of the reading.

=head2 add_lexeme( $lexobj )

Adds the Lexeme in $lexobj to the list of lexemes.

=head2 lemmatize

If the language of the reading is set, this method will use the appropriate
Language model to determine the lexemes that belong to this reading.  See
L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.

=cut

sub lemmatize {
	my $self = shift;
	unless( $self->has_language ) {
		warn "Please set a language to lemmatize a tradition";
		return;
	}
	my $mod = "Text::Tradition::Language::" . $self->language;
	load( $mod );
	$mod->can( 'reading_lookup' )->( $self );

}

# For graph serialization. Return a JSON representation of the associated
# reading lexemes.
sub _serialize_lexemes {
	my $self = shift;
	my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
	return $json->encode( [ $self->lexemes ] );
}

# Given a JSON representation of the lexemes, instantiate them and add
# them to the reading.
sub _deserialize_lexemes {
	my( $self, $json ) = @_;
	my $data = from_json( $json );
	return unless @$data;
	
	my @lexemes;
	foreach my $lexhash ( @$data ) {
		push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
			'JSON' => $lexhash ) );
	}
	$self->clear_lexemes;
	$self->add_lexeme( @lexemes );
}

sub disambiguated {
	my $self = shift;
	return 0 unless $self->has_lexemes;
	return !grep { !$_->is_disambiguated } $self->lexemes;
}

sub filter_lexemes {
	my $self = shift;
	# While we are here, get rid of any extra wordforms from a disambiguated
	# reading.
	if( $self->disambiguated ) {
		foreach my $lex ( $self->lexemes ) {
			$lex->clear_matching_forms();
			$lex->add_matching_form( $lex->form );
		}
	}
}

=head2 regularize

Call the 'regularize' function of the appropriate language model on our
own reading text. This is a rules-based function distinct from 'normal_form',
which can be set to any arbitrary string.

=cut

# TODO Test this stuff

sub regularize {
	my $self = shift;
	if( $self->has_language ) {
		# If we do have a language, regularize the tokens in $answer.
		my $mod = 'Text::Tradition::Language::' . $self->language;
		eval { load( $mod ); };
		# If a module doesn't exist for our language, use the base routine
		$mod = 'Text::Tradition::Language::Base' if $@;
		return $mod->can( 'regularize' )->( $self->text );
	} else {
		return $self->text;
	}
}

around 'is_identical' => sub {
	my $orig = shift;
	my $self = shift;
	my $other = shift;
	# If the base class returns true, do an extra check to make sure the
	# lexemes also match.
	my $answer = $self->$orig( $other );
	if( $answer ) {
		if( $self->disambiguated && $other->disambiguated ) {
			my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
			my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
			$answer = undef unless $rform eq $uform;
		} elsif( $self->disambiguated xor $other->disambiguated ) {
			$answer = undef;
		}
	}
	return $answer;
};

around 'is_combinable' => sub {
	my $orig = shift;
	my $self = shift;
	# If the reading is marked with invalid grammar or as a nonsense reading,
	# it is no longer combinable.
	return undef if $self->grammar_invalid || $self->is_nonsense;
	return $self->$orig();
};

after '_combine' => sub {
	my $self = shift;
	my $other = shift;
	my $joinstr = shift;
	$self->normal_form( 
		join( $joinstr, $self->normal_form, $other->normal_form ) );
	# Combine the lexemes present in the readings
	if( $self->has_lexemes && $other->has_lexemes ) {
		$self->add_lexeme( $other->lexemes );
	}
};

1;

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
a445ce40	1	package Text::Tradition::Morphology;
	2
	3	use strict;
	4	use warnings;
	5	use JSON qw/ from_json /;
	6	use Moose::Role;
	7	use Module::Load;
	8	use Text::Tradition::Collation::Reading::Lexeme;
	9
483478bf	10	use vars qw/ $VERSION /;
	11	$VERSION = "0.1";
	12
a445ce40	13	=head1 NAME
a445ce40	14
8943ff68	15	Text::Tradition::Morphology - morphology plugin for Text::Tradition
	16
	17	=head1 DESCRIPTION
	18
	19	The Text::Tradition::Morphology package enables lemma and part-of-speech
	20	information for traditions and their Reading objects. This distribution
	21	includes the L<Text::Tradition::Language> role for Traditions, the
	22	L<Text::Tradition::Morphology> role (this package) for Readings, and a set
	23	of Language::* modules for language-specific lemmatization.
	24
	25	See L<Text::Tradition::Collation::Reading::Lexeme> for more about the
	26	morphology object structure.
a445ce40	27
	28	=cut
	29
332750fc	30	requires 'is_identical', 'is_combinable', '_combine';
332750fc	31
58f9c2b9	32	has 'language' => (
	33	is => 'ro',
	34	isa => 'Str',
	35	predicate => 'has_language',
	36	);
	37
a445ce40	38	has 'grammar_invalid' => (
	39	is => 'rw',
	40	isa => 'Bool',
	41	default => undef,
	42	);
	43
	44	has 'is_nonsense' => (
	45	is => 'rw',
	46	isa => 'Bool',
	47	default => undef,
	48	);
	49
	50	has 'normal_form' => (
	51	is => 'rw',
	52	isa => 'Str',
	53	predicate => '_has_normal_form',
	54	clearer => '_clear_normal_form',
	55	);
	56
	57	# Holds the lexemes for the reading.
	58	has 'reading_lexemes' => (
	59	traits => ['Array'],
	60	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
	61	handles => {
	62	lexeme => 'get',
	63	lexemes => 'elements',
	64	has_lexemes => 'count',
	65	clear_lexemes => 'clear',
	66	add_lexeme => 'push',
	67	},
	68	default => sub { [] },
	69	);
	70
	71
	72
	73	# Make normal_form default to text, transparently.
	74	around 'normal_form' => sub {
	75	my $orig = shift;
	76	my $self = shift;
	77	my( $arg ) = @_;
	78	if( $arg && $arg eq $self->text ) {
	79	$self->_clear_normal_form;
	80	return $arg;
	81	} elsif( !$arg && !$self->_has_normal_form ) {
	82	return $self->text;
	83	} else {
	84	$self->$orig( @_ );
	85	}
	86	};
	87
8943ff68	88	=head1 READING METHODS
a445ce40	89
	90	Methods for the morphological information (if any) attached to readings.
	91	A reading may be made up of multiple lexemes; the concatenated lexeme
	92	strings ought to match the reading's normalized form.
	93
	94	See L<Text::Tradition::Collation::Reading::Lexeme> for more information
	95	on Lexeme objects and their attributes.
	96
	97	=head2 has_lexemes
	98
	99	Returns a true value if the reading has any attached lexemes.
	100
	101	=head2 lexemes
	102
	103	Returns the Lexeme objects (if any) attached to the reading.
	104
	105	=head2 clear_lexemes
	106
	107	Wipes any associated Lexeme objects out of the reading.
	108
	109	=head2 add_lexeme( $lexobj )
	110
	111	Adds the Lexeme in $lexobj to the list of lexemes.
	112
	113	=head2 lemmatize
	114
	115	If the language of the reading is set, this method will use the appropriate
	116	Language model to determine the lexemes that belong to this reading. See
8943ff68	117	L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.
a445ce40	118
	119	=cut
	120
	121	sub lemmatize {
	122	my $self = shift;
	123	unless( $self->has_language ) {
	124	warn "Please set a language to lemmatize a tradition";
	125	return;
	126	}
	127	my $mod = "Text::Tradition::Language::" . $self->language;
	128	load( $mod );
	129	$mod->can( 'reading_lookup' )->( $self );
	130
	131	}
	132
	133	# For graph serialization. Return a JSON representation of the associated
	134	# reading lexemes.
	135	sub _serialize_lexemes {
	136	my $self = shift;
	137	my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
	138	return $json->encode( [ $self->lexemes ] );
	139	}
	140
	141	# Given a JSON representation of the lexemes, instantiate them and add
	142	# them to the reading.
	143	sub _deserialize_lexemes {
	144	my( $self, $json ) = @_;
	145	my $data = from_json( $json );
	146	return unless @$data;
	147
	148	my @lexemes;
	149	foreach my $lexhash ( @$data ) {
	150	push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
	151	'JSON' => $lexhash ) );
	152	}
	153	$self->clear_lexemes;
	154	$self->add_lexeme( @lexemes );
	155	}
	156
	157	sub disambiguated {
	158	my $self = shift;
	159	return 0 unless $self->has_lexemes;
	160	return !grep { !$_->is_disambiguated } $self->lexemes;
	161	}
	162
	163	sub filter_lexemes {
	164	my $self = shift;
	165	# While we are here, get rid of any extra wordforms from a disambiguated
	166	# reading.
	167	if( $self->disambiguated ) {
	168	foreach my $lex ( $self->lexemes ) {
	169	$lex->clear_matching_forms();
	170	$lex->add_matching_form( $lex->form );
	171	}
	172	}
	173	}
	174
58f9c2b9	175	=head2 regularize
	176
	177	Call the 'regularize' function of the appropriate language model on our
	178	own reading text. This is a rules-based function distinct from 'normal_form',
	179	which can be set to any arbitrary string.
	180
	181	=cut
	182
	183	# TODO Test this stuff
	184
	185	sub regularize {
	186	my $self = shift;
	187	if( $self->has_language ) {
	188	# If we do have a language, regularize the tokens in $answer.
	189	my $mod = 'Text::Tradition::Language::' . $self->language;
58f9c2b9	190	eval { load( $mod ); };
	191	# If a module doesn't exist for our language, use the base routine
	192	$mod = 'Text::Tradition::Language::Base' if $@;
	193	return $mod->can( 'regularize' )->( $self->text );
	194	} else {
	195	return $self->text;
	196	}
	197	}
	198
a445ce40	199	around 'is_identical' => sub {
	200	my $orig = shift;
	201	my $self = shift;
	202	my $other = shift;
	203	# If the base class returns true, do an extra check to make sure the
	204	# lexemes also match.
	205	my $answer = $self->$orig( $other );
	206	if( $answer ) {
	207	if( $self->disambiguated && $other->disambiguated ) {
	208	my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
	209	my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
	210	$answer = undef unless $rform eq $uform;
	211	} elsif( $self->disambiguated xor $other->disambiguated ) {
	212	$answer = undef;
	213	}
	214	}
	215	return $answer;
	216	};
	217
	218	around 'is_combinable' => sub {
	219	my $orig = shift;
	220	my $self = shift;
	221	# If the reading is marked with invalid grammar or as a nonsense reading,
	222	# it is no longer combinable.
	223	return undef if $self->grammar_invalid \|\| $self->is_nonsense;
	224	return $self->$orig();
	225	};
	226
	227	after '_combine' => sub {
	228	my $self = shift;
	229	my $other = shift;
	230	my $joinstr = shift;
	231	$self->normal_form(
	232	join( $joinstr, $self->normal_form, $other->normal_form ) );
	233	# Combine the lexemes present in the readings
	234	if( $self->has_lexemes && $other->has_lexemes ) {
	235	$self->add_lexeme( $other->lexemes );
	236	}
	237	};
	238
e92d4229	239	1;
	240
	241	=head1 LICENSE
	242
	243	This package is free software and is provided "as is" without express
	244	or implied warranty. You can redistribute it and/or modify it under
	245	the same terms as Perl itself.
	246
	247	=head1 AUTHOR
	248
	249	Tara L Andrews E<lt>aurum@cpan.orgE<gt>