[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Morphology.pm

package Text::Tradition::Morphology;

use strict;
use warnings;
use JSON qw/ from_json /;
use Moose::Role;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;

use vars qw/ $VERSION /;
$VERSION = "0.1";

=head1 NAME

Text::Tradition::Morphology - morphology plugin for Text::Tradition

=head1 DESCRIPTION

The Text::Tradition::Morphology package enables lemma and part-of-speech
information for traditions and their Reading objects. This distribution
includes the L<Text::Tradition::Language> role for Traditions, the
L<Text::Tradition::Morphology> role (this package) for Readings, and a set
of Language::* modules for language-specific lemmatization.

See L<Text::Tradition::Collation::Reading::Lexeme> for more about the 
morphology object structure.

=cut

requires 'is_identical', 'is_combinable', '_combine';

has 'language' => (
	is => 'ro',
	isa => 'Str',
	predicate => 'has_language',
	);
	
has 'grammar_invalid' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);
	
has 'is_nonsense' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);

has 'normal_form' => (
	is => 'rw',
	isa => 'Str',
	predicate => '_has_normal_form',
	clearer => '_clear_normal_form',
	);

# Holds the lexemes for the reading.
has 'reading_lexemes' => (
	traits => ['Array'],
	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
	handles => {
		lexeme => 'get',
		lexemes => 'elements',
		has_lexemes => 'count',
		clear_lexemes => 'clear',
		add_lexeme => 'push',
		},
	default => sub { [] },
	);
	

# Make normal_form default to text, transparently.
around 'normal_form' => sub {
	my $orig = shift;
	my $self = shift;
	my( $arg ) = @_;
	if( $arg && $arg eq $self->text ) {
		$self->_clear_normal_form;
		return $arg;
	} elsif( !$arg && !$self->_has_normal_form ) {
		return $self->text;
	} else {
		$self->$orig( @_ );
	}
};

=head1 READING METHODS

Methods for the morphological information (if any) attached to readings.
A reading may be made up of multiple lexemes; the concatenated lexeme
strings ought to match the reading's normalized form.
 
See L<Text::Tradition::Collation::Reading::Lexeme> for more information
on Lexeme objects and their attributes.

=head2 has_lexemes

Returns a true value if the reading has any attached lexemes.

=head2 lexemes

Returns the Lexeme objects (if any) attached to the reading.

=head2 clear_lexemes

Wipes any associated Lexeme objects out of the reading.

=head2 add_lexeme( $lexobj )

Adds the Lexeme in $lexobj to the list of lexemes.

=head2 lemmatize

If the language of the reading is set, this method will use the appropriate
Language model to determine the lexemes that belong to this reading.  See
L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.

=cut

sub lemmatize {
	my $self = shift;
	unless( $self->has_language ) {
		warn "Please set a language to lemmatize a tradition";
		return;
	}
	my $mod = "Text::Tradition::Language::" . $self->language;
	load( $mod );
	$mod->can( 'reading_lookup' )->( $self );

}

# For graph serialization. Return a JSON representation of the associated
# reading lexemes.
sub _serialize_lexemes {
	my $self = shift;
	my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
	return $json->encode( [ $self->lexemes ] );
}

# Given a JSON representation of the lexemes, instantiate them and add
# them to the reading.
sub _deserialize_lexemes {
	my( $self, $json ) = @_;
	my $data = from_json( $json );
	return unless @$data;
	
	my @lexemes;
	foreach my $lexhash ( @$data ) {
		push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
			'JSON' => $lexhash ) );
	}
	$self->clear_lexemes;
	$self->add_lexeme( @lexemes );
}

sub disambiguated {
	my $self = shift;
	return 0 unless $self->has_lexemes;
	return !grep { !$_->is_disambiguated } $self->lexemes;
}

sub filter_lexemes {
	my $self = shift;
	# While we are here, get rid of any extra wordforms from a disambiguated
	# reading.
	if( $self->disambiguated ) {
		foreach my $lex ( $self->lexemes ) {
			$lex->clear_matching_forms();
			$lex->add_matching_form( $lex->form );
		}
	}
}

=head2 regularize

Call the 'regularize' function of the appropriate language model on our
own reading text. This is a rules-based function distinct from 'normal_form',
which can be set to any arbitrary string.

=cut

# TODO Test this stuff

sub regularize {
	my $self = shift;
	if( $self->has_language ) {
		# If we do have a language, regularize the tokens in $answer.
		my $mod = 'Text::Tradition::Language::' . $self->language;
		eval { load( $mod ); };
		# If a module doesn't exist for our language, use the base routine
		if( $@ ) {
			$mod = 'Text::Tradition::Language::Base';
			load( $mod );
		}
		return $mod->can( 'regularize' )->( $self->text );
	} else {
		return $self->text;
	}
}

around 'is_identical' => sub {
	my $orig = shift;
	my $self = shift;
	my $other = shift;
	# If the base class returns true, do an extra check to make sure the
	# lexemes also match.
	my $answer = $self->$orig( $other );
	if( $answer ) {
		if( $self->disambiguated && $other->disambiguated ) {
			my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
			my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
			$answer = undef unless $rform eq $uform;
		} elsif( $self->disambiguated xor $other->disambiguated ) {
			$answer = undef;
		}
	}
	return $answer;
};

around 'is_combinable' => sub {
	my $orig = shift;
	my $self = shift;
	# If the reading is marked with invalid grammar or as a nonsense reading,
	# it is no longer combinable.
	return undef if $self->grammar_invalid || $self->is_nonsense;
	return $self->$orig();
};

after '_combine' => sub {
	my $self = shift;
	my $other = shift;
	my $joinstr = shift;
	$self->normal_form( 
		join( $joinstr, $self->normal_form, $other->normal_form ) );
	# Combine the lexemes present in the readings
	if( $self->has_lexemes && $other->has_lexemes ) {
		$self->add_lexeme( $other->lexemes );
	}
};

=head2 relationship_added

To be called when a relationship is set, to implement the consequences of 
certain relationships.

=begin testing

# Test that normal form follows lemma setting. Draws on code both here and in
# the base module.

use Text::Tradition;

my $t = Text::Tradition->new(
	input => 'Self',
	file => 't/data/florilegium_graphml.xml' );
my $c = $t->collation;

# First try lemmatizing and then adding a relationship
my $r1 = $c->reading('w42');
my $r2 = $c->reading('w44');
$r1->normal_form('FOO');
$r2->normal_form('BAR');

$r1->make_lemma( 1 );
is( $r1->normal_form, 'FOO', "nothing changed yet" );
is( $r2->normal_form, 'BAR', "nothing changed yet" );

$c->add_relationship( $r1, $r2, { type => 'spelling' } );
is( $r2->normal_form, 'FOO', "Normal form followed lemma" );

# Now try setting relationships and then lemmatizing
my $r3 = $c->reading('w98');
my $r4 = $c->reading('w100');
my $r5 = $c->reading('w103');
$r3->normal_form('YAN');
$r4->normal_form('TAN');
$r5->normal_form('TETHERA');

$c->add_relationship( $r3, $r4, { type => 'orthographic', propagate => 1 } );
$c->add_relationship( $r3, $r5, { type => 'orthographic', propagate => 1 } );
is( $r3->normal_form, 'YAN', "nothing changed yet" );
is( $r4->normal_form, 'TAN', "nothing changed yet" );
is( $r5->normal_form, 'TETHERA', "nothing changed yet" );

$r3->make_lemma( 1 );
is( $r4->normal_form, 'YAN', "normal form propagated" );
is( $r5->normal_form, 'YAN', "normal form propagated" );

# Finally, try a relationship that shouldn't propagate the normal form
my $r6 = $c->reading('w91');
my $r7 = $c->reading('w92');
$r6->normal_form('BAZ');
$r7->normal_form('QUUX');
$r6->make_lemma( 1 );

$c->add_relationship( $r6, $r7, { type => 'grammatical' } );
is( $r7->normal_form, 'QUUX', "normal form on grammatical relationship unchanged" );

=end testing

=cut

sub relationship_added {
	my( $rdg1, $rdg2, $rel ) = @_;
	my $lemma = $rdg1->is_lemma ? $rdg1 : ( $rdg2->is_lemma ? $rdg2 : undef );
	if( $rel->type =~ /^(spelling|orthographic)$/ && $lemma ) {
		my $other = $lemma->id eq $rdg1->id ? $rdg2 : $rdg1;
		# Set the normal form on $other to match $lemma.
		$other->normal_form( $lemma->normal_form );
	}
}

1;

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
a445ce40	1	package Text::Tradition::Morphology;
	2
	3	use strict;
	4	use warnings;
	5	use JSON qw/ from_json /;
	6	use Moose::Role;
	7	use Module::Load;
	8	use Text::Tradition::Collation::Reading::Lexeme;
	9
483478bf	10	use vars qw/ $VERSION /;
	11	$VERSION = "0.1";
	12
a445ce40	13	=head1 NAME
a445ce40	14
8943ff68	15	Text::Tradition::Morphology - morphology plugin for Text::Tradition
	16
	17	=head1 DESCRIPTION
	18
	19	The Text::Tradition::Morphology package enables lemma and part-of-speech
	20	information for traditions and their Reading objects. This distribution
	21	includes the L<Text::Tradition::Language> role for Traditions, the
	22	L<Text::Tradition::Morphology> role (this package) for Readings, and a set
	23	of Language::* modules for language-specific lemmatization.
	24
	25	See L<Text::Tradition::Collation::Reading::Lexeme> for more about the
	26	morphology object structure.
a445ce40	27
	28	=cut
	29
332750fc	30	requires 'is_identical', 'is_combinable', '_combine';
332750fc	31
58f9c2b9	32	has 'language' => (
	33	is => 'ro',
	34	isa => 'Str',
	35	predicate => 'has_language',
	36	);
	37
a445ce40	38	has 'grammar_invalid' => (
	39	is => 'rw',
	40	isa => 'Bool',
	41	default => undef,
	42	);
	43
	44	has 'is_nonsense' => (
	45	is => 'rw',
	46	isa => 'Bool',
	47	default => undef,
	48	);
	49
	50	has 'normal_form' => (
	51	is => 'rw',
	52	isa => 'Str',
	53	predicate => '_has_normal_form',
	54	clearer => '_clear_normal_form',
	55	);
	56
	57	# Holds the lexemes for the reading.
	58	has 'reading_lexemes' => (
	59	traits => ['Array'],
	60	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
	61	handles => {
	62	lexeme => 'get',
	63	lexemes => 'elements',
	64	has_lexemes => 'count',
	65	clear_lexemes => 'clear',
	66	add_lexeme => 'push',
	67	},
	68	default => sub { [] },
	69	);
	70
	71
	72
	73	# Make normal_form default to text, transparently.
	74	around 'normal_form' => sub {
	75	my $orig = shift;
	76	my $self = shift;
	77	my( $arg ) = @_;
	78	if( $arg && $arg eq $self->text ) {
	79	$self->_clear_normal_form;
	80	return $arg;
	81	} elsif( !$arg && !$self->_has_normal_form ) {
	82	return $self->text;
	83	} else {
	84	$self->$orig( @_ );
	85	}
	86	};
	87
8943ff68	88	=head1 READING METHODS
a445ce40	89
	90	Methods for the morphological information (if any) attached to readings.
	91	A reading may be made up of multiple lexemes; the concatenated lexeme
	92	strings ought to match the reading's normalized form.
	93
	94	See L<Text::Tradition::Collation::Reading::Lexeme> for more information
	95	on Lexeme objects and their attributes.
	96
	97	=head2 has_lexemes
	98
	99	Returns a true value if the reading has any attached lexemes.
	100
	101	=head2 lexemes
	102
	103	Returns the Lexeme objects (if any) attached to the reading.
	104
	105	=head2 clear_lexemes
	106
	107	Wipes any associated Lexeme objects out of the reading.
	108
	109	=head2 add_lexeme( $lexobj )
	110
	111	Adds the Lexeme in $lexobj to the list of lexemes.
	112
	113	=head2 lemmatize
	114
	115	If the language of the reading is set, this method will use the appropriate
	116	Language model to determine the lexemes that belong to this reading. See
8943ff68	117	L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition.
a445ce40	118
	119	=cut
	120
	121	sub lemmatize {
	122	my $self = shift;
	123	unless( $self->has_language ) {
	124	warn "Please set a language to lemmatize a tradition";
	125	return;
	126	}
	127	my $mod = "Text::Tradition::Language::" . $self->language;
	128	load( $mod );
	129	$mod->can( 'reading_lookup' )->( $self );
	130
	131	}
	132
	133	# For graph serialization. Return a JSON representation of the associated
	134	# reading lexemes.
	135	sub _serialize_lexemes {
	136	my $self = shift;
	137	my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
	138	return $json->encode( [ $self->lexemes ] );
	139	}
	140
	141	# Given a JSON representation of the lexemes, instantiate them and add
	142	# them to the reading.
	143	sub _deserialize_lexemes {
	144	my( $self, $json ) = @_;
	145	my $data = from_json( $json );
	146	return unless @$data;
	147
	148	my @lexemes;
	149	foreach my $lexhash ( @$data ) {
	150	push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
	151	'JSON' => $lexhash ) );
	152	}
	153	$self->clear_lexemes;
	154	$self->add_lexeme( @lexemes );
	155	}
	156
	157	sub disambiguated {
	158	my $self = shift;
	159	return 0 unless $self->has_lexemes;
	160	return !grep { !$_->is_disambiguated } $self->lexemes;
	161	}
	162
	163	sub filter_lexemes {
	164	my $self = shift;
	165	# While we are here, get rid of any extra wordforms from a disambiguated
	166	# reading.
	167	if( $self->disambiguated ) {
	168	foreach my $lex ( $self->lexemes ) {
	169	$lex->clear_matching_forms();
	170	$lex->add_matching_form( $lex->form );
	171	}
	172	}
	173	}
	174
58f9c2b9	175	=head2 regularize
	176
	177	Call the 'regularize' function of the appropriate language model on our
	178	own reading text. This is a rules-based function distinct from 'normal_form',
	179	which can be set to any arbitrary string.
	180
	181	=cut
	182
	183	# TODO Test this stuff
	184
	185	sub regularize {
	186	my $self = shift;
	187	if( $self->has_language ) {
	188	# If we do have a language, regularize the tokens in $answer.
	189	my $mod = 'Text::Tradition::Language::' . $self->language;
58f9c2b9	190	eval { load( $mod ); };
58f9c2b9	191	# If a module doesn't exist for our language, use the base routine
48cb9a90	192	if( $@ ) {
	193	$mod = 'Text::Tradition::Language::Base';
	194	load( $mod );
	195	}
58f9c2b9	196	return $mod->can( 'regularize' )->( $self->text );
	197	} else {
	198	return $self->text;
	199	}
	200	}
	201
a445ce40	202	around 'is_identical' => sub {
	203	my $orig = shift;
	204	my $self = shift;
	205	my $other = shift;
	206	# If the base class returns true, do an extra check to make sure the
	207	# lexemes also match.
	208	my $answer = $self->$orig( $other );
	209	if( $answer ) {
	210	if( $self->disambiguated && $other->disambiguated ) {
	211	my $rform = join( '//', map { $_->form->to_string } $self->lexemes );
	212	my $uform = join( '//', map { $_->form->to_string } $other->lexemes );
	213	$answer = undef unless $rform eq $uform;
	214	} elsif( $self->disambiguated xor $other->disambiguated ) {
	215	$answer = undef;
	216	}
	217	}
	218	return $answer;
	219	};
	220
	221	around 'is_combinable' => sub {
	222	my $orig = shift;
	223	my $self = shift;
	224	# If the reading is marked with invalid grammar or as a nonsense reading,
	225	# it is no longer combinable.
	226	return undef if $self->grammar_invalid \|\| $self->is_nonsense;
	227	return $self->$orig();
	228	};
	229
	230	after '_combine' => sub {
	231	my $self = shift;
	232	my $other = shift;
	233	my $joinstr = shift;
	234	$self->normal_form(
	235	join( $joinstr, $self->normal_form, $other->normal_form ) );
	236	# Combine the lexemes present in the readings
	237	if( $self->has_lexemes && $other->has_lexemes ) {
	238	$self->add_lexeme( $other->lexemes );
	239	}
	240	};
	241
45095bee	242	=head2 relationship_added
	243
	244	To be called when a relationship is set, to implement the consequences of
	245	certain relationships.
	246
	247	=begin testing
	248
	249	# Test that normal form follows lemma setting. Draws on code both here and in
	250	# the base module.
	251
	252	use Text::Tradition;
	253
	254	my $t = Text::Tradition->new(
	255	input => 'Self',
	256	file => 't/data/florilegium_graphml.xml' );
	257	my $c = $t->collation;
	258
	259	# First try lemmatizing and then adding a relationship
	260	my $r1 = $c->reading('w42');
	261	my $r2 = $c->reading('w44');
	262	$r1->normal_form('FOO');
	263	$r2->normal_form('BAR');
	264
	265	$r1->make_lemma( 1 );
	266	is( $r1->normal_form, 'FOO', "nothing changed yet" );
	267	is( $r2->normal_form, 'BAR', "nothing changed yet" );
	268
	269	$c->add_relationship( $r1, $r2, { type => 'spelling' } );
	270	is( $r2->normal_form, 'FOO', "Normal form followed lemma" );
	271
	272	# Now try setting relationships and then lemmatizing
	273	my $r3 = $c->reading('w98');
	274	my $r4 = $c->reading('w100');
	275	my $r5 = $c->reading('w103');
	276	$r3->normal_form('YAN');
	277	$r4->normal_form('TAN');
	278	$r5->normal_form('TETHERA');
	279
	280	$c->add_relationship( $r3, $r4, { type => 'orthographic', propagate => 1 } );
	281	$c->add_relationship( $r3, $r5, { type => 'orthographic', propagate => 1 } );
	282	is( $r3->normal_form, 'YAN', "nothing changed yet" );
	283	is( $r4->normal_form, 'TAN', "nothing changed yet" );
	284	is( $r5->normal_form, 'TETHERA', "nothing changed yet" );
	285
	286	$r3->make_lemma( 1 );
	287	is( $r4->normal_form, 'YAN', "normal form propagated" );
	288	is( $r5->normal_form, 'YAN', "normal form propagated" );
	289
	290	# Finally, try a relationship that shouldn't propagate the normal form
	291	my $r6 = $c->reading('w91');
	292	my $r7 = $c->reading('w92');
	293	$r6->normal_form('BAZ');
	294	$r7->normal_form('QUUX');
	295	$r6->make_lemma( 1 );
	296
	297	$c->add_relationship( $r6, $r7, { type => 'grammatical' } );
	298	is( $r7->normal_form, 'QUUX', "normal form on grammatical relationship unchanged" );
	299
	300	=end testing
	301
	302	=cut
	303
	304	sub relationship_added {
	305	my( $rdg1, $rdg2, $rel ) = @_;
306	my $lemma = $rdg1->is_lemma ? $rdg1 : ( $rdg2->is_lemma ? $rdg2 : undef );
307	if( $rel->type =~ /^(spelling\|orthographic)$/ && $lemma ) {
308	my $other = $lemma->id eq $rdg1->id ? $rdg2 : $rdg1;
309	# Set the normal form on $other to match $lemma.
310	$other->normal_form( $lemma->normal_form );
311	}
312	}
313
e92d4229	314	1;
	315
	316	=head1 LICENSE
	317
	318	This package is free software and is provided "as is" without express
	319	or implied warranty. You can redistribute it and/or modify it under
	320	the same terms as Perl itself.
	321
	322	=head1 AUTHOR
	323
	324	Tara L Andrews E<lt>aurum@cpan.orgE<gt>