[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / Reading.pm

package Text::Tradition::Collation::Reading;

use Moose;
use overload '""' => \&_stringify, 'fallback' => 1;

=head1 NAME

Text::Tradition::Collation::Reading - represents a reading (usually a word) in a collation.
    
=head1 DESCRIPTION

Text::Tradition is a library for representation and analysis of collated
texts, particularly medieval ones.  A 'reading' refers to a unit of text,
usually a word, that appears in one or more witnesses (manuscripts) of the
tradition; the text of a given witness is composed of a set of readings in
a particular sequence.

=head1 METHODS

=head2 new

Creates a new reading in the given collation with the given attributes. 
Options include:

=over 4

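=item collation - The Text::Tradition::Collation object to which this reading belongs.  Not enforced by the constructor, but needed by the convenience methods (related_readings, witnesses, predecessors, successors, set_identical).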

=item id - A unique identifier for this reading. Required.

=item text - The word or other text of the reading.

=item is_start - The reading is the starting point for the collation.

=item is_end - The reading is the ending point for the collation.

=item is_lacuna - The 'reading' represents a known gap in the text.

=item is_ph - A temporary placeholder for apparatus parsing purposes.  Do not use unless you know what you are doing.

=item rank - The sequence number of the reading. This should probably not be set manually.

=back

One of 'text', 'is_start', 'is_end', 'is_lacuna', or 'is_ph' is required; for the flags, the text is filled in automatically.

=head2 collation

=head2 id

=head2 text

=head2 is_start

=head2 is_end

=head2 is_lacuna

=head2 rank

Accessor methods for the given attributes.

=cut

# The collation this reading belongs to, held as a weak reference.  The
# 'required' flag is currently commented out.
has collation => (
	isa      => 'Text::Tradition::Collation',
	is       => 'ro',
	# required => 1,
	weak_ref => 1,
);

# Identifier of the reading; must be supplied.
has id => (
	isa      => 'Str',
	is       => 'ro',
	required => 1,
);

# Text of the reading; must be supplied.  Read through text(), changed
# through alter_text().
has text => (
	isa      => 'Str',
	is       => 'ro',
	writer   => 'alter_text',
	required => 1,
);

# Read-only boolean markers for the special kinds of reading.  Each one
# is undefined unless given to the constructor.
has [ qw/ is_start is_end is_lacuna is_ph / ] => (
	isa     => 'Bool',
	is      => 'ro',
	default => undef,
);

# Read-write boolean marker, undefined until it is set.
has is_common => (
	isa     => 'Bool',
	is      => 'rw',
	default => undef,
);

# Integer rank.  has_rank reports whether one is set, clear_rank unsets it.
has rank => (
	isa       => 'Int',
	is        => 'rw',
	predicate => 'has_rank',
	clearer   => 'clear_rank',
);
    
## For morphological analysis

# Optional normalised form of the reading.  When set, use_lexemes compares
# the proposed lexemes against it instead of against the text.
has 'normal_form' => (
	is => 'rw',
	isa => 'Str',
	predicate => 'has_normal_form',
	);

# Optional lemma for the reading as a whole.
has 'lemma' => (
	is => 'rw',
	isa => 'Str',
	predicate => 'has_lemma',
	);

# List of single-key hashes, one per lexeme, each mapping the lexeme string
# to the list of Morphology objects proposed for it.  The list starts out
# empty, so that the delegated methods (has_morphology, lexemes, ...) can be
# called on a reading that has not been given any lexemes yet instead of
# dereferencing an unset slot.
has 'morphology' => (
	traits => ['Array'],
	isa => 'ArrayRef[HashRef[ArrayRef[Text::Tradition::Collation::Reading::Morphology]]]',
	default => sub { [] },
	handles => {
		lexemes => 'elements',
		has_morphology => 'count',
		_clear_morph => 'clear',
		_add_morph => 'push',
		},
	);
	
## For prefix/suffix readings

# Read-only boolean flags, undefined unless given to the constructor.
has [ qw/ join_prior join_next / ] => (
	isa     => 'Bool',
	is      => 'ro',
	default => undef,
);


# Normalises the constructor arguments before Moose processes them.
#
# Accepts either a single hash reference or a flat key/value list.  The
# arguments are shallow-copied first, so a hash reference handed in by the
# caller is never modified.  When one of the special flags is true, the
# text (and for start/end also the ID) is filled in to match:
#
#   is_lacuna - text becomes '#LACUNA#', unless a text was given
#   is_start  - id and text become '#START#', rank becomes 0
#   is_end    - id and text become '#END#'
#   is_ph     - text is copied from the id
#
# Only the first applicable case, in the order above, is used.  A flag that
# is present but false (e.g. is_start => 0) is ignored rather than being
# treated as set.
around BUILDARGS => sub {
	my $orig = shift;
	my $class = shift;
	my $args = ( @_ == 1 && ref( $_[0] ) ) ? { %{ $_[0] } } : { @_ };

	# If one of our special booleans is set, we change the text and the
	# ID to match.
	if( $args->{'is_lacuna'} && !exists $args->{'text'} ) {
		$args->{'text'} = '#LACUNA#';
	} elsif( $args->{'is_start'} ) {
		$args->{'id'} = '#START#';  # Change the ID to ensure we have only one
		$args->{'text'} = '#START#';
		$args->{'rank'} = 0;
	} elsif( $args->{'is_end'} ) {
		$args->{'id'} = '#END#';	# Change the ID to ensure we have only one
		$args->{'text'} = '#END#';
	} elsif( $args->{'is_ph'} ) {
		$args->{'text'} = $args->{'id'};
	}

	return $class->$orig( $args );
};

=head2 is_meta

A meta attribute (ha ha), which should be true if any of our 'special'
booleans are true.  Implies that the reading does not represent a bit 
of text found in a witness.

=cut

# Returns the first true value among is_start, is_end and is_lacuna, and
# otherwise whatever is_ph holds.
sub is_meta {
	my( $reading ) = @_;
	foreach my $flag ( qw/ is_start is_end is_lacuna / ) {
		my $value = $reading->$flag;
		return $value if $value;
	}
	return $reading->is_ph;
}

=head1 Convenience methods

=head2 related_readings

Calls Collation's related_readings with $self as the first argument.

=cut

# Hands the call on to the collation, putting this reading in front of
# whatever other arguments were given.
sub related_readings {
	my( $self, @rest ) = @_;
	my $collation = $self->collation;
	return $collation->related_readings( $self, @rest );
}

=head2 witnesses 

Calls Collation's reading_witnesses with $self as the first argument.

=cut

# Hands the call on to the collation's reading_witnesses.  @_ already
# begins with this reading, so the argument list is forwarded as it stands.
sub witnesses {
	my $owner = $_[0]->collation;
	return $owner->reading_witnesses( @_ );
}

=head2 predecessors

Returns a list of Reading objects that immediately precede $self in the collation.

=cut

# Asks the collation's sequence for the predecessors of this reading's ID
# and looks each returned ID up through the collation's reading().
sub predecessors {
	my( $reading ) = @_;
	my @prior_ids = $reading->collation->sequence->predecessors( $reading->id );
	my @found;
	foreach my $rid ( @prior_ids ) {
		push( @found, $reading->collation->reading( $rid ) );
	}
	return @found;
}

=head2 successors

Returns a list of Reading objects that immediately follow $self in the collation.

=cut

# Asks the collation's sequence for the successors of this reading's ID
# and looks each returned ID up through the collation's reading().
sub successors {
	my( $reading ) = @_;
	my @next_ids = $reading->collation->sequence->successors( $reading->id );
	my @found;
	foreach my $rid ( @next_ids ) {
		push( @found, $reading->collation->reading( $rid ) );
	}
	return @found;
}

=head2 set_identical( $other_reading)

Backwards compatibility method, to add a transposition relationship
between $self and $other_reading.  Don't use this.

=cut

# Asks the collation to add a relationship of type 'transposition' between
# this reading and the other one, and returns whatever that call returns.
sub set_identical {
	my $self = shift;
	my $other = shift;
	my %rel_opts = ( 'type' => 'transposition' );
	return $self->collation->add_relationship( $self, $other, \%rel_opts );
}

# Handler for the '""' overload declared at the top of the package: a
# reading stringifies to its ID.
sub _stringify {
	my( $reading ) = @_;
	my $id = $reading->id;
	return $id;
}

=head1 MORPHOLOGY

A few methods to try to tack on morphological information.

=head2 is_disambiguated

Returns true if there is only one tag per lexeme in this reading.

=cut

# Replaces the reading's lexeme list with the given lexemes, each starting
# with an empty tag list.  The lexemes, joined together, must equal the
# normal form (or, if there is none, the text), ignoring case, whitespace
# and hyphens; otherwise a warning is issued and nothing is changed.
sub use_lexemes {
	my $self = shift;
	my @parts = @_;
	# Lower-case a string and strip all whitespace and hyphens from it.
	my $squash = sub {
		my $str = lc( shift );
		$str =~ s/[\s-]+//g;
		return $str;
	};
	my $expected = $squash->( $self->has_normal_form ? $self->normal_form : $self->text );
	my $offered = $squash->( join( '', @parts ) );
	if( $offered ne $expected ) {
		warn "Cannot split " . $self->text . " into " . join( '.', @parts );
		return;
	}
	$self->_clear_morph;
	return map { $self->_add_morph( { $_ => [] } ) } @parts;
}

# Adds one morphological tag to a lexeme of this reading.
#
# Called as add_morphological_tag( $lexeme, $opts ) or, for a reading that
# is a single lexeme, as add_morphological_tag( $opts ); in the second form
# the reading text is used as the lexeme.  $opts is handed to the
# Morphology constructor.  Returns the result of the push onto the lexeme's
# tag list, or nothing (after a warning) if the lexeme is not part of this
# reading.
sub add_morphological_tag {
	my( $self, $lexeme, $opts ) = @_;
	# Looks up the container hash for a lexeme, or returns nothing.  The
	# attribute is first asked whether it holds a value at all, so that the
	# delegated lexemes() is only called once there is a list to read.
	my $container_for = sub {
		my $wanted = shift;
		my $attr = $self->meta->find_attribute_by_name( 'morphology' );
		return unless $attr->has_value( $self );
		my( $found ) = grep { exists $_->{$wanted} } $self->lexemes;
		return $found;
	};
	unless( $opts ) {
		# No lexeme was passed; use reading text.
		$opts = $lexeme;
		$lexeme = $self->text;
		# use_lexemes() clears the morphology before re-creating the
		# containers, so it is only called when the whole-text lexeme is
		# not set up yet; otherwise the tags added by earlier calls would
		# be thrown away.
		$self->use_lexemes( $lexeme ) unless $container_for->( $lexeme );
	}
	# Get the correct container
	my $struct = $container_for->( $lexeme );
	unless( $struct ) {
		warn "No lexeme $lexeme exists in this reading";
		return;
	}
	# Now make the morph object and add it to this lexeme.
	my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts );
	# TODO Check for existence
	return push( @{$struct->{$lexeme}}, $morph_obj );
}

# Reduces the tag list of a lexeme to the single tag at the given index.
#
# Called as disambiguate( $lexeme, $index ) or, for a reading that is a
# single lexeme, as disambiguate( $index ); in the second form the reading
# text is used as the lexeme.  Returns the new one-element tag list, or
# nothing (after a warning) if the lexeme is not part of this reading or
# has no tag at that index.
sub disambiguate {
	my( $self, $lexeme, $index ) = @_;
	# Test for definedness, not truth: index 0 is a perfectly good index
	# and must not be mistaken for a missing second argument.
	unless( defined $index ) {
		# No lexeme was passed; use reading text.
		$index = $lexeme;
		$lexeme = $self->text;
	}
	# Get the correct container
	my( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes;
	unless( $struct ) {
		warn "No lexeme $lexeme exists in this reading";
		return;
	}
	# Keep the object at the selected index.  If there is none, leave the
	# tag list alone instead of replacing it with a list holding undef.
	my $selected = $struct->{$lexeme}->[$index];
	unless( defined $selected ) {
		warn "No tag at index $index for lexeme $lexeme";
		return;
	}
	return $struct->{$lexeme} = [ $selected ];
}

# Returns 1 when the reading has at least one lexeme and every lexeme
# carries exactly one tag; returns undef otherwise.
sub is_disambiguated {
	my( $reading ) = @_;
	if( $reading->has_morphology ) {
		for my $container ( $reading->lexemes ) {
			# Each container holds a single lexeme => tag list pair.
			my( $name ) = keys %$container;
			my $tagcount = scalar @{ $container->{$name} };
			return undef if $tagcount != 1;
		}
		return 1;
	}
	return undef;
}

## Utility methods

# A reading is represented by its text alone.
sub TO_JSON {
	my( $reading ) = @_;
	my $text = $reading->text;
	return $text;
}

## TODO will need a throw() here

# Remove the Moose keywords (has, around, ...) from this package, then
# make the class immutable now that its definition is complete.
no Moose;
__PACKAGE__->meta->make_immutable;

###################################################
### Morphology objects, to be attached to readings
###################################################

package Text::Tradition::Collation::Reading::Morphology;

use Moose;

# lemma, code and language are all mandatory, read-only strings.
has [ qw/ lemma code language / ] => (
	isa      => 'Str',
	is       => 'ro',
	required => 1,
);
	
## Transmute codes into comparison arrays for our various languages.

# Normalises the constructor arguments before Moose processes them.
#
# Accepts either a single reference or a flat key/value list.  A 'serial'
# argument, in the LEMMA!!CODE form that serialization() produces, is
# removed and unpacked into the 'lemma' and 'code' arguments.
around BUILDARGS => sub {
	my $orig = shift;
	my $class = shift;
	# Work on a shallow copy, so that unpacking 'serial' below does not
	# alter a hash that the caller passed in and may want to reuse.
	my $args = ( @_ == 1 && ref( $_[0] ) ) ? { %{ $_[0] } } : { @_ };
	if( exists( $args->{'serial'} ) ) {
		# Split on the first separator only: serialization() joins exactly
		# two fields, so anything after the first '!!' belongs to the code.
		my( $lemma, $code ) = split( /!!/, delete $args->{'serial'}, 2 );
		$args->{'lemma'} = $lemma;
		$args->{'code'} = $code;
	}
	return $class->$orig( $args );
};

# Returns the lemma and the code as one string, separated by '!!'.
sub serialization {
	my( $morph ) = @_;
	return $morph->lemma . '!!' . $morph->code;
}

# Turns the morphological code into a list of fields for comparison.
#
# French:  the code is split on '+'.  The leading parts without '=' come
#          first, padded with undef to at least two entries; the values of
#          the key=value parts follow in the fixed order Pers, Nb, Temps,
#          Genre, Spec, Fonc, with undef for any key that is absent.
# English: not implemented yet; returns an empty list.
# Others:  the code is split into its individual characters.
sub comparison_array {
	my $self = shift;
	if( $self->language eq 'French' ) {
		my @array;
		my @bits = split( /\+/, $self->code );
		# First push the non k/v parts.
		while( @bits && $bits[0] !~ /=/ ) {
			push( @array, shift @bits );
		}
		while( @array < 2 ) {
			push( @array, undef );
		}
		# Now push the k/v parts in a known order.
		my @fields = qw/ Pers Nb Temps Genre Spec Fonc /;
		my %props;
		foreach my $bit ( @bits ) {
			my( $k, $v ) = split( /=/, $bit );
			$props{$k} = $v;
		}
		push( @array, @props{@fields} );
		# Give the answer.
		return @array;
	} elsif( $self->language eq 'English' ) {
		# Do something as yet undetermined.  Until then, return nothing
		# explicitly rather than falling off the end of the sub.
		return;
	} else {
		# Latin or Greek or Armenian, just split the chars
		return split( '', $self->code );
	}
}

# Remove the Moose keywords from the Morphology package and make the class
# immutable now that its definition is complete.
no Moose;
__PACKAGE__->meta->make_immutable;

# A module file has to end by returning a true value.
1;
Commit	Line	Data
784877d9	1	package Text::Tradition::Collation::Reading;
784877d9	2
8e1394aa	3	use Moose;
e4b0f464	4	use overload '""' => \&_stringify, 'fallback' => 1;
784877d9	5
3a2ebbf4	6	=head1 NAME
784877d9	7
3a2ebbf4	8	Text::Tradition::Collation::Reading - represents a reading (usually a word) in a collation.
eca16057	9
3a2ebbf4	10	=head1 DESCRIPTION
784877d9	11
3a2ebbf4	12	Text::Tradition is a library for representation and analysis of collated
	13	texts, particularly medieval ones. A 'reading' refers to a unit of text,
	14	usually a word, that appears in one or more witnesses (manuscripts) of the
	15	tradition; the text of a given witness is composed of a set of readings in
	16	a particular sequence
784877d9	17
3a2ebbf4	18	=head1 METHODS
1ca1163d	19
3a2ebbf4	20	=head2 new
8e1394aa	21
3a2ebbf4	22	Creates a new reading in the given collation with the given attributes.
3a2ebbf4	23	Options include:
94c00c71	24
3a2ebbf4	25	=over 4
784877d9	26
3a2ebbf4	27	=item collation - The Text::Tradition::Collation object to which this reading belongs. Required.
e2902068	28
3a2ebbf4	29	=item id - A unique identifier for this reading. Required.
910a0a6d	30
3a2ebbf4	31	=item text - The word or other text of the reading.
784877d9	32
3a2ebbf4	33	=item is_start - The reading is the starting point for the collation.
3265b0ce	34
3a2ebbf4	35	=item is_end - The reading is the ending point for the collation.
784877d9	36
3a2ebbf4	37	=item is_lacuna - The 'reading' represents a known gap in the text.
de51424a	38
12720144	39	=item is_ph - A temporary placeholder for apparatus parsing purposes. Do not use unless you know what you are doing.
12720144	40
3a2ebbf4	41	=item rank - The sequence number of the reading. This should probably not be set manually.
d047cd52	42
3a2ebbf4	43	=back
8e1394aa	44
3a2ebbf4	45	One of 'text', 'is_start', 'is_end', or 'is_lacuna' is required.
8e1394aa	46
3a2ebbf4	47	=head2 collation
94c00c71	48
3a2ebbf4	49	=head2 id
94c00c71	50
3a2ebbf4	51	=head2 text
4cdd82f1	52
3a2ebbf4	53	=head2 is_start
4cdd82f1	54
3a2ebbf4	55	=head2 is_end
4a8828f0	56
3a2ebbf4	57	=head2 is_lacuna
4a8828f0	58
3a2ebbf4	59	=head2 rank
4a8828f0	60
3a2ebbf4	61	Accessor methods for the given attributes.
d047cd52	62
3a2ebbf4	63	=cut
d047cd52	64
3a2ebbf4	65	has 'collation' => (
	66	is => 'ro',
	67	isa => 'Text::Tradition::Collation',
	68	# required => 1,
	69	weak_ref => 1,
	70	);
d047cd52	71
3a2ebbf4	72	has 'id' => (
	73	is => 'ro',
	74	isa => 'Str',
	75	required => 1,
	76	);
d047cd52	77
3a2ebbf4	78	has 'text' => (
	79	is => 'ro',
	80	isa => 'Str',
	81	required => 1,
49d4f2ac	82	writer => 'alter_text',
3a2ebbf4	83	);
0e47f4f6	84
3a2ebbf4	85	has 'is_start' => (
	86	is => 'ro',
	87	isa => 'Bool',
	88	default => undef,
	89	);
	90
	91	has 'is_end' => (
	92	is => 'ro',
	93	isa => 'Bool',
	94	default => undef,
	95	);
	96
	97	has 'is_lacuna' => (
	98	is => 'ro',
	99	isa => 'Bool',
	100	default => undef,
	101	);
12720144	102
	103	has 'is_ph' => (
	104	is => 'ro',
	105	isa => 'Bool',
	106	default => undef,
	107	);
d4b75f44	108
	109	has 'is_common' => (
	110	is => 'rw',
	111	isa => 'Bool',
	112	default => undef,
	113	);
3a2ebbf4	114
	115	has 'rank' => (
	116	is => 'rw',
	117	isa => 'Int',
	118	predicate => 'has_rank',
ca6e6095	119	clearer => 'clear_rank',
3a2ebbf4	120	);
fd602649	121
	122	## For morphological analysis
	123
	124	has 'normal_form' => (
	125	is => 'rw',
	126	isa => 'Str',
	127	predicate => 'has_normal_form',
	128	);
	129
	130	has 'lemma' => (
	131	is => 'rw',
	132	isa => 'Str',
	133	predicate => 'has_lemma',
	134	);
	135
	136	has 'morphology' => (
4d9593df	137	traits => ['Array'],
	138	isa => 'ArrayRef[HashRef[ArrayRef[Text::Tradition::Collation::Reading::Morphology]]]',
	139	handles => {
	140	lexemes => 'elements',
	141	has_morphology => 'count',
	142	_clear_morph => 'clear',
	143	_add_morph => 'push',
	144	},
fd602649	145	);
fd602649	146
629e27b0	147	## For prefix/suffix readings
	148
	149	has 'join_prior' => (
	150	is => 'ro',
	151	isa => 'Bool',
	152	default => undef,
	153	);
	154
	155	has 'join_next' => (
	156	is => 'ro',
	157	isa => 'Bool',
	158	default => undef,
	159	);
	160
3a2ebbf4	161
	162	around BUILDARGS => sub {
	163	my $orig = shift;
	164	my $class = shift;
	165	my $args;
	166	if( @_ == 1 ) {
	167	$args = shift;
	168	} else {
	169	$args = { @_ };
	170	}
b0b4421a	171
3a2ebbf4	172	# If one of our special booleans is set, we change the text and the
3a2ebbf4	173	# ID to match.
1d310495	174	if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) {
56eefa04	175	$args->{'text'} = '#LACUNA#';
3a2ebbf4	176	} elsif( exists $args->{'is_start'} ) {
	177	$args->{'id'} = '#START#'; # Change the ID to ensure we have only one
	178	$args->{'text'} = '#START#';
	179	$args->{'rank'} = 0;
	180	} elsif( exists $args->{'is_end'} ) {
	181	$args->{'id'} = '#END#'; # Change the ID to ensure we have only one
	182	$args->{'text'} = '#END#';
12720144	183	} elsif( exists $args->{'is_ph'} ) {
12720144	184	$args->{'text'} = $args->{'id'};
3a2ebbf4	185	}
	186
	187	$class->$orig( $args );
	188	};
	189
	190	=head2 is_meta
	191
	192	A meta attribute (ha ha), which should be true if any of our 'special'
	193	booleans are true. Implies that the reading does not represent a bit
	194	of text found in a witness.
	195
	196	=cut
	197
	198	sub is_meta {
	199	my $self = shift;
12720144	200	return $self->is_start \|\| $self->is_end \|\| $self->is_lacuna \|\| $self->is_ph;
3a2ebbf4	201	}
3a2ebbf4	202
027d819c	203	=head1 Convenience methods
	204
	205	=head2 related_readings
	206
	207	Calls Collation's related_readings with $self as the first argument.
	208
	209	=cut
	210
3a2ebbf4	211	sub related_readings {
	212	my $self = shift;
	213	return $self->collation->related_readings( $self, @_ );
	214	}
	215
7f52eac8	216	=head2 witnesses
	217
	218	Calls Collation's reading_witnesses with $self as the first argument.
	219
	220	=cut
	221
	222	sub witnesses {
	223	my $self = shift;
	224	return $self->collation->reading_witnesses( $self, @_ );
	225	}
	226
027d819c	227	=head2 predecessors
	228
	229	Returns a list of Reading objects that immediately precede $self in the collation.
	230
	231	=cut
	232
22222af9	233	sub predecessors {
	234	my $self = shift;
	235	my @pred = $self->collation->sequence->predecessors( $self->id );
	236	return map { $self->collation->reading( $_ ) } @pred;
	237	}
	238
027d819c	239	=head2 successors
	240
	241	Returns a list of Reading objects that immediately follow $self in the collation.
	242
	243	=cut
	244
22222af9	245	sub successors {
	246	my $self = shift;
	247	my @succ = $self->collation->sequence->successors( $self->id );
	248	return map { $self->collation->reading( $_ ) } @succ;
	249	}
	250
027d819c	251	=head2 set_identical( $other_reading)
	252
	253	Backwards compatibility method, to add a transposition relationship
	254	between $self and $other_reading. Don't use this.
	255
	256	=cut
	257
1d310495	258	sub set_identical {
	259	my( $self, $other ) = @_;
	260	return $self->collation->add_relationship( $self, $other,
	261	{ 'type' => 'transposition' } );
	262	}
	263
3a2ebbf4	264	sub _stringify {
	265	my $self = shift;
	266	return $self->id;
	267	}
d047cd52	268
4d9593df	269	=head1 MORPHOLOGY
	270
	271	A few methods to try to tack on morphological information.
	272
	273	=head2 is_disambiguated
	274
	275	Returns true if there is only one tag per lexeme in this reading.
	276
	277	=cut
	278
	279	sub use_lexemes {
	280	my( $self, @lexemes ) = @_;
	281	# The lexemes need to be the same as $self->text.
	282	my $cmpstr = $self->has_normal_form ? lc( $self->normal_form ) : lc( $self->text );
	283	$cmpstr =~ s/[\s-]+//g;
	284	my $lexstr = lc( join( '', @lexemes ) );
	285	$lexstr =~ s/[\s-]+//g;
	286	unless( $lexstr eq $cmpstr ) {
	287	warn "Cannot split " . $self->text . " into " . join( '.', @lexemes );
	288	return;
	289	}
	290	$self->_clear_morph;
	291	map { $self->_add_morph( { $_ => [] } ) } @lexemes;
	292	}
	293
	294	sub add_morphological_tag {
	295	my( $self, $lexeme, $opts ) = @_;
	296	my $struct;
	297	unless( $opts ) {
	298	# No lexeme was passed; use reading text.
	299	$opts = $lexeme;
	300	$lexeme = $self->text;
	301	$self->use_lexemes( $lexeme );
	302	}
	303	# Get the correct container
	304	( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes;
	305	unless( $struct ) {
	306	warn "No lexeme $lexeme exists in this reading";
	307	return;
	308	}
	309	# Now make the morph object and add it to this lexeme.
	310	my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts );
	311	# TODO Check for existence
	312	push( @{$struct->{$lexeme}}, $morph_obj );
	313	}
	314
	315	sub disambiguate {
	316	my( $self, $lexeme, $index ) = @_;
	317	my $struct;
	318	unless( $index ) {
	319	# No lexeme was passed; use reading text.
	320	$index = $lexeme;
	321	$lexeme = $self->text;
	322	}
	323	# Get the correct container
	324	( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes;
	325	unless( $struct ) {
	326	warn "No lexeme $lexeme exists in this reading";
	327	return;
	328	}
	329	# Keep the object at the selected index
	330	my $selected = $struct->{$lexeme}->[$index];
	331	$struct->{$lexeme} = [ $selected ];
	332	}
333
334	sub is_disambiguated {
335	my $self = shift;
336	return undef unless $self->has_morphology;
337	foreach my $lexeme ( $self->lexemes ) {
338	my( $key ) = keys %$lexeme; # will be only one
339	return undef unless @{$lexeme->{$key}} == 1;
340	}
341	return 1;
342	}
343
344	## Utility methods
345
2acf0892	346	sub TO_JSON {
	347	my $self = shift;
	348	return $self->text;
	349	}
	350
4d9593df	351	## TODO will need a throw() here
	352
	353	no Moose;
	354	__PACKAGE__->meta->make_immutable;
	355
	356	###################################################
	357	### Morphology objects, to be attached to readings
	358	###################################################
	359
	360	package Text::Tradition::Collation::Reading::Morphology;
	361
	362	use Moose;
	363
	364	has 'lemma' => (
	365	is => 'ro',
	366	isa => 'Str',
	367	required => 1,
	368	);
	369
	370	has 'code' => (
	371	is => 'ro',
	372	isa => 'Str',
	373	required => 1,
	374	);
	375
	376	has 'language' => (
	377	is => 'ro',
	378	isa => 'Str',
	379	required => 1,
	380	);
	381
	382	## Transmute codes into comparison arrays for our various languages.
	383
	384	around BUILDARGS => sub {
	385	my $orig = shift;
	386	my $class = shift;
	387	my $args;
	388	if( @_ == 1 && ref( $_[0] ) ) {
	389	$args = shift;
	390	} else {
	391	$args = { @_ };
	392	}
	393	if( exists( $args->{'serial'} ) ) {
	394	my( $lemma, $code ) = split( /!!/, delete $args->{'serial'} );
	395	$args->{'lemma'} = $lemma;
	396	$args->{'code'} = $code;
	397	}
	398	$class->$orig( $args );
	399	};
	400
	401	sub serialization {
	402	my $self = shift;
	403	return join( '!!', $self->lemma, $self->code );
	404	};
	405
	406	sub comparison_array {
	407	my $self = shift;
	408	if( $self->language eq 'French' ) {
	409	my @array;
	410	my @bits = split( /\+/, $self->code );
	411	# First push the non k/v parts.
	412	while( @bits && $bits[0] !~ /=/ ) {
	413	push( @array, shift @bits );
	414	}
415	while( @array < 2 ) {
416	push( @array, undef );
417	}
418	# Now push the k/v parts in a known order.
419	my @fields = qw/ Pers Nb Temps Genre Spec Fonc /;
420	my %props;
421	map { my( $k, $v ) = split( /=/, $_ ); $props{$k} = $v; } @bits;
422	foreach my $k ( @fields ) {
423	push( @array, $props{$k} );
424	}
425	# Give the answer.
426	return @array;
427	} elsif( $self->language eq 'English' ) {
428	# Do something as yet undetermined
429	} else {
430	# Latin or Greek or Armenian, just split the chars
431	return split( '', $self->code );
432	}
433	};
434
021bdbac	435	no Moose;
021bdbac	436	__PACKAGE__->meta->make_immutable;
d047cd52	437
021bdbac	438	1;
d047cd52	439