[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / Reading.pm

package Text::Tradition::Collation::Reading;

use Moose;
use Moose::Util::TypeConstraints;
use JSON qw/ from_json /;
use Module::Load;
use Text::Tradition::Error;
use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx );
use YAML::XS;
use overload '""' => \&_stringify, 'fallback' => 1;

# A reading ID is a string that matches, from start to end, the XML 1.0
# name pattern exported by XML::Easy::Syntax.
subtype 'ReadingID'
	=> as 'Str'
	=> where { /\A$xml10_name_rx\z/ }
	=> message { 'Reading ID must be a valid XML attribute string' };
	
no Moose::Util::TypeConstraints;

=head1 NAME

Text::Tradition::Collation::Reading - represents a reading (usually a word)
in a collation.

=head1 DESCRIPTION

Text::Tradition is a library for representation and analysis of collated
texts, particularly medieval ones.  A 'reading' refers to a unit of text,
usually a word, that appears in one or more witnesses (manuscripts) of the
tradition; the text of a given witness is composed of a set of readings in
a particular sequence.

=head1 METHODS

=head2 new

Creates a new reading in the given collation with the given attributes.
Options include:

=over 4

=item collation - The Text::Tradition::Collation object to which this
reading belongs.  Required by the convenience methods below, although the
constructor does not enforce it.

=item id - A unique identifier for this reading. Required.

=item text - The word or other text of the reading.

=item is_start - The reading is the starting point for the collation.

=item is_end - The reading is the ending point for the collation.

=item is_lacuna - The 'reading' represents a known gap in the text.

=item is_ph - A temporary placeholder for apparatus parsing purposes.  Do
not use unless you know what you are doing.

=item rank - The sequence number of the reading. This should probably not
be set manually.

=back

One of 'text', 'is_start', 'is_end', 'is_lacuna', or 'is_ph' is required.

=head2 collation

=head2 id

=head2 text

=head2 is_start

=head2 is_end

=head2 is_lacuna

=head2 rank

Accessor methods for the given attributes.

=cut

# The collation that this reading belongs to.  Only a weak reference is
# kept.
has 'collation' => (
	isa => 'Text::Tradition::Collation',
	is => 'ro',
	weak_ref => 1,
	# required => 1,
	);

# Identifier of the reading, validated against the ReadingID type.
has 'id' => (
	isa => 'ReadingID',
	is => 'ro',
	required => 1,
	);

# The text of the reading.  Read through text(), replaced through
# alter_text().
has 'text' => (
	isa => 'Str',
	is => 'ro',
	writer => 'alter_text',
	required => 1,
	);

# Optional language name; has_language() tells whether one was given.
has 'language' => (
	isa => 'Str',
	is => 'ro',
	predicate => 'has_language',
	);

# Read-only marker flags for the special kinds of reading.  They share
# exactly the same options, so they are declared in one go.
has [ qw/ is_start is_end is_lacuna is_ph / ] => (
	isa => 'Bool',
	is => 'ro',
	default => undef,
	);

# Read-write boolean flag.
# NOTE(review): presumably marks a reading common to all witnesses - confirm.
has 'is_common' => (
	isa => 'Bool',
	is => 'rw',
	default => undef,
	);

# Integer rank of the reading; has_rank() and clear_rank() test for it
# and remove it.
has 'rank' => (
	isa => 'Int',
	is => 'rw',
	predicate => 'has_rank',
	clearer => 'clear_rank',
	);
    
## For morphological analysis

# Optional normalized form of the reading; has_normal_form() tells whether
# one has been set.
has 'normal_form' => (
	isa => 'Str',
	is => 'rw',
	predicate => 'has_normal_form',
	);

# The Lexeme objects of the reading.  The list itself has no accessor; it
# is only reachable through the delegated methods named below.
has 'reading_lexemes' => (
	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
	traits => [ 'Array' ],
	default => sub { return [] },
	handles => {
		add_lexeme => 'push',
		clear_lexemes => 'clear',
		has_lexemes => 'count',
		lexemes => 'elements',
		},
	);
	
## For prefix/suffix readings

# Two read-only boolean flags with identical options, declared together.
# NOTE(review): presumably they say whether the reading attaches to the
# preceding / following reading without a space - confirm.
has [ qw/ join_prior join_next / ] => (
	isa => 'Bool',
	is => 'ro',
	default => undef,
	);


# Massage the constructor arguments before Moose builds the object.
#
# Takes either a single hash reference or a flat list of key/value pairs.
# Fills in the text (and, for the start and end markers, the ID) of the
# special readings, and rewrites old-style IDs into valid XML names.  The
# resulting hash reference is passed on to the original BUILDARGS.
around BUILDARGS => sub {
	my $orig = shift;
	my $class = shift;

	# Always work on a private copy, so that a hashref handed in by the
	# caller is not modified behind its back.
	my %copy = @_ == 1 ? %{ $_[0] } : @_;
	my $args = \%copy;

	# If one of our special booleans is set, we change the text and the
	# ID to match.
	if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) {
		$args->{'text'} = '#LACUNA#';
	} elsif( exists $args->{'is_start'} ) {
		$args->{'id'} = '__START__';  # Change the ID to ensure we have only one
		$args->{'text'} = '#START#';
		$args->{'rank'} = 0;
	} elsif( exists $args->{'is_end'} ) {
		$args->{'id'} = '__END__';	# Change the ID to ensure we have only one
		$args->{'text'} = '#END#';
	} elsif( exists $args->{'is_ph'} ) {
		$args->{'text'} = $args->{'id'};
	}

	# Backwards compatibility for non-XMLname IDs.  Only done when an ID
	# was actually supplied: a missing ID is left for the attribute's own
	# checks to report, instead of silently being turned into 'r'.
	if( defined $args->{'id'} ) {
		my $rid = $args->{'id'};
		$rid =~ s/\#/__/g;
		$rid =~ s/[\/,]/./g;
		if( $rid !~ /^$xml10_namestartchar_rx/ ) {
			$rid = 'r'.$rid;
		}
		$args->{'id'} = $rid;
	}

	$class->$orig( $args );
};

# Post-construction hook.  When the constructor arguments contain a
# 'lexemes' entry, its value is passed to _deserialize_lexemes.
sub BUILD {
	my $self = shift;
	my $params = shift;
	$self->_deserialize_lexemes( $params->{'lexemes'} )
		if exists $params->{'lexemes'};
}

=head2 is_meta

A meta attribute (ha ha), which should be true if any of our 'special'
booleans are true.  Implies that the reading does not represent a bit 
of text found in a witness.

=cut

# Returns the value of the first set flag among is_start, is_end and
# is_lacuna, checked in that order; failing those, whatever is_ph holds.
sub is_meta {
	my $self = shift;
	foreach my $flag ( qw/ is_start is_end is_lacuna / ) {
		my $value = $self->$flag;
		return $value if $value;
	}
	return $self->is_ph;
}

=head1 Convenience methods

=head2 related_readings

Calls Collation's related_readings with $self as the first argument.

=cut

# Delegates to the collation's related_readings, with this reading put in
# front of whatever arguments were given.
sub related_readings {
	my( $self, @params ) = @_;
	my $collation = $self->collation;
	return $collation->related_readings( $self, @params );
}

=head2 witnesses 

Calls Collation's reading_witnesses with $self as the first argument.

=cut

# Delegates to the collation's reading_witnesses.  The argument list is
# forwarded as it stands, since it already begins with this reading.
sub witnesses {
	my $collation = $_[0]->collation;
	return $collation->reading_witnesses( @_ );
}

=head2 predecessors

Returns a list of Reading objects that immediately precede $self in the collation.

=cut

# Asks the collation's sequence object for the predecessors of this
# reading's ID, then has the collation turn each returned ID into a reading.
sub predecessors {
	my $self = shift;
	my $collation = $self->collation;
	my @prior_ids = $collation->sequence->predecessors( $self->id );
	return map { $collation->reading( $_ ) } @prior_ids;
}

=head2 successors

Returns a list of Reading objects that immediately follow $self in the collation.

=cut

# Asks the collation's sequence object for the successors of this
# reading's ID, then has the collation turn each returned ID into a reading.
sub successors {
	my $self = shift;
	my $collation = $self->collation;
	my @next_ids = $collation->sequence->successors( $self->id );
	return map { $collation->reading( $_ ) } @next_ids;
}

=head2 set_identical( $other_reading)

Backwards compatibility method, to add a transposition relationship
between $self and $other_reading.  Don't use this.

=cut

# Asks the collation to add a relationship of type 'transposition' between
# this reading and the other one, and returns whatever the collation returns.
sub set_identical {
	my $self = shift;
	my $other = shift;
	my %options = ( 'type' => 'transposition' );
	return $self->collation->add_relationship( $self, $other, \%options );
}

# Handler for the '""' overload: a reading stringifies to its ID.
sub _stringify {
	my( $reading ) = @_;
	return $reading->id;
}

=head1 MORPHOLOGY

Methods for the morphological information (if any) attached to readings.
A reading may be made up of multiple lexemes; the concatenated lexeme
strings ought to match the reading's normalized form.
 
See L<Text::Tradition::Collation::Reading::Lexeme> for more information
on Lexeme objects and their attributes.

=head2 has_lexemes

Returns a true value if the reading has any attached lexemes.

=head2 lexemes

Returns the Lexeme objects (if any) attached to the reading.

=head2 clear_lexemes

Wipes any associated Lexeme objects out of the reading.

=head2 add_lexeme( $lexobj )

Adds the Lexeme in $lexobj to the list of lexemes.

=head2 lemmatize

If the language of the reading is set, this method will use the appropriate
Language model to determine the lexemes that belong to this reading.  See
L<Text::Tradition::lemmatize> if you wish to lemmatize an entire tradition.

=cut

# Look up the lexemes of this reading through the module for its language.
#
# Without a language, warns and returns nothing.  Otherwise loads
# Text::Tradition::Language::<language> and calls that module's
# reading_lookup function with the reading as its only argument, returning
# whatever it returns.  Throws a reading error if the module does not
# provide reading_lookup; a module that cannot be loaded makes load() die.
sub lemmatize {
	my $self = shift;
	unless( $self->has_language ) {
		warn "Please set a language to lemmatize a tradition";
		return;
	}
	my $mod = "Text::Tradition::Language::" . $self->language;
	load( $mod );

	# can() returns undef when the function is missing; say so plainly
	# rather than dying on an undefined code reference.
	my $lookup = $mod->can( 'reading_lookup' );
	unless( $lookup ) {
		throw( "Language module $mod does not provide reading_lookup" );
	}
	return $lookup->( $self );
}

# For graph serialization.  Encodes the list of this reading's lexemes as
# a JSON array, using an encoder with allow_blessed and convert_blessed
# switched on, and returns the resulting string.
sub _serialize_lexemes {
	my $self = shift;
	my @lexemes = $self->lexemes;
	my $encoder = JSON->new;
	$encoder->allow_blessed(1);
	$encoder->convert_blessed(1);
	return $encoder->encode( \@lexemes );
}

# Given a JSON representation of the lexemes (a JSON array with one entry
# per lexeme), instantiate them and replace the reading's lexemes with them.
# An empty array leaves the reading untouched.  Throws a reading error if
# the JSON is not an array or if the Lexeme module cannot be loaded.
sub _deserialize_lexemes {
	my( $self, $json ) = @_;
	my $data = from_json( $json );
	throw( "Lexeme data must be a JSON array" )
		unless ref( $data ) eq 'ARRAY';
	return unless @$data;
	
	# Need to have the lexeme module in order to have lexemes.  This has to
	# be a runtime require: a 'use' statement is executed at compile time,
	# so an eval block around it can never catch a failure to load.
	eval { require Text::Tradition::Collation::Reading::Lexeme; 1 }
		or throw( $@ );
	
	# Good to go - build all the lexemes first, so that the existing ones
	# are only dropped once every new one has been constructed.
	my @lexemes;
	foreach my $lexhash ( @$data ) {
		push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
			'JSON' => $lexhash ) );
	}
	$self->clear_lexemes;
	$self->add_lexeme( @lexemes );
}

## Utility methods

# Stand-in used when the reading is converted for JSON output: simply the
# text of the reading.
sub TO_JSON {
	my( $reading ) = @_;
	return $reading->text;
}

# Raises a Text::Tradition::Error with the ident 'Reading error', using
# the first argument as the message.
sub throw {
	my @report = (
		'ident' => 'Reading error',
		'message' => $_[0],
		);
	Text::Tradition::Error->throw( @report );
}

# Remove the Moose keywords (has, around, ...) from this package again,
# then make the class immutable now that its definition is complete.
no Moose;
__PACKAGE__->meta->make_immutable;

1;
Commit	Line	Data
784877d9	1	package Text::Tradition::Collation::Reading;
784877d9	2
8e1394aa	3	use Moose;
10e4b1ac	4	use Moose::Util::TypeConstraints;
7604424b	5	use JSON qw/ from_json /;
6ad2ce78	6	use Module::Load;
70745e70	7	use Text::Tradition::Error;
10e4b1ac	8	use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx );
7cd9f181	9	use YAML::XS;
e4b0f464	10	use overload '""' => \&_stringify, 'fallback' => 1;
784877d9	11
10e4b1ac	12	subtype 'ReadingID',
	13	as 'Str',
	14	where { $_ =~ /\A$xml10_name_rx\z/ },
	15	message { 'Reading ID must be a valid XML attribute string' };
	16
	17	no Moose::Util::TypeConstraints;
	18
3a2ebbf4	19	=head1 NAME
784877d9	20
4aea6e9b	21	Text::Tradition::Collation::Reading - represents a reading (usually a word)
	22	in a collation.
	23
3a2ebbf4	24	=head1 DESCRIPTION
784877d9	25
3a2ebbf4	26	Text::Tradition is a library for representation and analysis of collated
	27	texts, particularly medieval ones. A 'reading' refers to a unit of text,
	28	usually a word, that appears in one or more witnesses (manuscripts) of the
	29	tradition; the text of a given witness is composed of a set of readings in
	30	a particular sequence
784877d9	31
3a2ebbf4	32	=head1 METHODS
1ca1163d	33
3a2ebbf4	34	=head2 new
8e1394aa	35
4aea6e9b	36	Creates a new reading in the given collation with the given attributes.
3a2ebbf4	37	Options include:
94c00c71	38
3a2ebbf4	39	=over 4
784877d9	40
4aea6e9b	41	=item collation - The Text::Tradition::Collation object to which this
4aea6e9b	42	reading belongs. Required.
e2902068	43
3a2ebbf4	44	=item id - A unique identifier for this reading. Required.
910a0a6d	45
3a2ebbf4	46	=item text - The word or other text of the reading.
784877d9	47
3a2ebbf4	48	=item is_start - The reading is the starting point for the collation.
3265b0ce	49
3a2ebbf4	50	=item is_end - The reading is the ending point for the collation.
784877d9	51
3a2ebbf4	52	=item is_lacuna - The 'reading' represents a known gap in the text.
de51424a	53
4aea6e9b	54	=item is_ph - A temporary placeholder for apparatus parsing purposes. Do
4aea6e9b	55	not use unless you know what you are doing.
12720144	56
4aea6e9b	57	=item rank - The sequence number of the reading. This should probably not
4aea6e9b	58	be set manually.
d047cd52	59
3a2ebbf4	60	=back
8e1394aa	61
3a2ebbf4	62	One of 'text', 'is_start', 'is_end', or 'is_lacuna' is required.
8e1394aa	63
3a2ebbf4	64	=head2 collation
94c00c71	65
3a2ebbf4	66	=head2 id
94c00c71	67
3a2ebbf4	68	=head2 text
4cdd82f1	69
3a2ebbf4	70	=head2 is_start
4cdd82f1	71
3a2ebbf4	72	=head2 is_end
4a8828f0	73
3a2ebbf4	74	=head2 is_lacuna
4a8828f0	75
3a2ebbf4	76	=head2 rank
4a8828f0	77
3a2ebbf4	78	Accessor methods for the given attributes.
d047cd52	79
3a2ebbf4	80	=cut
d047cd52	81
3a2ebbf4	82	has 'collation' => (
	83	is => 'ro',
	84	isa => 'Text::Tradition::Collation',
	85	# required => 1,
	86	weak_ref => 1,
	87	);
d047cd52	88
3a2ebbf4	89	has 'id' => (
3a2ebbf4	90	is => 'ro',
10e4b1ac	91	isa => 'ReadingID',
3a2ebbf4	92	required => 1,
3a2ebbf4	93	);
d047cd52	94
3a2ebbf4	95	has 'text' => (
	96	is => 'ro',
	97	isa => 'Str',
	98	required => 1,
49d4f2ac	99	writer => 'alter_text',
3a2ebbf4	100	);
0e47f4f6	101
fae52efd	102	has 'language' => (
	103	is => 'ro',
	104	isa => 'Str',
6ad2ce78	105	predicate => 'has_language',
fae52efd	106	);
fae52efd	107
3a2ebbf4	108	has 'is_start' => (
	109	is => 'ro',
	110	isa => 'Bool',
	111	default => undef,
	112	);
	113
	114	has 'is_end' => (
	115	is => 'ro',
	116	isa => 'Bool',
	117	default => undef,
	118	);
	119
	120	has 'is_lacuna' => (
	121	is => 'ro',
	122	isa => 'Bool',
	123	default => undef,
	124	);
12720144	125
	126	has 'is_ph' => (
	127	is => 'ro',
	128	isa => 'Bool',
	129	default => undef,
	130	);
d4b75f44	131
	132	has 'is_common' => (
	133	is => 'rw',
	134	isa => 'Bool',
	135	default => undef,
	136	);
3a2ebbf4	137
	138	has 'rank' => (
	139	is => 'rw',
	140	isa => 'Int',
	141	predicate => 'has_rank',
ca6e6095	142	clearer => 'clear_rank',
3a2ebbf4	143	);
fd602649	144
	145	## For morphological analysis
	146
	147	has 'normal_form' => (
	148	is => 'rw',
	149	isa => 'Str',
	150	predicate => 'has_normal_form',
	151	);
	152
7cd9f181	153	# Holds the lexemes for the reading.
d3e7842a	154	has 'reading_lexemes' => (
4d9593df	155	traits => ['Array'],
d3e7842a	156	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
4d9593df	157	handles => {
4d9593df	158	lexemes => 'elements',
cca4f996	159	has_lexemes => 'count',
d3e7842a	160	clear_lexemes => 'clear',
d3e7842a	161	add_lexeme => 'push',
4d9593df	162	},
d3e7842a	163	default => sub { [] },
fd602649	164	);
fd602649	165
629e27b0	166	## For prefix/suffix readings
	167
	168	has 'join_prior' => (
	169	is => 'ro',
	170	isa => 'Bool',
	171	default => undef,
	172	);
	173
	174	has 'join_next' => (
	175	is => 'ro',
	176	isa => 'Bool',
	177	default => undef,
	178	);
	179
3a2ebbf4	180
	181	around BUILDARGS => sub {
	182	my $orig = shift;
	183	my $class = shift;
	184	my $args;
	185	if( @_ == 1 ) {
	186	$args = shift;
	187	} else {
	188	$args = { @_ };
	189	}
b0b4421a	190
3a2ebbf4	191	# If one of our special booleans is set, we change the text and the
3a2ebbf4	192	# ID to match.
1d310495	193	if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) {
56eefa04	194	$args->{'text'} = '#LACUNA#';
3a2ebbf4	195	} elsif( exists $args->{'is_start'} ) {
10e4b1ac	196	$args->{'id'} = '__START__'; # Change the ID to ensure we have only one
3a2ebbf4	197	$args->{'text'} = '#START#';
	198	$args->{'rank'} = 0;
	199	} elsif( exists $args->{'is_end'} ) {
10e4b1ac	200	$args->{'id'} = '__END__'; # Change the ID to ensure we have only one
3a2ebbf4	201	$args->{'text'} = '#END#';
12720144	202	} elsif( exists $args->{'is_ph'} ) {
12720144	203	$args->{'text'} = $args->{'id'};
3a2ebbf4	204	}
3a2ebbf4	205
10e4b1ac	206	# Backwards compatibility for non-XMLname IDs
	207	my $rid = $args->{'id'};
	208	$rid =~ s/\#/__/g;
	209	$rid =~ s/[\/,]/./g;
	210	if( $rid !~ /^$xml10_namestartchar_rx/ ) {
	211	$rid = 'r'.$rid;
	212	}
	213	$args->{'id'} = $rid;
	214
3a2ebbf4	215	$class->$orig( $args );
	216	};
	217
70745e70	218	# Look for a lexeme-string argument in the build args.
	219	sub BUILD {
	220	my( $self, $args ) = @_;
	221	if( exists $args->{'lexemes'} ) {
	222	$self->_deserialize_lexemes( $args->{'lexemes'} );
	223	}
	224	}
	225
3a2ebbf4	226	=head2 is_meta
	227
	228	A meta attribute (ha ha), which should be true if any of our 'special'
	229	booleans are true. Implies that the reading does not represent a bit
	230	of text found in a witness.
	231
	232	=cut
	233
	234	sub is_meta {
	235	my $self = shift;
12720144	236	return $self->is_start \|\| $self->is_end \|\| $self->is_lacuna \|\| $self->is_ph;
3a2ebbf4	237	}
3a2ebbf4	238
027d819c	239	=head1 Convenience methods
	240
	241	=head2 related_readings
	242
	243	Calls Collation's related_readings with $self as the first argument.
	244
	245	=cut
	246
3a2ebbf4	247	sub related_readings {
	248	my $self = shift;
	249	return $self->collation->related_readings( $self, @_ );
	250	}
	251
7f52eac8	252	=head2 witnesses
	253
	254	Calls Collation's reading_witnesses with $self as the first argument.
	255
	256	=cut
	257
	258	sub witnesses {
	259	my $self = shift;
	260	return $self->collation->reading_witnesses( $self, @_ );
	261	}
	262
027d819c	263	=head2 predecessors
	264
	265	Returns a list of Reading objects that immediately precede $self in the collation.
	266
	267	=cut
	268
22222af9	269	sub predecessors {
	270	my $self = shift;
	271	my @pred = $self->collation->sequence->predecessors( $self->id );
	272	return map { $self->collation->reading( $_ ) } @pred;
	273	}
	274
027d819c	275	=head2 successors
	276
	277	Returns a list of Reading objects that immediately follow $self in the collation.
	278
	279	=cut
	280
22222af9	281	sub successors {
	282	my $self = shift;
	283	my @succ = $self->collation->sequence->successors( $self->id );
	284	return map { $self->collation->reading( $_ ) } @succ;
	285	}
	286
027d819c	287	=head2 set_identical( $other_reading)
	288
	289	Backwards compatibility method, to add a transposition relationship
	290	between $self and $other_reading. Don't use this.
	291
	292	=cut
	293
1d310495	294	sub set_identical {
	295	my( $self, $other ) = @_;
	296	return $self->collation->add_relationship( $self, $other,
	297	{ 'type' => 'transposition' } );
	298	}
	299
3a2ebbf4	300	sub _stringify {
	301	my $self = shift;
	302	return $self->id;
	303	}
d047cd52	304
4d9593df	305	=head1 MORPHOLOGY
4d9593df	306
7cd9f181	307	Methods for the morphological information (if any) attached to readings.
	308	A reading may be made up of multiple lexemes; the concatenated lexeme
	309	strings ought to match the reading's normalized form.
	310
	311	See L<Text::Tradition::Collation::Reading::Lexeme> for more information
	312	on Lexeme objects and their attributes.
	313
	314	=head2 has_lexemes
	315
	316	Returns a true value if the reading has any attached lexemes.
4d9593df	317
6ad2ce78	318	=head2 lexemes
06e7cbc7	319
7cd9f181	320	Returns the Lexeme objects (if any) attached to the reading.
6ad2ce78	321
	322	=head2 clear_lexemes
	323
7cd9f181	324	Wipes any associated Lexeme objects out of the reading.
	325
	326	=head2 add_lexeme( $lexobj )
6ad2ce78	327
7cd9f181	328	Adds the Lexeme in $lexobj to the list of lexemes.
	329
	330	=head2 lemmatize
	331
	332	If the language of the reading is set, this method will use the appropriate
	333	Language model to determine the lexemes that belong to this reading. See
	334	L<Text::Tradition::lemmatize> if you wish to lemmatize an entire tradition.
06e7cbc7	335
4d9593df	336	=cut
4d9593df	337
6ad2ce78	338	sub lemmatize {
	339	my $self = shift;
	340	unless( $self->has_language ) {
	341	warn "Please set a language to lemmatize a tradition";
	342	return;
	343	}
	344	my $mod = "Text::Tradition::Language::" . $self->language;
	345	load( $mod );
	346	$mod->can( 'reading_lookup' )->( $self );
	347
	348	}
4d9593df	349
7604424b	350	# For graph serialization. Return a JSON representation of the associated
7cd9f181	351	# reading lexemes.
	352	sub _serialize_lexemes {
	353	my $self = shift;
7604424b	354	my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
7604424b	355	return $json->encode( [ $self->lexemes ] );
7cd9f181	356	}
70745e70	357
7604424b	358	# Given a JSON representation of the lexemes, instantiate them and add
7604424b	359	# them to the reading.
70745e70	360	sub _deserialize_lexemes {
7604424b	361	my( $self, $json ) = @_;
	362	my $data = from_json( $json );
	363	return unless @$data;
70745e70	364
7604424b	365	# Need to have the lexeme module in order to have lexemes.
7604424b	366	eval { use Text::Tradition::Collation::Reading::Lexeme; };
70745e70	367	throw( $@ ) if $@;
	368
	369	# Good to go - add the lexemes.
	370	my @lexemes;
7604424b	371	foreach my $lexhash ( @$data ) {
	372	push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
	373	'JSON' => $lexhash ) );
70745e70	374	}
	375	$self->clear_lexemes;
	376	$self->add_lexeme( @lexemes );
	377	}
7cd9f181	378
4d9593df	379	## Utility methods
4d9593df	380
2acf0892	381	sub TO_JSON {
	382	my $self = shift;
	383	return $self->text;
	384	}
	385
70745e70	386	sub throw {
	387	Text::Tradition::Error->throw(
	388	'ident' => 'Reading error',
	389	'message' => $_[0],
	390	);
	391	}
4d9593df	392
	393	no Moose;
	394	__PACKAGE__->meta->make_immutable;
	395
021bdbac	396	1;