[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / Reading.pm

package Text::Tradition::Collation::Reading;

use Moose;
use Moose::Util::TypeConstraints;
use JSON qw/ from_json /;
use Module::Load;
use Text::Tradition::Error;
use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx );
use YAML::XS;
use overload '""' => \&_stringify, 'fallback' => 1;

# Type constraint for reading identifiers: a string that matches
# $xml10_name_rx (imported from XML::Easy::Syntax) from its first character
# to its last.  BUILDARGS rewrites legacy IDs before this check is applied.
subtype 'ReadingID',
	as 'Str',
	where { $_ =~ /\A$xml10_name_rx\z/ },
	message { 'Reading ID must be a valid XML attribute string' };
	
no Moose::Util::TypeConstraints;

=head1 NAME

Text::Tradition::Collation::Reading - represents a reading (usually a word)
in a collation.

=head1 DESCRIPTION

Text::Tradition is a library for representation and analysis of collated
texts, particularly medieval ones.  A 'reading' refers to a unit of text,
usually a word, that appears in one or more witnesses (manuscripts) of the
tradition; the text of a given witness is composed of a set of readings in
a particular sequence.

=head1 METHODS

=head2 new

Creates a new reading in the given collation with the given attributes.
Options include:

=over 4

=item collation - The Text::Tradition::Collation object to which this
reading belongs.  Required.

=item id - A unique identifier for this reading. Required.

=item text - The word or other text of the reading.

=item is_start - The reading is the starting point for the collation.

=item is_end - The reading is the ending point for the collation.

=item is_lacuna - The 'reading' represents a known gap in the text.

=item is_ph - A temporary placeholder for apparatus parsing purposes.  Do
not use unless you know what you are doing.

=item rank - The sequence number of the reading. This should probably not
be set manually.

=back

One of 'text', 'is_start', 'is_end', or 'is_lacuna' is required.

=head2 collation

=head2 id

=head2 text

=head2 is_start

=head2 is_end

=head2 is_lacuna

=head2 rank

Accessor methods for the given attributes.

=cut

# The collation this reading belongs to.  Stored as a weak reference.
# Note that 'required' is commented out, so a reading can be constructed
# without a collation; the convenience methods that call
# $self->collation will then fail.
has 'collation' => (
	is => 'ro',
	isa => 'Text::Tradition::Collation',
	# required => 1,
	weak_ref => 1,
	);

# Identifier of the reading.  Must pass the ReadingID constraint; the
# BUILDARGS wrapper rewrites legacy IDs before validation.
has 'id' => (
	is => 'ro',
	isa => 'ReadingID',
	required => 1,
	);

# Text of the reading.  The reader is 'text'; the value can only be
# changed through the separately named writer 'alter_text'.
has 'text' => (
	is => 'ro',
	isa => 'Str',
	required => 1,
	writer => 'alter_text',
	);
	
# Optional language name; lemmatize() appends it to
# "Text::Tradition::Language::" to choose the module to load.
has 'language' => (
	is => 'ro',
	isa => 'Str',
	predicate => 'has_language',
	);
	
# Flag for the start reading.  When the key is passed to the constructor,
# BUILDARGS forces the id, text and rank of the reading.
has 'is_start' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);

# Flag for the end reading.  When the key is passed to the constructor,
# BUILDARGS forces the id and text of the reading.
has 'is_end' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);
    
# Flag for a lacuna.  BUILDARGS supplies the text '#LACUNA#' when the key
# is passed without an explicit text.
has 'is_lacuna' => (
    is => 'ro',
    isa => 'Bool',
	default => undef,
    );
    
# Flag for a placeholder reading.  BUILDARGS copies the id into the text
# when the key is passed.
has 'is_ph' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);
	
# Read-write flag; nothing in this file sets it.
# NOTE(review): presumably maintained by the collation to mark readings
# shared by all witnesses - confirm against the caller.
has 'is_common' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);

# Integer rank of the reading.  Unset by default (test with has_rank,
# remove with clear_rank); BUILDARGS sets it to 0 for the start reading.
has 'rank' => (
    is => 'rw',
    isa => 'Int',
    predicate => 'has_rank',
    clearer => 'clear_rank',
    );
    
## For morphological analysis

# Read-write flag, undef by default; not set anywhere in this file.
has 'grammar_invalid' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);
	
# Read-write flag, undef by default; not set anywhere in this file.
has 'is_nonsense' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);

# Optional normalized form of the reading text; has_normal_form reports
# whether one has been set.
has 'normal_form' => (
	is => 'rw',
	isa => 'Str',
	predicate => 'has_normal_form',
	);

# Holds the lexemes for the reading.  No 'is' option is given, so the
# list is only reachable through the delegated methods below:
#   lexeme( $idx )   - the lexeme at the given index
#   lexemes          - all lexemes, as a list
#   has_lexemes      - the number of lexemes (so false when there are none)
#   clear_lexemes    - empty the list
#   add_lexeme( .. ) - append one or more lexemes
has 'reading_lexemes' => (
	traits => ['Array'],
	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
	handles => {
		lexeme => 'get',
		lexemes => 'elements',
		has_lexemes => 'count',
		clear_lexemes => 'clear',
		add_lexeme => 'push',
		},
	default => sub { [] },
	);
	
## For prefix/suffix readings

# Read-only flag, undef by default.
# NOTE(review): presumably means the reading attaches to the preceding
# reading without a separator - confirm against the code that renders text.
has 'join_prior' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);
	
# Read-only flag, undef by default.
# NOTE(review): presumably the counterpart of join_prior for the following
# reading - confirm against the code that renders text.
has 'join_next' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);


# Constructor argument munging.  Accepts either a single hashref or a flat
# list of key/value pairs, fills in the fixed text/ID of the special
# readings, and rewrites legacy IDs so that they pass the ReadingID type.
around BUILDARGS => sub {
	my $orig = shift;
	my $class = shift;

	# Always work on our own shallow copy: the normalization below must not
	# alter a hash that belongs to the caller.
	my $args = @_ == 1 ? { %{ $_[0] } } : { @_ };

	# If one of our special booleans is set, we change the text and the
	# ID to match.  The tests are deliberately on the presence of the key,
	# not on its truth, as they always have been.
	if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) {
		$args->{'text'} = '#LACUNA#';
	} elsif( exists $args->{'is_start'} ) {
		$args->{'id'} = '__START__';  # Change the ID to ensure we have only one
		$args->{'text'} = '#START#';
		$args->{'rank'} = 0;
	} elsif( exists $args->{'is_end'} ) {
		$args->{'id'} = '__END__';	# Change the ID to ensure we have only one
		$args->{'text'} = '#END#';
	} elsif( exists $args->{'is_ph'} ) {
		$args->{'text'} = $args->{'id'};
	}

	# Backwards compatibility for non-XMLname IDs.  Only touch an ID that was
	# actually supplied: normalizing an undefined ID would emit warnings and
	# turn it into the string 'r', which hides the fact that the required
	# 'id' argument is missing.  Left alone, Moose reports it properly.
	if( defined $args->{'id'} ) {
		my $rid = $args->{'id'};
		$rid =~ s/\#/__/g;
		$rid =~ s/[\/,]/./g;
		if( $rid !~ /^$xml10_namestartchar_rx/ ) {
			$rid = 'r'.$rid;
		}
		$args->{'id'} = $rid;
	}

	return $class->$orig( $args );
};

# Post-construction hook: when the constructor arguments carry a 'lexemes'
# entry (a serialized lexeme string), turn it into Lexeme objects.
sub BUILD {
	my $self = shift;
	my $params = shift;
	$self->_deserialize_lexemes( $params->{'lexemes'} )
		if exists $params->{'lexemes'};
}

=head2 is_meta

A meta attribute (ha ha), which should be true if any of our 'special'
booleans are true.  Implies that the reading does not represent a bit 
of text found in a witness.

=cut

sub is_meta {
	my( $reading ) = @_;
	# Hand back the first special flag that is set...
	foreach my $flag ( qw/ is_start is_end is_lacuna / ) {
		my $value = $reading->$flag;
		return $value if $value;
	}
	# ...and otherwise whatever the placeholder flag holds.
	return $reading->is_ph;
}

=head2 is_nonrel

Similar to is_meta, but returns false for the start and end readings.

=cut

sub is_nonrel {
	my( $reading ) = @_;
	# A lacuna wins; otherwise the answer is the placeholder flag.
	my $lacuna = $reading->is_lacuna;
	return $lacuna ? $lacuna : $reading->is_ph;
}

=head1 Convenience methods

=head2 related_readings

Calls Collation's related_readings with $self as the first argument.

=cut

sub related_readings {
	my( $reading, @options ) = @_;
	# Delegate to the collation, putting ourselves in front of the options.
	my $collation = $reading->collation;
	return $collation->related_readings( $reading, @options );
}

=head2 witnesses 

Calls Collation's reading_witnesses with $self as the first argument.

=cut

sub witnesses {
	my( $reading, @options ) = @_;
	# Delegate to the collation, putting ourselves in front of the options.
	my $collation = $reading->collation;
	return $collation->reading_witnesses( $reading, @options );
}

=head2 predecessors

Returns a list of Reading objects that immediately precede $self in the collation.

=cut

sub predecessors {
	my( $reading ) = @_;
	my $collation = $reading->collation;
	# The sequence graph deals in reading IDs; turn them back into objects.
	my @prior_ids = $collation->sequence->predecessors( $reading->id );
	return map { $collation->reading( $_ ) } @prior_ids;
}

=head2 successors

Returns a list of Reading objects that immediately follow $self in the collation.

=cut

sub successors {
	my( $reading ) = @_;
	my $collation = $reading->collation;
	# The sequence graph deals in reading IDs; turn them back into objects.
	my @next_ids = $collation->sequence->successors( $reading->id );
	return map { $collation->reading( $_ ) } @next_ids;
}

=head2 set_identical( $other_reading)

Backwards compatibility method, to add a transposition relationship
between $self and $other_reading.  Don't use this.

=cut

sub set_identical {
	my $self = shift;
	my $other = shift;
	# The legacy notion of 'identical' is recorded as a transposition.
	my %relationship = ( 'type' => 'transposition' );
	return $self->collation->add_relationship( $self, $other, \%relationship );
}

# Handler for the '""' overload: a reading stringifies to its ID.
sub _stringify {
	my( $reading ) = @_;
	return $reading->id;
}

=head1 MORPHOLOGY

Methods for the morphological information (if any) attached to readings.
A reading may be made up of multiple lexemes; the concatenated lexeme
strings ought to match the reading's normalized form.
 
See L<Text::Tradition::Collation::Reading::Lexeme> for more information
on Lexeme objects and their attributes.

=head2 has_lexemes

Returns a true value if the reading has any attached lexemes.

=head2 lexemes

Returns the Lexeme objects (if any) attached to the reading.

=head2 clear_lexemes

Wipes any associated Lexeme objects out of the reading.

=head2 add_lexeme( $lexobj )

Adds the Lexeme in $lexobj to the list of lexemes.

=head2 lemmatize

If the language of the reading is set, this method will use the appropriate
Language model to determine the lexemes that belong to this reading.  See
L<Text::Tradition/lemmatize> if you wish to lemmatize an entire tradition.

=cut

# Look up the lexemes for this reading using the language module named
# after the reading's language.
# Returns nothing (after a warning) when no language is set; otherwise
# returns whatever the language module's reading_lookup() returns.
# Throws a Reading error if the module does not provide reading_lookup();
# a failure to load the module propagates from Module::Load as before.
sub lemmatize {
	my $self = shift;
	unless( $self->has_language ) {
		warn "Please set a language to lemmatize a tradition";
		return;
	}
	my $mod = "Text::Tradition::Language::" . $self->language;
	load( $mod );

	# can() returns undef for a module without the function; calling that
	# directly would die with an unhelpful "undefined value" message.
	my $lookup = $mod->can( 'reading_lookup' );
	unless( $lookup ) {
		throw( "Language module $mod does not provide reading_lookup" );
	}
	return $lookup->( $self );
}

# For graph serialization. Returns the reading's lexemes encoded as a JSON
# array; blessed lexeme objects are converted by the encoder rather than
# rejected.
sub _serialize_lexemes {
	my( $reading ) = @_;
	my @lexemes = $reading->lexemes;
	my $encoder = JSON->new;
	$encoder->allow_blessed( 1 );
	$encoder->convert_blessed( 1 );
	return $encoder->encode( \@lexemes );
}

# Given a JSON representation of the lexemes (an array with one entry per
# lexeme), instantiate them and replace the reading's current lexemes with
# them.  An empty array leaves the reading untouched.  Throws a Reading
# error if the JSON does not decode to an array; a JSON syntax error
# propagates from from_json.
sub _deserialize_lexemes {
	my( $self, $json ) = @_;
	my $data = from_json( $json );
	unless( ref( $data ) eq 'ARRAY' ) {
		throw( "Serialized lexemes must be a JSON array" );
	}
	return unless @$data;

	# Need to have the lexeme module in order to have lexemes.  'use' is
	# carried out when this file is compiled, wherever it appears, so it
	# cannot be trapped with a run-time eval; a missing Lexeme module makes
	# this file fail to compile.  The former eval/throw around it could
	# never fire and has been dropped.
	use Text::Tradition::Collation::Reading::Lexeme;

	# Good to go - add the lexemes.
	my @lexemes = map {
		Text::Tradition::Collation::Reading::Lexeme->new( 'JSON' => $_ )
	} @$data;
	$self->clear_lexemes;
	$self->add_lexeme( @lexemes );
}

## Utility methods

# Hook used by JSON encoders with convert_blessed enabled: a reading is
# represented by its text.
sub TO_JSON {
	my( $reading ) = @_;
	return $reading->text;
}

# Raise a Text::Tradition::Error identified as a 'Reading error', carrying
# the first argument as its message.
sub throw {
	my( $message ) = @_;
	my %details = (
		'ident' => 'Reading error',
		'message' => $message,
	);
	Text::Tradition::Error->throw( %details );
}

# Unimport the Moose keywords now that the class is fully declared, and
# make the class immutable.
no Moose;
__PACKAGE__->meta->make_immutable;

# A module file must end with a true value.
1;
Commit	Line	Data
784877d9	1	package Text::Tradition::Collation::Reading;
784877d9	2
8e1394aa	3	use Moose;
10e4b1ac	4	use Moose::Util::TypeConstraints;
7604424b	5	use JSON qw/ from_json /;
6ad2ce78	6	use Module::Load;
70745e70	7	use Text::Tradition::Error;
10e4b1ac	8	use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx );
7cd9f181	9	use YAML::XS;
e4b0f464	10	use overload '""' => \&_stringify, 'fallback' => 1;
784877d9	11
10e4b1ac	12	subtype 'ReadingID',
	13	as 'Str',
	14	where { $_ =~ /\A$xml10_name_rx\z/ },
	15	message { 'Reading ID must be a valid XML attribute string' };
	16
	17	no Moose::Util::TypeConstraints;
	18
3a2ebbf4	19	=head1 NAME
784877d9	20
4aea6e9b	21	Text::Tradition::Collation::Reading - represents a reading (usually a word)
	22	in a collation.
	23
3a2ebbf4	24	=head1 DESCRIPTION
784877d9	25
3a2ebbf4	26	Text::Tradition is a library for representation and analysis of collated
	27	texts, particularly medieval ones. A 'reading' refers to a unit of text,
	28	usually a word, that appears in one or more witnesses (manuscripts) of the
	29	tradition; the text of a given witness is composed of a set of readings in
	30	a particular sequence
784877d9	31
3a2ebbf4	32	=head1 METHODS
1ca1163d	33
3a2ebbf4	34	=head2 new
8e1394aa	35
4aea6e9b	36	Creates a new reading in the given collation with the given attributes.
3a2ebbf4	37	Options include:
94c00c71	38
3a2ebbf4	39	=over 4
784877d9	40
4aea6e9b	41	=item collation - The Text::Tradition::Collation object to which this
4aea6e9b	42	reading belongs. Required.
e2902068	43
3a2ebbf4	44	=item id - A unique identifier for this reading. Required.
910a0a6d	45
3a2ebbf4	46	=item text - The word or other text of the reading.
784877d9	47
3a2ebbf4	48	=item is_start - The reading is the starting point for the collation.
3265b0ce	49
3a2ebbf4	50	=item is_end - The reading is the ending point for the collation.
784877d9	51
3a2ebbf4	52	=item is_lacuna - The 'reading' represents a known gap in the text.
de51424a	53
4aea6e9b	54	=item is_ph - A temporary placeholder for apparatus parsing purposes. Do
4aea6e9b	55	not use unless you know what you are doing.
12720144	56
4aea6e9b	57	=item rank - The sequence number of the reading. This should probably not
4aea6e9b	58	be set manually.
d047cd52	59
3a2ebbf4	60	=back
8e1394aa	61
3a2ebbf4	62	One of 'text', 'is_start', 'is_end', or 'is_lacuna' is required.
8e1394aa	63
3a2ebbf4	64	=head2 collation
94c00c71	65
3a2ebbf4	66	=head2 id
94c00c71	67
3a2ebbf4	68	=head2 text
4cdd82f1	69
3a2ebbf4	70	=head2 is_start
4cdd82f1	71
3a2ebbf4	72	=head2 is_end
4a8828f0	73
3a2ebbf4	74	=head2 is_lacuna
4a8828f0	75
3a2ebbf4	76	=head2 rank
4a8828f0	77
3a2ebbf4	78	Accessor methods for the given attributes.
d047cd52	79
3a2ebbf4	80	=cut
d047cd52	81
3a2ebbf4	82	has 'collation' => (
	83	is => 'ro',
	84	isa => 'Text::Tradition::Collation',
	85	# required => 1,
	86	weak_ref => 1,
	87	);
d047cd52	88
3a2ebbf4	89	has 'id' => (
3a2ebbf4	90	is => 'ro',
10e4b1ac	91	isa => 'ReadingID',
3a2ebbf4	92	required => 1,
3a2ebbf4	93	);
d047cd52	94
3a2ebbf4	95	has 'text' => (
	96	is => 'ro',
	97	isa => 'Str',
	98	required => 1,
49d4f2ac	99	writer => 'alter_text',
3a2ebbf4	100	);
0e47f4f6	101
fae52efd	102	has 'language' => (
	103	is => 'ro',
	104	isa => 'Str',
6ad2ce78	105	predicate => 'has_language',
fae52efd	106	);
fae52efd	107
3a2ebbf4	108	has 'is_start' => (
	109	is => 'ro',
	110	isa => 'Bool',
	111	default => undef,
	112	);
	113
	114	has 'is_end' => (
	115	is => 'ro',
	116	isa => 'Bool',
	117	default => undef,
	118	);
	119
	120	has 'is_lacuna' => (
	121	is => 'ro',
	122	isa => 'Bool',
	123	default => undef,
	124	);
12720144	125
	126	has 'is_ph' => (
	127	is => 'ro',
	128	isa => 'Bool',
	129	default => undef,
	130	);
d4b75f44	131
	132	has 'is_common' => (
	133	is => 'rw',
	134	isa => 'Bool',
	135	default => undef,
	136	);
3a2ebbf4	137
	138	has 'rank' => (
	139	is => 'rw',
	140	isa => 'Int',
	141	predicate => 'has_rank',
ca6e6095	142	clearer => 'clear_rank',
3a2ebbf4	143	);
fd602649	144
	145	## For morphological analysis
	146
a8928d1d	147	has 'grammar_invalid' => (
	148	is => 'rw',
	149	isa => 'Bool',
	150	default => undef,
	151	);
	152
	153	has 'is_nonsense' => (
	154	is => 'rw',
	155	isa => 'Bool',
0e6e9e7a	156	default => undef,
a8928d1d	157	);
a8928d1d	158
fd602649	159	has 'normal_form' => (
	160	is => 'rw',
	161	isa => 'Str',
	162	predicate => 'has_normal_form',
	163	);
	164
7cd9f181	165	# Holds the lexemes for the reading.
d3e7842a	166	has 'reading_lexemes' => (
4d9593df	167	traits => ['Array'],
d3e7842a	168	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
4d9593df	169	handles => {
da83693e	170	lexeme => 'get',
4d9593df	171	lexemes => 'elements',
cca4f996	172	has_lexemes => 'count',
d3e7842a	173	clear_lexemes => 'clear',
d3e7842a	174	add_lexeme => 'push',
4d9593df	175	},
d3e7842a	176	default => sub { [] },
fd602649	177	);
fd602649	178
629e27b0	179	## For prefix/suffix readings
	180
	181	has 'join_prior' => (
	182	is => 'ro',
	183	isa => 'Bool',
	184	default => undef,
	185	);
	186
	187	has 'join_next' => (
	188	is => 'ro',
	189	isa => 'Bool',
	190	default => undef,
	191	);
	192
3a2ebbf4	193
	194	around BUILDARGS => sub {
	195	my $orig = shift;
	196	my $class = shift;
	197	my $args;
	198	if( @_ == 1 ) {
	199	$args = shift;
	200	} else {
	201	$args = { @_ };
	202	}
b0b4421a	203
3a2ebbf4	204	# If one of our special booleans is set, we change the text and the
3a2ebbf4	205	# ID to match.
1d310495	206	if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) {
56eefa04	207	$args->{'text'} = '#LACUNA#';
3a2ebbf4	208	} elsif( exists $args->{'is_start'} ) {
10e4b1ac	209	$args->{'id'} = '__START__'; # Change the ID to ensure we have only one
3a2ebbf4	210	$args->{'text'} = '#START#';
	211	$args->{'rank'} = 0;
	212	} elsif( exists $args->{'is_end'} ) {
10e4b1ac	213	$args->{'id'} = '__END__'; # Change the ID to ensure we have only one
3a2ebbf4	214	$args->{'text'} = '#END#';
12720144	215	} elsif( exists $args->{'is_ph'} ) {
12720144	216	$args->{'text'} = $args->{'id'};
3a2ebbf4	217	}
3a2ebbf4	218
10e4b1ac	219	# Backwards compatibility for non-XMLname IDs
	220	my $rid = $args->{'id'};
	221	$rid =~ s/\#/__/g;
	222	$rid =~ s/[\/,]/./g;
	223	if( $rid !~ /^$xml10_namestartchar_rx/ ) {
	224	$rid = 'r'.$rid;
	225	}
	226	$args->{'id'} = $rid;
	227
3a2ebbf4	228	$class->$orig( $args );
	229	};
	230
70745e70	231	# Look for a lexeme-string argument in the build args.
	232	sub BUILD {
	233	my( $self, $args ) = @_;
	234	if( exists $args->{'lexemes'} ) {
	235	$self->_deserialize_lexemes( $args->{'lexemes'} );
	236	}
	237	}
	238
3a2ebbf4	239	=head2 is_meta
	240
	241	A meta attribute (ha ha), which should be true if any of our 'special'
	242	booleans are true. Implies that the reading does not represent a bit
	243	of text found in a witness.
	244
	245	=cut
	246
	247	sub is_meta {
	248	my $self = shift;
12720144	249	return $self->is_start \|\| $self->is_end \|\| $self->is_lacuna \|\| $self->is_ph;
3a2ebbf4	250	}
3a2ebbf4	251
56772e8c	252	=head2 is_nonrel
	253
	254	Similar to is_meta, but returns false for the start and end readings.
	255
	256	=cut
	257
	258	sub is_nonrel {
	259	my $self = shift;
	260	return $self->is_lacuna \|\| $self->is_ph;
	261	}
	262
027d819c	263	=head1 Convenience methods
	264
	265	=head2 related_readings
	266
	267	Calls Collation's related_readings with $self as the first argument.
	268
	269	=cut
	270
3a2ebbf4	271	sub related_readings {
	272	my $self = shift;
	273	return $self->collation->related_readings( $self, @_ );
	274	}
	275
7f52eac8	276	=head2 witnesses
	277
	278	Calls Collation's reading_witnesses with $self as the first argument.
	279
	280	=cut
	281
	282	sub witnesses {
	283	my $self = shift;
	284	return $self->collation->reading_witnesses( $self, @_ );
	285	}
	286
027d819c	287	=head2 predecessors
	288
	289	Returns a list of Reading objects that immediately precede $self in the collation.
	290
	291	=cut
	292
22222af9	293	sub predecessors {
	294	my $self = shift;
	295	my @pred = $self->collation->sequence->predecessors( $self->id );
	296	return map { $self->collation->reading( $_ ) } @pred;
	297	}
	298
027d819c	299	=head2 successors
	300
	301	Returns a list of Reading objects that immediately follow $self in the collation.
	302
	303	=cut
	304
22222af9	305	sub successors {
	306	my $self = shift;
	307	my @succ = $self->collation->sequence->successors( $self->id );
	308	return map { $self->collation->reading( $_ ) } @succ;
	309	}
	310
027d819c	311	=head2 set_identical( $other_reading)
	312
	313	Backwards compatibility method, to add a transposition relationship
	314	between $self and $other_reading. Don't use this.
	315
	316	=cut
	317
1d310495	318	sub set_identical {
	319	my( $self, $other ) = @_;
	320	return $self->collation->add_relationship( $self, $other,
	321	{ 'type' => 'transposition' } );
	322	}
	323
3a2ebbf4	324	sub _stringify {
	325	my $self = shift;
	326	return $self->id;
	327	}
d047cd52	328
4d9593df	329	=head1 MORPHOLOGY
4d9593df	330
7cd9f181	331	Methods for the morphological information (if any) attached to readings.
	332	A reading may be made up of multiple lexemes; the concatenated lexeme
	333	strings ought to match the reading's normalized form.
	334
	335	See L<Text::Tradition::Collation::Reading::Lexeme> for more information
	336	on Lexeme objects and their attributes.
	337
	338	=head2 has_lexemes
	339
	340	Returns a true value if the reading has any attached lexemes.
4d9593df	341
6ad2ce78	342	=head2 lexemes
06e7cbc7	343
7cd9f181	344	Returns the Lexeme objects (if any) attached to the reading.
6ad2ce78	345
	346	=head2 clear_lexemes
	347
7cd9f181	348	Wipes any associated Lexeme objects out of the reading.
	349
	350	=head2 add_lexeme( $lexobj )
6ad2ce78	351
7cd9f181	352	Adds the Lexeme in $lexobj to the list of lexemes.
	353
	354	=head2 lemmatize
	355
	356	If the language of the reading is set, this method will use the appropriate
	357	Language model to determine the lexemes that belong to this reading. See
	358	L<Text::Tradition::lemmatize> if you wish to lemmatize an entire tradition.
06e7cbc7	359
4d9593df	360	=cut
4d9593df	361
6ad2ce78	362	sub lemmatize {
	363	my $self = shift;
	364	unless( $self->has_language ) {
	365	warn "Please set a language to lemmatize a tradition";
	366	return;
	367	}
	368	my $mod = "Text::Tradition::Language::" . $self->language;
	369	load( $mod );
	370	$mod->can( 'reading_lookup' )->( $self );
	371
	372	}
4d9593df	373
7604424b	374	# For graph serialization. Return a JSON representation of the associated
7cd9f181	375	# reading lexemes.
	376	sub _serialize_lexemes {
	377	my $self = shift;
7604424b	378	my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
7604424b	379	return $json->encode( [ $self->lexemes ] );
7cd9f181	380	}
70745e70	381
7604424b	382	# Given a JSON representation of the lexemes, instantiate them and add
7604424b	383	# them to the reading.
70745e70	384	sub _deserialize_lexemes {
7604424b	385	my( $self, $json ) = @_;
	386	my $data = from_json( $json );
	387	return unless @$data;
70745e70	388
7604424b	389	# Need to have the lexeme module in order to have lexemes.
7604424b	390	eval { use Text::Tradition::Collation::Reading::Lexeme; };
70745e70	391	throw( $@ ) if $@;
	392
	393	# Good to go - add the lexemes.
	394	my @lexemes;
7604424b	395	foreach my $lexhash ( @$data ) {
	396	push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
	397	'JSON' => $lexhash ) );
70745e70	398	}
	399	$self->clear_lexemes;
	400	$self->add_lexeme( @lexemes );
	401	}
7cd9f181	402
4d9593df	403	## Utility methods
4d9593df	404
2acf0892	405	sub TO_JSON {
	406	my $self = shift;
	407	return $self->text;
	408	}
	409
70745e70	410	sub throw {
	411	Text::Tradition::Error->throw(
	412	'ident' => 'Reading error',
	413	'message' => $_[0],
	414	);
	415	}
4d9593df	416
	417	no Moose;
	418	__PACKAGE__->meta->make_immutable;
	419
021bdbac	420	1;