[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / Reading.pm

package Text::Tradition::Collation::Reading;

use Moose;
use Moose::Util::TypeConstraints;
use JSON qw/ from_json /;
use Module::Load;
use Text::Tradition::Error;
use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx );
use YAML::XS;
use overload '""' => \&_stringify, 'fallback' => 1;

# Type constraint used for the 'id' attribute: a string that matches
# $xml10_name_rx (imported from XML::Easy::Syntax) from start to end.
# The anchors \A and \z mean that no trailing newline or partial match
# is accepted.  BUILDARGS rewrites older-style IDs so that they satisfy
# this constraint.
subtype 'ReadingID',
	as 'Str',
	where { $_ =~ /\A$xml10_name_rx\z/ },
	message { 'Reading ID must be a valid XML attribute string' };
	
# Remove the type-constraint sugar (subtype, as, where, message) from
# this package now that the declaration is done.
no Moose::Util::TypeConstraints;

=head1 NAME

Text::Tradition::Collation::Reading - represents a reading (usually a word)
in a collation.

=head1 DESCRIPTION

Text::Tradition is a library for representation and analysis of collated
texts, particularly medieval ones.  A 'reading' refers to a unit of text,
usually a word, that appears in one or more witnesses (manuscripts) of the
tradition; the text of a given witness is composed of a set of readings in
a particular sequence.

=head1 METHODS

=head2 new

Creates a new reading in the given collation with the given attributes.
Options include:

=over 4

=item collation - The Text::Tradition::Collation object to which this
reading belongs.  The constructor does not enforce it, but the convenience
methods below need it.

=item id - A unique identifier for this reading. Required.

=item text - The word or other text of the reading.

=item is_start - The reading is the starting point for the collation.

=item is_end - The reading is the ending point for the collation.

=item is_lacuna - The 'reading' represents a known gap in the text.

=item is_ph - A temporary placeholder for apparatus parsing purposes.  Do
not use unless you know what you are doing.

=item rank - The sequence number of the reading. This should probably not
be set manually.

=back

One of 'text', 'is_start', 'is_end', 'is_lacuna', or 'is_ph' is required.

=head2 collation

=head2 id

=head2 text

=head2 is_start

=head2 is_end

=head2 is_lacuna

=head2 rank

Accessor methods for the given attributes.

=cut

# The collation that owns this reading.  Held as a weak reference.  Note
# that 'required' is commented out, so the constructor does not insist on
# it, although the convenience methods further down call through to it.
has 'collation' => (
	is => 'ro',
	isa => 'Text::Tradition::Collation',
	# required => 1,
	weak_ref => 1,
	);

# Identifier of the reading; required, and must satisfy the ReadingID
# subtype.  BUILDARGS may rewrite the value that was passed in.
has 'id' => (
	is => 'ro',
	isa => 'ReadingID',
	required => 1,
	);

# Text of the reading; required.  The 'text' accessor is read-only, but
# the value can be replaced through the separate writer 'alter_text'.
has 'text' => (
	is => 'ro',
	isa => 'Str',
	required => 1,
	writer => 'alter_text',
	);
	
# Optional language name; 'has_language' tells whether it was set.
# lemmatize() appends it to "Text::Tradition::Language::" to find the
# module to load.
has 'language' => (
	is => 'ro',
	isa => 'Str',
	predicate => 'has_language',
	);
	
# The following four flags mark 'special' readings (see is_meta).  All
# default to undef.  BUILDARGS reacts to the mere presence of these keys
# in the constructor arguments by filling in text, ID and rank.
has 'is_start' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);

has 'is_end' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);
    
has 'is_lacuna' => (
    is => 'ro',
    isa => 'Bool',
	default => undef,
    );
    
has 'is_ph' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);
	
# Read-write flag, undef by default.
# NOTE(review): presumably set by the collation when the reading is shared
# by all witnesses - nothing in this file sets or reads it; confirm.
has 'is_common' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);

# Sequence number of the reading.  No default: 'has_rank' reports whether
# one is set and 'clear_rank' removes it.  BUILDARGS sets it to 0 for the
# start reading.
has 'rank' => (
    is => 'rw',
    isa => 'Int',
    predicate => 'has_rank',
    clearer => 'clear_rank',
    );
    
## For morphological analysis

# Read-write flag, undef by default.
has 'grammar_invalid' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);
	
# Read-write flag, undef by default.
has 'is_nonsense' => (
	is => 'rw',
	isa => 'Bool',
	default => undef,
	);

# Optional normalized form of the text; 'has_normal_form' tells whether
# it was set.
has 'normal_form' => (
	is => 'rw',
	isa => 'Str',
	predicate => 'has_normal_form',
	);

# Holds the lexemes for the reading.
# No accessor is generated for the array itself (there is no 'is');
# it is reached only through the delegated methods: lexemes returns the
# elements, has_lexemes returns their count, clear_lexemes empties the
# list and add_lexeme appends to it.
has 'reading_lexemes' => (
	traits => ['Array'],
	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
	handles => {
		lexemes => 'elements',
		has_lexemes => 'count',
		clear_lexemes => 'clear',
		add_lexeme => 'push',
		},
	default => sub { [] },
	);
	
## For prefix/suffix readings

# Read-only flag, undef by default.
# NOTE(review): presumably means the reading attaches to the preceding
# one without a space - not used in this file; confirm.
has 'join_prior' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);
	
# Read-only flag, undef by default.
# NOTE(review): presumably the counterpart of join_prior for the
# following reading - not used in this file; confirm.
has 'join_next' => (
	is => 'ro',
	isa => 'Bool',
	default => undef,
	);


# Normalise the constructor arguments before the object is built.
#
# Accepts a single hashref or a flat key/value list.  Depending on which
# of the special flags is present (the key only has to exist, its value
# is not inspected), the text, ID and rank are filled in.  A defined ID
# is then rewritten so that older-style IDs satisfy the ReadingID
# constraint.  The caller's own hashref is never modified.
around BUILDARGS => sub {
	my $orig = shift;
	my $class = shift;
	my $args;
	if( @_ == 1 ) {
		# Anything other than a plain hashref is handed straight to the
		# wrapped BUILDARGS, which is responsible for rejecting it.
		return $class->$orig( @_ ) unless ref( $_[0] ) eq 'HASH';
		# Shallow copy, so that the rewriting below does not leak back
		# into the hash the caller passed in.
		$args = { %{ $_[0] } };
	} else {
		$args = { @_ };
	}
			
	# If one of our special booleans is set, we change the text and the
	# ID to match.
	if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) {
		$args->{'text'} = '#LACUNA#';
	} elsif( exists $args->{'is_start'} ) {
		$args->{'id'} = '__START__';  # Change the ID to ensure we have only one
		$args->{'text'} = '#START#';
		$args->{'rank'} = 0;
	} elsif( exists $args->{'is_end'} ) {
		$args->{'id'} = '__END__';	# Change the ID to ensure we have only one
		$args->{'text'} = '#END#';
	} elsif( exists $args->{'is_ph'} ) {
		$args->{'text'} = $args->{'id'};
	}
	
	# Backwards compatibility for non-XMLname IDs.
	# Only a defined ID is rewritten.  Rewriting an undefined one would
	# produce the valid ID 'r' (plus "uninitialized" warnings) and so
	# hide the fact that the required 'id' argument was never supplied.
	if( defined $args->{'id'} ) {
		my $rid = $args->{'id'};
		$rid =~ s/\#/__/g;
		$rid =~ s/[\/,]/./g;
		if( $rid !~ /^$xml10_namestartchar_rx/ ) {
			$rid = 'r'.$rid;
		}
		$args->{'id'} = $rid;
	}
	
	$class->$orig( $args );
};

# Look for a lexeme-string argument in the build args.
# Post-construction hook: when the constructor arguments contain a
# 'lexemes' key, its value (a JSON string) is handed to
# _deserialize_lexemes so that the reading starts out with its lexemes.
sub BUILD {
	my $self = shift;
	my $params = shift;
	# Nothing to do unless a lexeme string was supplied.
	return unless exists $params->{'lexemes'};
	$self->_deserialize_lexemes( $params->{'lexemes'} );
}

=head2 is_meta

A meta attribute (ha ha), which should be true if any of our 'special'
booleans are true.  Implies that the reading does not represent a bit 
of text found in a witness.

=cut

# Returns the value of the first 'special' flag that is true, checking
# is_start, is_end, is_lacuna and is_ph in that order; when none is true
# the (false) value of is_ph is returned.
sub is_meta {
	my( $self ) = @_;
	my $flag;
	foreach my $attr ( qw/ is_start is_end is_lacuna is_ph / ) {
		$flag = $self->$attr;
		last if $flag;
	}
	return $flag;
}

=head1 Convenience methods

=head2 related_readings

Calls Collation's related_readings with $self as the first argument.

=cut

# Delegates to the collation's related_readings, passing this reading
# followed by any further arguments, and returns whatever it returns.
sub related_readings {
	my $self = shift;
	my $collation = $self->collation;
	return $collation->related_readings( $self, @_ );
}

=head2 witnesses 

Calls Collation's reading_witnesses with $self as the first argument.

=cut

# Delegates to the collation's reading_witnesses, passing this reading
# followed by any further arguments, and returns whatever it returns.
sub witnesses {
	my $self = shift;
	my $collation = $self->collation;
	return $collation->reading_witnesses( $self, @_ );
}

=head2 predecessors

Returns a list of Reading objects that immediately precede $self in the collation.

=cut

# Asks the collation's sequence object for the predecessors of this
# reading's ID and looks each returned ID up in the collation.
sub predecessors {
	my( $self ) = @_;
	my $collation = $self->collation;
	my @found;
	foreach my $rid ( $collation->sequence->predecessors( $self->id ) ) {
		push( @found, $collation->reading( $rid ) );
	}
	return @found;
}

=head2 successors

Returns a list of Reading objects that immediately follow $self in the collation.

=cut

# Asks the collation's sequence object for the successors of this
# reading's ID and looks each returned ID up in the collation.
sub successors {
	my( $self ) = @_;
	my $collation = $self->collation;
	my @found;
	foreach my $rid ( $collation->sequence->successors( $self->id ) ) {
		push( @found, $collation->reading( $rid ) );
	}
	return @found;
}

=head2 set_identical( $other_reading )

Backwards compatibility method, to add a transposition relationship
between $self and $other_reading.  Don't use this.

=cut

# Adds a relationship of type 'transposition' between this reading and
# $other through the collation, and returns the collation's result.
sub set_identical {
	my $self = shift;
	my $other = shift;
	my %relation = ( 'type' => 'transposition' );
	return $self->collation->add_relationship( $self, $other, \%relation );
}

# Handler for the overloaded "" operator: a reading used as a string
# yields its ID.
sub _stringify {
	my( $self ) = @_;
	return $self->id;
}

=head1 MORPHOLOGY

Methods for the morphological information (if any) attached to readings.
A reading may be made up of multiple lexemes; the concatenated lexeme
strings ought to match the reading's normalized form.
 
See L<Text::Tradition::Collation::Reading::Lexeme> for more information
on Lexeme objects and their attributes.

=head2 has_lexemes

Returns a true value if the reading has any attached lexemes.

=head2 lexemes

Returns the Lexeme objects (if any) attached to the reading.

=head2 clear_lexemes

Wipes any associated Lexeme objects out of the reading.

=head2 add_lexeme( $lexobj )

Adds the Lexeme in $lexobj to the list of lexemes.

=head2 lemmatize

If the language of the reading is set, this method will use the appropriate
Language model to determine the lexemes that belong to this reading.  See
L<Text::Tradition/lemmatize> if you wish to lemmatize an entire tradition.

=cut

# Loads the module Text::Tradition::Language::<language> and calls its
# reading_lookup function with this reading as the only argument,
# returning whatever that function returns.  Without a language, it only
# emits a warning and returns nothing.
sub lemmatize {
	my( $reading ) = @_;
	if( !$reading->has_language ) {
		warn "Please set a language to lemmatize a tradition";
		return;
	}
	# Each language has its own module below Text::Tradition::Language.
	my $langmod = join( '::', 'Text::Tradition::Language', $reading->language );
	load( $langmod );
	my $lookup = $langmod->can( 'reading_lookup' );
	return $lookup->( $reading );
}

# For graph serialization. Return a JSON representation of the associated
# reading lexemes.
# Encodes the reading's lexemes as a JSON array string.  The encoder has
# allow_blessed and convert_blessed switched on, so blessed lexeme
# objects are accepted and converted by the encoder rather than refused.
sub _serialize_lexemes {
	my( $self ) = @_;
	my @lexlist = $self->lexemes;
	my $encoder = JSON->new;
	$encoder->allow_blessed( 1 );
	$encoder->convert_blessed( 1 );
	return $encoder->encode( \@lexlist );
}

# Given a JSON representation of the lexemes, instantiate them and add
# them to the reading.
# Given a JSON array string, builds one Lexeme object per array element
# (each element is passed to the Lexeme constructor as its 'JSON'
# argument) and replaces the reading's current lexemes with them.
# An empty array leaves the existing lexemes untouched.  Throws a
# 'Reading error' if the Lexeme class cannot be loaded.
sub _deserialize_lexemes {
	my( $self, $json ) = @_;
	my $data = from_json( $json );
	return unless @$data;
	
	# Need to have the lexeme module in order to have lexemes.
	# This must be a runtime 'require': a 'use' statement is executed at
	# compile time, so wrapping it in eval {} can never trap a load
	# failure and the error handling would be dead code.
	unless( eval { require Text::Tradition::Collation::Reading::Lexeme; 1 } ) {
		throw( $@ || 'Could not load Text::Tradition::Collation::Reading::Lexeme' );
	}
	
	# Good to go - add the lexemes.
	my @lexemes;
	foreach my $lexhash ( @$data ) {
		push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
			'JSON' => $lexhash ) );
	}
	# Replace, rather than extend, whatever lexemes were there before.
	$self->clear_lexemes;
	$self->add_lexeme( @lexemes );
}

## Utility methods

# Representation used when an encoder asks the object for a JSON value:
# a reading is represented by its text alone.
sub TO_JSON {
	my( $self ) = @_;
	return $self->text;
}

# Raises a Text::Tradition::Error with the ident 'Reading error' and the
# first argument as its message.
sub throw {
	my $message = $_[0];
	my @details = (
		'ident' => 'Reading error',
		'message' => $message,
	);
	Text::Tradition::Error->throw( @details );
}

# Remove the Moose sugar (has, around, ...) from the package namespace and
# make the class immutable now that all declarations are complete.
no Moose;
__PACKAGE__->meta->make_immutable;

# A module must end with a true value.
1;
Commit	Line	Data
784877d9	1	package Text::Tradition::Collation::Reading;
784877d9	2
8e1394aa	3	use Moose;
10e4b1ac	4	use Moose::Util::TypeConstraints;
7604424b	5	use JSON qw/ from_json /;
6ad2ce78	6	use Module::Load;
70745e70	7	use Text::Tradition::Error;
10e4b1ac	8	use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx );
7cd9f181	9	use YAML::XS;
e4b0f464	10	use overload '""' => \&_stringify, 'fallback' => 1;
784877d9	11
10e4b1ac	12	subtype 'ReadingID',
	13	as 'Str',
	14	where { $_ =~ /\A$xml10_name_rx\z/ },
	15	message { 'Reading ID must be a valid XML attribute string' };
	16
	17	no Moose::Util::TypeConstraints;
	18
3a2ebbf4	19	=head1 NAME
784877d9	20
4aea6e9b	21	Text::Tradition::Collation::Reading - represents a reading (usually a word)
	22	in a collation.
	23
3a2ebbf4	24	=head1 DESCRIPTION
784877d9	25
3a2ebbf4	26	Text::Tradition is a library for representation and analysis of collated
	27	texts, particularly medieval ones. A 'reading' refers to a unit of text,
	28	usually a word, that appears in one or more witnesses (manuscripts) of the
	29	tradition; the text of a given witness is composed of a set of readings in
	30	a particular sequence
784877d9	31
3a2ebbf4	32	=head1 METHODS
1ca1163d	33
3a2ebbf4	34	=head2 new
8e1394aa	35
4aea6e9b	36	Creates a new reading in the given collation with the given attributes.
3a2ebbf4	37	Options include:
94c00c71	38
3a2ebbf4	39	=over 4
784877d9	40
4aea6e9b	41	=item collation - The Text::Tradition::Collation object to which this
4aea6e9b	42	reading belongs. Required.
e2902068	43
3a2ebbf4	44	=item id - A unique identifier for this reading. Required.
910a0a6d	45
3a2ebbf4	46	=item text - The word or other text of the reading.
784877d9	47
3a2ebbf4	48	=item is_start - The reading is the starting point for the collation.
3265b0ce	49
3a2ebbf4	50	=item is_end - The reading is the ending point for the collation.
784877d9	51
3a2ebbf4	52	=item is_lacuna - The 'reading' represents a known gap in the text.
de51424a	53
4aea6e9b	54	=item is_ph - A temporary placeholder for apparatus parsing purposes. Do
4aea6e9b	55	not use unless you know what you are doing.
12720144	56
4aea6e9b	57	=item rank - The sequence number of the reading. This should probably not
4aea6e9b	58	be set manually.
d047cd52	59
3a2ebbf4	60	=back
8e1394aa	61
3a2ebbf4	62	One of 'text', 'is_start', 'is_end', or 'is_lacuna' is required.
8e1394aa	63
3a2ebbf4	64	=head2 collation
94c00c71	65
3a2ebbf4	66	=head2 id
94c00c71	67
3a2ebbf4	68	=head2 text
4cdd82f1	69
3a2ebbf4	70	=head2 is_start
4cdd82f1	71
3a2ebbf4	72	=head2 is_end
4a8828f0	73
3a2ebbf4	74	=head2 is_lacuna
4a8828f0	75
3a2ebbf4	76	=head2 rank
4a8828f0	77
3a2ebbf4	78	Accessor methods for the given attributes.
d047cd52	79
3a2ebbf4	80	=cut
d047cd52	81
3a2ebbf4	82	has 'collation' => (
	83	is => 'ro',
	84	isa => 'Text::Tradition::Collation',
	85	# required => 1,
	86	weak_ref => 1,
	87	);
d047cd52	88
3a2ebbf4	89	has 'id' => (
3a2ebbf4	90	is => 'ro',
10e4b1ac	91	isa => 'ReadingID',
3a2ebbf4	92	required => 1,
3a2ebbf4	93	);
d047cd52	94
3a2ebbf4	95	has 'text' => (
	96	is => 'ro',
	97	isa => 'Str',
	98	required => 1,
49d4f2ac	99	writer => 'alter_text',
3a2ebbf4	100	);
0e47f4f6	101
fae52efd	102	has 'language' => (
	103	is => 'ro',
	104	isa => 'Str',
6ad2ce78	105	predicate => 'has_language',
fae52efd	106	);
fae52efd	107
3a2ebbf4	108	has 'is_start' => (
	109	is => 'ro',
	110	isa => 'Bool',
	111	default => undef,
	112	);
	113
	114	has 'is_end' => (
	115	is => 'ro',
	116	isa => 'Bool',
	117	default => undef,
	118	);
	119
	120	has 'is_lacuna' => (
	121	is => 'ro',
	122	isa => 'Bool',
	123	default => undef,
	124	);
12720144	125
	126	has 'is_ph' => (
	127	is => 'ro',
	128	isa => 'Bool',
	129	default => undef,
	130	);
d4b75f44	131
	132	has 'is_common' => (
	133	is => 'rw',
	134	isa => 'Bool',
	135	default => undef,
	136	);
3a2ebbf4	137
	138	has 'rank' => (
	139	is => 'rw',
	140	isa => 'Int',
	141	predicate => 'has_rank',
ca6e6095	142	clearer => 'clear_rank',
3a2ebbf4	143	);
fd602649	144
	145	## For morphological analysis
	146
a8928d1d	147	has 'grammar_invalid' => (
	148	is => 'rw',
	149	isa => 'Bool',
	150	default => undef,
	151	);
	152
	153	has 'is_nonsense' => (
	154	is => 'rw',
	155	isa => 'Bool',
0e6e9e7a	156	default => undef,
a8928d1d	157	);
a8928d1d	158
fd602649	159	has 'normal_form' => (
	160	is => 'rw',
	161	isa => 'Str',
	162	predicate => 'has_normal_form',
	163	);
	164
7cd9f181	165	# Holds the lexemes for the reading.
d3e7842a	166	has 'reading_lexemes' => (
4d9593df	167	traits => ['Array'],
d3e7842a	168	isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
4d9593df	169	handles => {
4d9593df	170	lexemes => 'elements',
cca4f996	171	has_lexemes => 'count',
d3e7842a	172	clear_lexemes => 'clear',
d3e7842a	173	add_lexeme => 'push',
4d9593df	174	},
d3e7842a	175	default => sub { [] },
fd602649	176	);
fd602649	177
629e27b0	178	## For prefix/suffix readings
	179
	180	has 'join_prior' => (
	181	is => 'ro',
	182	isa => 'Bool',
	183	default => undef,
	184	);
	185
	186	has 'join_next' => (
	187	is => 'ro',
	188	isa => 'Bool',
	189	default => undef,
	190	);
	191
3a2ebbf4	192
	193	around BUILDARGS => sub {
	194	my $orig = shift;
	195	my $class = shift;
	196	my $args;
	197	if( @_ == 1 ) {
	198	$args = shift;
	199	} else {
	200	$args = { @_ };
	201	}
b0b4421a	202
3a2ebbf4	203	# If one of our special booleans is set, we change the text and the
3a2ebbf4	204	# ID to match.
1d310495	205	if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) {
56eefa04	206	$args->{'text'} = '#LACUNA#';
3a2ebbf4	207	} elsif( exists $args->{'is_start'} ) {
10e4b1ac	208	$args->{'id'} = '__START__'; # Change the ID to ensure we have only one
3a2ebbf4	209	$args->{'text'} = '#START#';
	210	$args->{'rank'} = 0;
	211	} elsif( exists $args->{'is_end'} ) {
10e4b1ac	212	$args->{'id'} = '__END__'; # Change the ID to ensure we have only one
3a2ebbf4	213	$args->{'text'} = '#END#';
12720144	214	} elsif( exists $args->{'is_ph'} ) {
12720144	215	$args->{'text'} = $args->{'id'};
3a2ebbf4	216	}
3a2ebbf4	217
10e4b1ac	218	# Backwards compatibility for non-XMLname IDs
	219	my $rid = $args->{'id'};
	220	$rid =~ s/\#/__/g;
	221	$rid =~ s/[\/,]/./g;
	222	if( $rid !~ /^$xml10_namestartchar_rx/ ) {
	223	$rid = 'r'.$rid;
	224	}
	225	$args->{'id'} = $rid;
	226
3a2ebbf4	227	$class->$orig( $args );
	228	};
	229
70745e70	230	# Look for a lexeme-string argument in the build args.
	231	sub BUILD {
	232	my( $self, $args ) = @_;
	233	if( exists $args->{'lexemes'} ) {
	234	$self->_deserialize_lexemes( $args->{'lexemes'} );
	235	}
	236	}
	237
3a2ebbf4	238	=head2 is_meta
	239
	240	A meta attribute (ha ha), which should be true if any of our 'special'
	241	booleans are true. Implies that the reading does not represent a bit
	242	of text found in a witness.
	243
	244	=cut
	245
	246	sub is_meta {
	247	my $self = shift;
12720144	248	return $self->is_start \|\| $self->is_end \|\| $self->is_lacuna \|\| $self->is_ph;
3a2ebbf4	249	}
3a2ebbf4	250
027d819c	251	=head1 Convenience methods
	252
	253	=head2 related_readings
	254
	255	Calls Collation's related_readings with $self as the first argument.
	256
	257	=cut
	258
3a2ebbf4	259	sub related_readings {
	260	my $self = shift;
	261	return $self->collation->related_readings( $self, @_ );
	262	}
	263
7f52eac8	264	=head2 witnesses
	265
	266	Calls Collation's reading_witnesses with $self as the first argument.
	267
	268	=cut
	269
	270	sub witnesses {
	271	my $self = shift;
	272	return $self->collation->reading_witnesses( $self, @_ );
	273	}
	274
027d819c	275	=head2 predecessors
	276
	277	Returns a list of Reading objects that immediately precede $self in the collation.
	278
	279	=cut
	280
22222af9	281	sub predecessors {
	282	my $self = shift;
	283	my @pred = $self->collation->sequence->predecessors( $self->id );
	284	return map { $self->collation->reading( $_ ) } @pred;
	285	}
	286
027d819c	287	=head2 successors
	288
	289	Returns a list of Reading objects that immediately follow $self in the collation.
	290
	291	=cut
	292
22222af9	293	sub successors {
	294	my $self = shift;
	295	my @succ = $self->collation->sequence->successors( $self->id );
	296	return map { $self->collation->reading( $_ ) } @succ;
	297	}
	298
027d819c	299	=head2 set_identical( $other_reading)
	300
	301	Backwards compatibility method, to add a transposition relationship
	302	between $self and $other_reading. Don't use this.
	303
	304	=cut
	305
1d310495	306	sub set_identical {
	307	my( $self, $other ) = @_;
	308	return $self->collation->add_relationship( $self, $other,
	309	{ 'type' => 'transposition' } );
	310	}
	311
3a2ebbf4	312	sub _stringify {
	313	my $self = shift;
	314	return $self->id;
	315	}
d047cd52	316
4d9593df	317	=head1 MORPHOLOGY
4d9593df	318
7cd9f181	319	Methods for the morphological information (if any) attached to readings.
	320	A reading may be made up of multiple lexemes; the concatenated lexeme
	321	strings ought to match the reading's normalized form.
	322
	323	See L<Text::Tradition::Collation::Reading::Lexeme> for more information
	324	on Lexeme objects and their attributes.
	325
	326	=head2 has_lexemes
	327
	328	Returns a true value if the reading has any attached lexemes.
4d9593df	329
6ad2ce78	330	=head2 lexemes
06e7cbc7	331
7cd9f181	332	Returns the Lexeme objects (if any) attached to the reading.
6ad2ce78	333
	334	=head2 clear_lexemes
	335
7cd9f181	336	Wipes any associated Lexeme objects out of the reading.
	337
	338	=head2 add_lexeme( $lexobj )
6ad2ce78	339
7cd9f181	340	Adds the Lexeme in $lexobj to the list of lexemes.
	341
	342	=head2 lemmatize
	343
	344	If the language of the reading is set, this method will use the appropriate
	345	Language model to determine the lexemes that belong to this reading. See
	346	L<Text::Tradition::lemmatize> if you wish to lemmatize an entire tradition.
06e7cbc7	347
4d9593df	348	=cut
4d9593df	349
6ad2ce78	350	sub lemmatize {
	351	my $self = shift;
	352	unless( $self->has_language ) {
	353	warn "Please set a language to lemmatize a tradition";
	354	return;
	355	}
	356	my $mod = "Text::Tradition::Language::" . $self->language;
	357	load( $mod );
	358	$mod->can( 'reading_lookup' )->( $self );
	359
	360	}
4d9593df	361
7604424b	362	# For graph serialization. Return a JSON representation of the associated
7cd9f181	363	# reading lexemes.
	364	sub _serialize_lexemes {
	365	my $self = shift;
7604424b	366	my $json = JSON->new->allow_blessed(1)->convert_blessed(1);
7604424b	367	return $json->encode( [ $self->lexemes ] );
7cd9f181	368	}
70745e70	369
7604424b	370	# Given a JSON representation of the lexemes, instantiate them and add
7604424b	371	# them to the reading.
70745e70	372	sub _deserialize_lexemes {
7604424b	373	my( $self, $json ) = @_;
	374	my $data = from_json( $json );
	375	return unless @$data;
70745e70	376
7604424b	377	# Need to have the lexeme module in order to have lexemes.
7604424b	378	eval { use Text::Tradition::Collation::Reading::Lexeme; };
70745e70	379	throw( $@ ) if $@;
	380
	381	# Good to go - add the lexemes.
	382	my @lexemes;
7604424b	383	foreach my $lexhash ( @$data ) {
	384	push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new(
	385	'JSON' => $lexhash ) );
70745e70	386	}
	387	$self->clear_lexemes;
	388	$self->add_lexeme( @lexemes );
	389	}
7cd9f181	390
4d9593df	391	## Utility methods
4d9593df	392
2acf0892	393	sub TO_JSON {
	394	my $self = shift;
	395	return $self->text;
	396	}
	397
70745e70	398	sub throw {
	399	Text::Tradition::Error->throw(
	400	'ident' => 'Reading error',
	401	'message' => $_[0],
	402	);
	403	}
4d9593df	404
	405	no Moose;
	406	__PACKAGE__->meta->make_immutable;
	407
021bdbac	408	1;