[scpubgit/stemmatology.git] / lib / Text / Tradition / Graph / Position.pm

package Text::Tradition::Graph::Position;

use strict;
use warnings;

=head1 NAME

Text::Tradition::Graph::Position

=head1 SUMMARY

An object to go with a text graph that keeps track of relative
positions of the nodes on that graph.  This is useful for keeping
track of which readings are variants of each other, which is expensive
to calculate every time from the graph itself.

=head1 METHODS

=over 4

=item B<new>

Takes two arguments: a list of names of common nodes in the graph, and
a list of witness paths.  Calculates position identifiers for each
node based on this.

=cut

# TODO Why not just hand over the graph and calculate the common nodes
# and witness paths here?
sub new {
    my $proto = shift;
    my( $common_nodes, $witness_paths ) = @_;

    my $self = {};

    # We have to calculate the position identifiers for each word,
    # keyed on the common nodes.  This will be 'fun'.  The end result
    # is a hash per witness, whose key is the word node and whose
    # value is its position in the text.  Common nodes are always N,1
    # so have identical positions in each text.

    my $node_pos = {};
    foreach my $wit ( keys %$witness_paths ) {
	# First we walk each path, making a matrix for each witness that
	# corresponds to its eventual position identifier.  Common nodes
	# always start a new row, and are thus always in the first column.

	my $wit_matrix = [];
	my $cn = 0;  # We should hit the common nodes in order.
	my $row = [];
	foreach my $wn ( @{$witness_paths->{$wit}} ) { # $wn is a node name
	    if( $wn eq $common_nodes->[$cn] ) {
		# Set up to look for the next common node, and
		# start a new row of words.
		$cn++;
		push( @$wit_matrix, $row ) if scalar( @$row );
		$row = [];
	    }
	    push( @$row, $wn );
	}
	push( @$wit_matrix, $row );  # Push the last row onto the matrix

	# Now we have a matrix per witness, so that each row in the
	# matrix begins with a common node, and continues with all the
	# variant words that appear in the witness.  We turn this into
	# real positions in row,cell format.  But we need some
	# trickery in order to make sure that each node gets assigned
	# to only one position.

	foreach my $li ( 1..scalar(@$wit_matrix) ) {
	    foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
		my $node = $wit_matrix->[$li-1]->[$di-1];
		my $position = "$li,$di";
		# If we have seen this node before, we need to compare
		# its position with what went before.
		unless( exists $node_pos->{ $node } && 
			_cmp_position( $position, $node_pos->{ $node }) < 1 ) {
		    # The new position ID replaces the old one.
		    $node_pos->{$node} = $position;
		} # otherwise, the old position needs to stay.
	    }
	}
    }

    # Now we have a hash of node positions keyed on node.
    $self->{'node_positions'} = $node_pos;
    # We should also save our witness paths, as long as we have them.
    # Right now each path is a list of nodes; we may want to make it
    # a list of position refs.
    $self->{'witness_paths'} = $witness_paths;

    # We are also going to want to keep track of whether a position has
    # been explicitly emptied, for our lemmatization.
    my $position_state = {};
    map { $position_state->{ $_ } = undef } values %$node_pos;
    $self->{'position_state'} = $position_state;


    bless( $self, $proto );
    return $self;
}

=item B<node_position>

my $pos = $positions->node_position( $node );

Returns the position identifier for a given node in the graph.

=cut

sub node_position {
    my( $self, $node ) = @_;
    $node = _name( $node );

    unless( exists( $self->{'node_positions'}->{ $node } ) ) {
	warn "No node with name $node known to the graph";
	return;
    }

    return $self->{'node_positions'}->{ $node };
}

=item B<nodes_at_position>

my @nodes = $positions->nodes_at_position( $pos );

Returns the names of all the nodes in the graph at a given position.

=cut

sub nodes_at_position {
    my( $self, $pos ) = @_;

    my $positions = $self->_calc_positions();
    unless( exists $positions->{ $pos } ) {
	warn "No position $pos in the graph";
	return;
    }
    return @{ $positions->{ $pos }};
}

=item B<colocated_nodes>

my @nodes = $positions->colocated_nodes( $node );

Returns the names of all the nodes in the graph at the same position
as the node given, apart from that node itself.

=cut

sub colocated_nodes {
    my( $self, $node ) = @_;
    $node = _name( $node );
    my $pos = $self->node_position( $node );
    my @loc_nodes = $self->nodes_at_position( $pos );

    my @cn = grep { $_ !~ /^$node$/ } @loc_nodes;
    return @cn;
}

=item B<all>

my @position_list = $positions->all()

Returns an ordered list of positions in the graph.

=cut

sub all {
    my( $self ) = @_;
    my $pos = $self->_calc_positions;
    return sort by_position keys( %$pos );
}

sub witness_path {
    my( $self, $wit ) = @_;
    return @{$self->{'witness_paths'}->{ $wit }};
}

=back

=head2 Lemmatization functions

For some traditions, each position will have at least one node that is
the 'lemma text', that is, the text that an editor has chosen to stand
as authoritative for the tradition.  The following methods keep
track of what lemma, if any, should stand at each position.

=over

=item B<init_lemmatizer>

$positions->init_lemmatizer( @nodelist )

Sets up the necessary logic for keeping track of lemmas.  It should be
called once, with the initial list of lemmas.

=cut

# TODO We can initialize this without the argument, based on the passed
# list of common nodes.
sub init_lemmatizer {
    my( $self, @nodes ) = @_;
    foreach my $n ( @nodes ) {
	$self->set_state( $self->node_position( $n ), $n );
    }
}

=item B<state>

my $answer = $positions->state( $position_id )

For the given position ID, returns the node (if any) that stands at
the lemma.  If no node should stand as lemma at this position, returns
0; if no decision has been made for this position, returns undef.

=cut

sub state {
    my( $self, $pos ) = @_;
    return $self->{'position_state'}->{ $pos };
}

=item B<set_state>

$positions->set_state( $position_id, $state )

For the given position ID, sets the lemma (if any).  State can be the
name of a node, 0 (for cases when no lemma should stand), or undef
(for cases when no decision has been made).

=cut

sub set_state {
    my( $self, $pos, $state ) = @_;
    $self->{'position_state'}->{ $pos } = $state;
}

=back

=head2 Comparison function

=over

=item B<by_position>

my @nodelist = sort $positions->by_position @nodelist;

For use in the 'sort' function.  Returns a comparison value based on
the position of the given nodes.

=cut

# Compares two nodes according to their positions in the witness 
# index hash.
sub by_position {
    my $self = shift;
    return _cmp_position( $a, $b );
}

# Takes two position strings (X,Y) and sorts them.
sub _cmp_position {
    my( $a, $b ) = @_;
    my @pos_a = split(/,/, $a );
    my @pos_b = split(/,/, $b );

    my $big_cmp = $pos_a[0] <=> $pos_b[0];
    return $big_cmp if $big_cmp;
    # else 
    return $pos_a[1] <=> $pos_b[1];
}


#### HELPER FUNCTIONS ####

# At some point I may find myself using scalar references for the node
# positions, in order to keep them easily in sync.  Just in case, I will
# calculate this every time I need it.
sub _calc_positions {
    my $self = shift;
    return _invert_hash( $self->{'node_positions'} )
}

# Helper for dealing with node refs
sub _name {
    my( $node ) = @_;
    # We work with node names in this library
    if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) {
	$node = $node->name();
    }
    return $node;
}

# Useful helper.  Will be especially useful if I find myself using
# scalar references for the positions after all - it can dereference
# them here.
sub _invert_hash {
    my ( $hash, $plaintext_keys ) = @_;
    my %new_hash;
    foreach my $key ( keys %$hash ) {
        my $val = $hash->{$key};
        my $valkey = $val;
        if( $plaintext_keys 
            && ref( $val ) ) {
            $valkey = $plaintext_keys->{ scalar( $val ) };
            warn( "No plaintext value given for $val" ) unless $valkey;
        }
        if( exists ( $new_hash{$valkey} ) ) {
            push( @{$new_hash{$valkey}}, $key );
        } else {
            $new_hash{$valkey} = [ $key ];
        }
    }
    return \%new_hash;
}

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Graph::Position;
a25d4374	2
	3	use strict;
	4	use warnings;
	5
	6	=head1 NAME
	7
e58153d6	8	Text::Tradition::Graph::Position
a25d4374	9
	10	=head1 SUMMARY
	11
	12	An object to go with a text graph that keeps track of relative
2ceca8c3	13	positions of the nodes on that graph. This is useful for keeping
	14	track of which readings are variants of each other, which is expensive
	15	to calculate every time from the graph itself.
a25d4374	16
	17	=head1 METHODS
	18
	19	=over 4
	20
	21	=item B<new>
	22
	23	Takes two arguments: a list of names of common nodes in the graph, and
	24	a list of witness paths. Calculates position identifiers for each
	25	node based on this.
	26
	27	=cut
	28
2ceca8c3	29	# TODO Why not just hand over the graph and calculate the common nodes
2ceca8c3	30	# and witness paths here?
a25d4374	31	sub new {
	32	my $proto = shift;
	33	my( $common_nodes, $witness_paths ) = @_;
	34
	35	my $self = {};
	36
	37	# We have to calculate the position identifiers for each word,
	38	# keyed on the common nodes. This will be 'fun'. The end result
	39	# is a hash per witness, whose key is the word node and whose
	40	# value is its position in the text. Common nodes are always N,1
	41	# so have identical positions in each text.
	42
	43	my $node_pos = {};
	44	foreach my $wit ( keys %$witness_paths ) {
	45	# First we walk each path, making a matrix for each witness that
	46	# corresponds to its eventual position identifier. Common nodes
	47	# always start a new row, and are thus always in the first column.
	48
	49	my $wit_matrix = [];
	50	my $cn = 0; # We should hit the common nodes in order.
	51	my $row = [];
	52	foreach my $wn ( @{$witness_paths->{$wit}} ) { # $wn is a node name
	53	if( $wn eq $common_nodes->[$cn] ) {
	54	# Set up to look for the next common node, and
	55	# start a new row of words.
	56	$cn++;
	57	push( @$wit_matrix, $row ) if scalar( @$row );
	58	$row = [];
	59	}
	60	push( @$row, $wn );
	61	}
	62	push( @$wit_matrix, $row ); # Push the last row onto the matrix
	63
	64	# Now we have a matrix per witness, so that each row in the
	65	# matrix begins with a common node, and continues with all the
	66	# variant words that appear in the witness. We turn this into
	67	# real positions in row,cell format. But we need some
	68	# trickery in order to make sure that each node gets assigned
	69	# to only one position.
	70
	71	foreach my $li ( 1..scalar(@$wit_matrix) ) {
	72	foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
	73	my $node = $wit_matrix->[$li-1]->[$di-1];
	74	my $position = "$li,$di";
	75	# If we have seen this node before, we need to compare
	76	# its position with what went before.
	77	unless( exists $node_pos->{ $node } &&
	78	_cmp_position( $position, $node_pos->{ $node }) < 1 ) {
	79	# The new position ID replaces the old one.
	80	$node_pos->{$node} = $position;
	81	} # otherwise, the old position needs to stay.
	82	}
	83	}
	84	}
58a3c424	85
a25d4374	86	# Now we have a hash of node positions keyed on node.
a25d4374	87	$self->{'node_positions'} = $node_pos;
58a3c424	88	# We should also save our witness paths, as long as we have them.
	89	# Right now each path is a list of nodes; we may want to make it
	90	# a list of position refs.
a25d4374	91	$self->{'witness_paths'} = $witness_paths;
a25d4374	92
58a3c424	93	# We are also going to want to keep track of whether a position has
	94	# been explicitly emptied, for our lemmatization.
	95	my $position_state = {};
	96	map { $position_state->{ $_ } = undef } values %$node_pos;
	97	$self->{'position_state'} = $position_state;
	98
	99
a25d4374	100	bless( $self, $proto );
	101	return $self;
	102	}
	103
2ceca8c3	104	=item B<node_position>
	105
	106	my $pos = $positions->node_position( $node );
	107
	108	Returns the position identifier for a given node in the graph.
	109
	110	=cut
	111
a25d4374	112	sub node_position {
	113	my( $self, $node ) = @_;
	114	$node = _name( $node );
	115
	116	unless( exists( $self->{'node_positions'}->{ $node } ) ) {
	117	warn "No node with name $node known to the graph";
	118	return;
	119	}
	120
	121	return $self->{'node_positions'}->{ $node };
	122	}
	123
2ceca8c3	124	=item B<nodes_at_position>
	125
	126	my @nodes = $positions->nodes_at_position( $pos );
	127
	128	Returns the names of all the nodes in the graph at a given position.
	129
	130	=cut
	131
a25d4374	132	sub nodes_at_position {
	133	my( $self, $pos ) = @_;
	134
2ceca8c3	135	my $positions = $self->_calc_positions();
a25d4374	136	unless( exists $positions->{ $pos } ) {
	137	warn "No position $pos in the graph";
	138	return;
	139	}
	140	return @{ $positions->{ $pos }};
	141	}
	142
2ceca8c3	143	=item B<colocated_nodes>
	144
	145	my @nodes = $positions->colocated_nodes( $node );
	146
	147	Returns the names of all the nodes in the graph at the same position
	148	as the node given, apart from that node itself.
	149
	150	=cut
	151
a25d4374	152	sub colocated_nodes {
	153	my( $self, $node ) = @_;
	154	$node = _name( $node );
	155	my $pos = $self->node_position( $node );
	156	my @loc_nodes = $self->nodes_at_position( $pos );
	157
	158	my @cn = grep { $_ !~ /^$node$/ } @loc_nodes;
	159	return @cn;
	160	}
	161
2ceca8c3	162	=item B<all>
	163
	164	my @position_list = $positions->all()
	165
	166	Returns an ordered list of positions in the graph.
	167
	168	=cut
	169
a25d4374	170	sub all {
a25d4374	171	my( $self ) = @_;
2ceca8c3	172	my $pos = $self->_calc_positions;
a25d4374	173	return sort by_position keys( %$pos );
	174	}
	175
2ceca8c3	176	sub witness_path {
	177	my( $self, $wit ) = @_;
	178	return @{$self->{'witness_paths'}->{ $wit }};
58a3c424	179	}
58a3c424	180
2ceca8c3	181	=back
	182
	183	=head2 Lemmatization functions
	184
	185	For some traditions, each position will have at least one node that is
	186	the 'lemma text', that is, the text that an editor has chosen to stand
	187	as authoritative for the tradition. The following methods keep
	188	track of what lemma, if any, should stand at each position.
	189
	190	=over
58a3c424	191
2ceca8c3	192	=item B<init_lemmatizer>
	193
	194	$positions->init_lemmatizer( @nodelist )
	195
	196	Sets up the necessary logic for keeping track of lemmas. It should be
	197	called once, with the initial list of lemmas.
	198
	199	=cut
	200
	201	# TODO We can initialize this without the argument, based on the passed
	202	# list of common nodes.
58a3c424	203	sub init_lemmatizer {
	204	my( $self, @nodes ) = @_;
	205	foreach my $n ( @nodes ) {
	206	$self->set_state( $self->node_position( $n ), $n );
	207	}
	208	}
	209
2ceca8c3	210	=item B<state>
a25d4374	211
2ceca8c3	212	my $answer = $positions->state( $position_id )
	213
	214	For the given position ID, returns the node (if any) that stands at
	215	the lemma. If no node should stand as lemma at this position, returns
	216	0; if no decision has been made for this position, returns undef.
	217
	218	=cut
	219
	220	sub state {
	221	my( $self, $pos ) = @_;
	222	return $self->{'position_state'}->{ $pos };
a25d4374	223	}
a25d4374	224
2ceca8c3	225	=item B<set_state>
	226
	227	$positions->set_state( $position_id, $state )
	228
	229	For the given position ID, sets the lemma (if any). State can be the
	230	name of a node, 0 (for cases when no lemma should stand), or undef
	231	(for cases when no decision has been made).
	232
	233	=cut
	234
	235	sub set_state {
	236	my( $self, $pos, $state ) = @_;
	237	$self->{'position_state'}->{ $pos } = $state;
a25d4374	238	}
a25d4374	239
2ceca8c3	240	=back
	241
	242	=head2 Comparison function
	243
	244	=over
	245
	246	=item B<by_position>
	247
	248	my @nodelist = sort $positions->by_position @nodelist;
	249
	250	For use in the 'sort' function. Returns a comparison value based on
	251	the position of the given nodes.
	252
	253	=cut
a25d4374	254
	255	# Compares two nodes according to their positions in the witness
	256	# index hash.
	257	sub by_position {
	258	my $self = shift;
	259	return _cmp_position( $a, $b );
	260	}
	261
	262	# Takes two position strings (X,Y) and sorts them.
	263	sub _cmp_position {
	264	my( $a, $b ) = @_;
	265	my @pos_a = split(/,/, $a );
	266	my @pos_b = split(/,/, $b );
	267
	268	my $big_cmp = $pos_a[0] <=> $pos_b[0];
	269	return $big_cmp if $big_cmp;
	270	# else
	271	return $pos_a[1] <=> $pos_b[1];
	272	}
	273
2ceca8c3	274
	275	#### HELPER FUNCTIONS ####
	276
	277	# At some point I may find myself using scalar references for the node
	278	# positions, in order to keep them easily in sync. Just in case, I will
	279	# calculate this every time I need it.
	280	sub _calc_positions {
	281	my $self = shift;
	282	return _invert_hash( $self->{'node_positions'} )
	283	}
	284
	285	# Helper for dealing with node refs
	286	sub _name {
	287	my( $node ) = @_;
	288	# We work with node names in this library
	289	if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) {
	290	$node = $node->name();
	291	}
	292	return $node;
	293	}
	294
a25d4374	295	# Useful helper. Will be especially useful if I find myself using
	296	# scalar references for the positions after all - it can dereference
	297	# them here.
	298	sub _invert_hash {
	299	my ( $hash, $plaintext_keys ) = @_;
	300	my %new_hash;
	301	foreach my $key ( keys %$hash ) {
	302	my $val = $hash->{$key};
	303	my $valkey = $val;
	304	if( $plaintext_keys
	305	&& ref( $val ) ) {
	306	$valkey = $plaintext_keys->{ scalar( $val ) };
	307	warn( "No plaintext value given for $val" ) unless $valkey;
	308	}
	309	if( exists ( $new_hash{$valkey} ) ) {
	310	push( @{$new_hash{$valkey}}, $key );
	311	} else {
	312	$new_hash{$valkey} = [ $key ];
	313	}
	314	}
	315	return \%new_hash;
	316	}
	317
2ceca8c3	318	=back
	319
	320	=head1 LICENSE
	321
	322	This package is free software and is provided "as is" without express
	323	or implied warranty. You can redistribute it and/or modify it under
	324	the same terms as Perl itself.
	325
	326	=head1 AUTHOR
	327
	328	Tara L Andrews, aurum@cpan.org
	329
	330	=cut
	331
a25d4374	332	1;