[scpubgit/stemmatology.git] / lib / Text / Tradition / Graph / Position.pm

package Text::Tradition::Graph::Position;

use strict;
use warnings;

=head1 NAME

Text::Tradition::Graph::Position

=head1 SUMMARY

An object to go with a text graph that keeps track of relative
positions of the nodes.

=head1 METHODS

=over 4

=item B<new>

Takes two arguments: a list of names of common nodes in the graph, and
a list of witness paths.  Calculates position identifiers for each
node based on this.

=cut

sub new {
    my $proto = shift;
    my( $common_nodes, $witness_paths ) = @_;

    my $self = {};

    # We have to calculate the position identifiers for each word,
    # keyed on the common nodes.  This will be 'fun'.  The end result
    # is a hash per witness, whose key is the word node and whose
    # value is its position in the text.  Common nodes are always N,1
    # so have identical positions in each text.

    my $node_pos = {};
    foreach my $wit ( keys %$witness_paths ) {
	# First we walk each path, making a matrix for each witness that
	# corresponds to its eventual position identifier.  Common nodes
	# always start a new row, and are thus always in the first column.

	my $wit_matrix = [];
	my $cn = 0;  # We should hit the common nodes in order.
	my $row = [];
	foreach my $wn ( @{$witness_paths->{$wit}} ) { # $wn is a node name
	    if( $wn eq $common_nodes->[$cn] ) {
		# Set up to look for the next common node, and
		# start a new row of words.
		$cn++;
		push( @$wit_matrix, $row ) if scalar( @$row );
		$row = [];
	    }
	    push( @$row, $wn );
	}
	push( @$wit_matrix, $row );  # Push the last row onto the matrix

	# Now we have a matrix per witness, so that each row in the
	# matrix begins with a common node, and continues with all the
	# variant words that appear in the witness.  We turn this into
	# real positions in row,cell format.  But we need some
	# trickery in order to make sure that each node gets assigned
	# to only one position.

	foreach my $li ( 1..scalar(@$wit_matrix) ) {
	    foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
		my $node = $wit_matrix->[$li-1]->[$di-1];
		my $position = "$li,$di";
		# If we have seen this node before, we need to compare
		# its position with what went before.
		unless( exists $node_pos->{ $node } && 
			_cmp_position( $position, $node_pos->{ $node }) < 1 ) {
		    # The new position ID replaces the old one.
		    $node_pos->{$node} = $position;
		} # otherwise, the old position needs to stay.
	    }
	}
    }

    # Now we have a hash of node positions keyed on node.
    $self->{'node_positions'} = $node_pos;
    # We should also save our witness paths, as long as we have them.
    # Right now each path is a list of nodes; we may want to make it
    # a list of position refs.
    $self->{'witness_paths'} = $witness_paths;

    # We are also going to want to keep track of whether a position has
    # been explicitly emptied, for our lemmatization.
    my $position_state = {};
    map { $position_state->{ $_ } = undef } values %$node_pos;
    $self->{'position_state'} = $position_state;


    bless( $self, $proto );
    return $self;
}

sub node_position {
    my( $self, $node ) = @_;
    $node = _name( $node );

    unless( exists( $self->{'node_positions'}->{ $node } ) ) {
	warn "No node with name $node known to the graph";
	return;
    }

    return $self->{'node_positions'}->{ $node };
}

sub nodes_at_position {
    my( $self, $pos ) = @_;

    my $positions = $self->calc_positions();
    unless( exists $positions->{ $pos } ) {
	warn "No position $pos in the graph";
	return;
    }
    return @{ $positions->{ $pos }};
}

sub colocated_nodes {
    my( $self, $node ) = @_;
    $node = _name( $node );
    my $pos = $self->node_position( $node );
    my @loc_nodes = $self->nodes_at_position( $pos );

    my @cn = grep { $_ !~ /^$node$/ } @loc_nodes;
    return @cn;
}

# Returns an ordered list of positions in this graph
sub all {
    my( $self ) = @_;
    my $pos = $self->calc_positions;
    return sort by_position keys( %$pos );
}

# Returns undef if no decision has been taken on this position, the
# node name if there is a lemma for it, and 0 if there is no lemma for
# it.
sub state {
    my( $self, $pos ) = @_;
    return $self->{'position_state'}->{ $pos };
}

sub set_state {
    my( $self, $pos, $state ) = @_;
    $self->{'position_state'}->{ $pos } = $state;
}

sub init_lemmatizer {
    my( $self, @nodes ) = @_;
    foreach my $n ( @nodes ) {
	$self->set_state( $self->node_position( $n ), $n );
    }
}

sub witness_path {
    my( $self, $wit ) = @_;
    return @{$self->{'witness_paths'}->{ $wit }};
}

# At some point I may find myself using scalar references for the node
# positions, in order to keep them easily in sync.  Just in case, I will
# calculate this every time I need it.
sub calc_positions {
    my $self = shift;
    return _invert_hash( $self->{'node_positions'} )
}

# Helper for dealing with node refs
sub _name {
    my( $node ) = @_;
    # We work with node names in this library
    if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) {
	$node = $node->name();
    }
    return $node;
}

### Comparison functions

# Compares two nodes according to their positions in the witness 
# index hash.
sub by_position {
    my $self = shift;
    return _cmp_position( $a, $b );
}

# Takes two position strings (X,Y) and sorts them.
sub _cmp_position {
    my( $a, $b ) = @_;
    my @pos_a = split(/,/, $a );
    my @pos_b = split(/,/, $b );

    my $big_cmp = $pos_a[0] <=> $pos_b[0];
    return $big_cmp if $big_cmp;
    # else 
    return $pos_a[1] <=> $pos_b[1];
}

# Useful helper.  Will be especially useful if I find myself using
# scalar references for the positions after all - it can dereference
# them here.
sub _invert_hash {
    my ( $hash, $plaintext_keys ) = @_;
    my %new_hash;
    foreach my $key ( keys %$hash ) {
        my $val = $hash->{$key};
        my $valkey = $val;
        if( $plaintext_keys 
            && ref( $val ) ) {
            $valkey = $plaintext_keys->{ scalar( $val ) };
            warn( "No plaintext value given for $val" ) unless $valkey;
        }
        if( exists ( $new_hash{$valkey} ) ) {
            push( @{$new_hash{$valkey}}, $key );
        } else {
            $new_hash{$valkey} = [ $key ];
        }
    }
    return \%new_hash;
}

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Graph::Position;
a25d4374	2
	3	use strict;
	4	use warnings;
	5
	6	=head1 NAME
	7
e58153d6	8	Text::Tradition::Graph::Position
a25d4374	9
	10	=head1 SUMMARY
	11
	12	An object to go with a text graph that keeps track of relative
	13	positions of the nodes.
	14
	15	=head1 METHODS
	16
	17	=over 4
	18
	19	=item B<new>
	20
	21	Takes two arguments: a list of names of common nodes in the graph, and
	22	a list of witness paths. Calculates position identifiers for each
	23	node based on this.
	24
	25	=cut
	26
	27	sub new {
	28	my $proto = shift;
	29	my( $common_nodes, $witness_paths ) = @_;
	30
	31	my $self = {};
	32
	33	# We have to calculate the position identifiers for each word,
	34	# keyed on the common nodes. This will be 'fun'. The end result
	35	# is a hash per witness, whose key is the word node and whose
	36	# value is its position in the text. Common nodes are always N,1
	37	# so have identical positions in each text.
	38
	39	my $node_pos = {};
	40	foreach my $wit ( keys %$witness_paths ) {
	41	# First we walk each path, making a matrix for each witness that
	42	# corresponds to its eventual position identifier. Common nodes
	43	# always start a new row, and are thus always in the first column.
	44
	45	my $wit_matrix = [];
	46	my $cn = 0; # We should hit the common nodes in order.
	47	my $row = [];
	48	foreach my $wn ( @{$witness_paths->{$wit}} ) { # $wn is a node name
	49	if( $wn eq $common_nodes->[$cn] ) {
	50	# Set up to look for the next common node, and
	51	# start a new row of words.
	52	$cn++;
	53	push( @$wit_matrix, $row ) if scalar( @$row );
	54	$row = [];
	55	}
	56	push( @$row, $wn );
	57	}
	58	push( @$wit_matrix, $row ); # Push the last row onto the matrix
	59
	60	# Now we have a matrix per witness, so that each row in the
	61	# matrix begins with a common node, and continues with all the
	62	# variant words that appear in the witness. We turn this into
	63	# real positions in row,cell format. But we need some
	64	# trickery in order to make sure that each node gets assigned
	65	# to only one position.
	66
	67	foreach my $li ( 1..scalar(@$wit_matrix) ) {
	68	foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
	69	my $node = $wit_matrix->[$li-1]->[$di-1];
	70	my $position = "$li,$di";
	71	# If we have seen this node before, we need to compare
	72	# its position with what went before.
73	unless( exists $node_pos->{ $node } &&
74	_cmp_position( $position, $node_pos->{ $node }) < 1 ) {
75	# The new position ID replaces the old one.
76	$node_pos->{$node} = $position;
77	} # otherwise, the old position needs to stay.
78	}
79	}
80	}
58a3c424	81
a25d4374	82	# Now we have a hash of node positions keyed on node.
a25d4374	83	$self->{'node_positions'} = $node_pos;
58a3c424	84	# We should also save our witness paths, as long as we have them.
	85	# Right now each path is a list of nodes; we may want to make it
	86	# a list of position refs.
a25d4374	87	$self->{'witness_paths'} = $witness_paths;
a25d4374	88
58a3c424	89	# We are also going to want to keep track of whether a position has
	90	# been explicitly emptied, for our lemmatization.
	91	my $position_state = {};
	92	map { $position_state->{ $_ } = undef } values %$node_pos;
	93	$self->{'position_state'} = $position_state;
	94
	95
a25d4374	96	bless( $self, $proto );
	97	return $self;
	98	}
	99
	100	sub node_position {
	101	my( $self, $node ) = @_;
	102	$node = _name( $node );
	103
	104	unless( exists( $self->{'node_positions'}->{ $node } ) ) {
	105	warn "No node with name $node known to the graph";
	106	return;
	107	}
	108
	109	return $self->{'node_positions'}->{ $node };
	110	}
	111
	112	sub nodes_at_position {
	113	my( $self, $pos ) = @_;
	114
	115	my $positions = $self->calc_positions();
	116	unless( exists $positions->{ $pos } ) {
	117	warn "No position $pos in the graph";
	118	return;
	119	}
	120	return @{ $positions->{ $pos }};
	121	}
	122
	123	sub colocated_nodes {
	124	my( $self, $node ) = @_;
	125	$node = _name( $node );
	126	my $pos = $self->node_position( $node );
	127	my @loc_nodes = $self->nodes_at_position( $pos );
	128
	129	my @cn = grep { $_ !~ /^$node$/ } @loc_nodes;
	130	return @cn;
	131	}
	132
58a3c424	133	# Returns an ordered list of positions in this graph
a25d4374	134	sub all {
	135	my( $self ) = @_;
	136	my $pos = $self->calc_positions;
	137	return sort by_position keys( %$pos );
	138	}
	139
58a3c424	140	# Returns undef if no decision has been taken on this position, the
	141	# node name if there is a lemma for it, and 0 if there is no lemma for
	142	# it.
	143	sub state {
	144	my( $self, $pos ) = @_;
	145	return $self->{'position_state'}->{ $pos };
	146	}
	147
	148	sub set_state {
	149	my( $self, $pos, $state ) = @_;
	150	$self->{'position_state'}->{ $pos } = $state;
	151	}
	152
	153	sub init_lemmatizer {
	154	my( $self, @nodes ) = @_;
	155	foreach my $n ( @nodes ) {
	156	$self->set_state( $self->node_position( $n ), $n );
	157	}
	158	}
	159
a25d4374	160	sub witness_path {
	161	my( $self, $wit ) = @_;
	162	return @{$self->{'witness_paths'}->{ $wit }};
	163	}
	164
	165	# At some point I may find myself using scalar references for the node
	166	# positions, in order to keep them easily in sync. Just in case, I will
	167	# calculate this every time I need it.
	168	sub calc_positions {
	169	my $self = shift;
	170	return _invert_hash( $self->{'node_positions'} )
	171	}
	172
	173	# Helper for dealing with node refs
	174	sub _name {
	175	my( $node ) = @_;
	176	# We work with node names in this library
	177	if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) {
	178	$node = $node->name();
	179	}
	180	return $node;
	181	}
	182
	183	### Comparison functions
	184
	185	# Compares two nodes according to their positions in the witness
	186	# index hash.
	187	sub by_position {
	188	my $self = shift;
	189	return _cmp_position( $a, $b );
	190	}
	191
	192	# Takes two position strings (X,Y) and sorts them.
	193	sub _cmp_position {
	194	my( $a, $b ) = @_;
	195	my @pos_a = split(/,/, $a );
	196	my @pos_b = split(/,/, $b );
	197
	198	my $big_cmp = $pos_a[0] <=> $pos_b[0];
	199	return $big_cmp if $big_cmp;
	200	# else
	201	return $pos_a[1] <=> $pos_b[1];
	202	}
	203
	204	# Useful helper. Will be especially useful if I find myself using
	205	# scalar references for the positions after all - it can dereference
	206	# them here.
	207	sub _invert_hash {
	208	my ( $hash, $plaintext_keys ) = @_;
	209	my %new_hash;
	210	foreach my $key ( keys %$hash ) {
	211	my $val = $hash->{$key};
	212	my $valkey = $val;
	213	if( $plaintext_keys
	214	&& ref( $val ) ) {
	215	$valkey = $plaintext_keys->{ scalar( $val ) };
	216	warn( "No plaintext value given for $val" ) unless $valkey;
	217	}
	218	if( exists ( $new_hash{$valkey} ) ) {
	219	push( @{$new_hash{$valkey}}, $key );
	220	} else {
	221	$new_hash{$valkey} = [ $key ];
	222	}
	223	}
224	return \%new_hash;
225	}
226
227	1;