From: Tara L Andrews <tla@mit.edu>
Date: Mon, 2 May 2011 22:42:25 +0000 (+0200)
Subject: fixed node matching against many variants
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e49731d785cb2cd70ffccdcf1fabe61f0c419022;p=scpubgit%2Fstemmatology.git

fixed node matching against many variants
---

diff --git a/lib/Text/Tradition/Graph.pm b/lib/Text/Tradition/Graph.pm
index f49db42..be16099 100644
--- a/lib/Text/Tradition/Graph.pm
+++ b/lib/Text/Tradition/Graph.pm
@@ -280,18 +280,8 @@ path.  TODO These are badly named.
 
 sub next_word {
     # Return the successor via the corresponding edge.
-    my( $self, $node, $edge ) = @_;
-    $edge = "base text" unless $edge;
-    my @next_edges = $node->outgoing();
-    return undef unless scalar( @next_edges );
-    
-    foreach my $e ( @next_edges ) {
-	next unless $e->label() eq $edge;
-	return $e->to();
-    }
-
-    warn "Could not find node connected to edge $edge";
-    return undef;
+    my $self = shift;
+    return $self->_find_linked_word( 'next', @_ );
 }
 
 =item B<prior_word>
@@ -305,20 +295,42 @@ path.  TODO These are badly named.
 
 sub prior_word {
     # Return the predecessor via the corresponding edge.
-    my( $self, $node, $edge ) = @_;
-    $edge = "base text" unless $edge;
-    my @prior_edges = $node->incoming();
-    return undef unless scalar( @prior_edges );
+    my $self = shift;
+    return $self->_find_linked_word( 'prior', @_ );
+}
+
+sub _find_linked_word {
+    my( $self, $direction, $node, $edge ) = @_;
+    $edge = 'base text' unless $edge;
+    my @linked_edges = $direction eq 'next' 
+	? $node->outgoing() : $node->incoming();
+    return undef unless scalar( @linked_edges );
     
-    foreach my $e ( @prior_edges ) {
-	next unless $e->label() eq $edge;
-	return $e->from();
+    # We have to find the linked edge that contains all of the
+    # witnesses supplied in $edge.
+    my @edge_wits = split( /, /, $edge );
+    foreach my $le ( @linked_edges ) {
+	my @le_wits = split( /, /, $le->name() );
+	if( _is_within( \@edge_wits, \@le_wits ) ) {
+	    # This is the right edge.
+	    return $direction eq 'next' ? $le->to() : $le->from();
+	}
     }
-
-    warn "Could not find node connected from edge $edge";
+    warn "Could not find $direction node from " . $node->label 
+	. " along edge $edge";
     return undef;
 }
 
+# Some set logic.
+sub _is_within {
+    my( $set1, $set2 ) = @_;
+    my $ret = 1;
+    foreach my $el ( @$set1 ) {
+	$ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
+    }
+    return $ret;
+}
+
 =item B<node_sequence>
 
 my @nodes = $graph->node_sequence( $first, $last, $path );
@@ -487,13 +499,95 @@ sub init_lemmatizer {
 
 }
 
-# Takes a list of nodes that have just been turned off, and returns a
-# set of tuples of the form ['node', 'state'] that indicates what
-# changes need to be made to the graph.
-# A state of 1 means 'turn on this node'
-# A state of 0 means 'turn off this node'
-# A state of undef means 'an ellipsis belongs in the text here because
-#   no decision has been made'
+=item B<toggle_node>
+
+my @nodes_turned_off = $graph->toggle_node( $node );
+
+Takes a node name, and either lemmatizes or de-lemmatizes it. Returns
+a list of all nodes that are de-lemmatized as a result of the toggle.
+
+=cut
+
+sub toggle_node {
+    my( $self, $node ) = @_;
+    
+    # In case this is being called for the first time.
+    $self->init_lemmatizer();
+
+    if( $self->is_common( $node ) ) {
+	# Do nothing, it's a common node.
+	return;
+    } 
+    
+    my $pos = $self->{'positions'}->node_position( $node );
+    my $old_state = $self->{'positions'}->state( $pos );
+    my @nodes_off;
+    if( $old_state && $old_state eq $node ) {
+	# Turn off the node. We turn on no others by default.
+	push( @nodes_off, $node );
+    } else {
+	# Turn on the node.
+	$self->{'positions'}->set_state( $pos, $node );
+	# Any other 'on' nodes in the same position should be off.
+	push( @nodes_off, $self->colocated_nodes( $node ) );
+	# Any node that is an identical transposed one should be off.
+	push( @nodes_off, $self->identical_nodes( $node ) )
+	    if $self->identical_nodes( $node );
+    }
+    @nodes_off = unique_list( @nodes_off );
+
+    # Turn off the nodes that need to be turned off.
+    my @nodes_turned_off;
+    foreach my $n ( @nodes_off ) {
+	my $npos = $self->{'positions'}->node_position( $n );
+	my $state = $self->{'positions'}->state( $npos );
+	if( $state && $state eq $n ) { 
+	    # this node is still on
+	    push( @nodes_turned_off, $n );
+	    my $new_state = undef;
+	    if( $n eq $node ) {
+		# This is the node that was clicked, so if there are no
+		# other nodes there, turn off the position.  In all other
+		# cases, restore the ellipsis.
+		my @all_n = $self->{'positions'}->nodes_at_position( $pos );
+		$new_state = 0 if scalar( @all_n ) == 1;
+	    }
+	    $self->{'positions'}->set_state( $npos, $new_state );
+	} elsif( $old_state && $old_state eq $n ) { 
+	    # another node has already been turned on here
+	    push( @nodes_turned_off, $n );
+	} # else some other node was on anyway, so pass.
+    }
+    return @nodes_turned_off;
+}
+
+=item B<active_nodes>
+
+my @state = $graph->active_nodes( @nodes_turned_off );
+
+Takes a list of nodes that have just been turned off, and returns a
+set of tuples of the form ['node', 'state'] that indicates what
+changes need to be made to the graph.
+
+=over
+
+=item * 
+
+A state of 1 means 'turn on this node'
+
+=item * 
+
+A state of 0 means 'turn off this node'
+
+=item * 
+
+A state of undef means 'an ellipsis belongs in the text here because
+no decision has been made'
+
+=back
+
+=cut
+
 sub active_nodes {
     my( $self, @toggled_off_nodes ) = @_;
 
@@ -558,62 +652,6 @@ sub _nodeobj {
     return $node;
 }
 
-# toggle_node takes a node name, and either lemmatizes or de-lemmatizes it.
-# Returns a list of nodes that are de-lemmatized as a result of the toggle.
-
-sub toggle_node {
-    my( $self, $node ) = @_;
-    
-    # In case this is being called for the first time.
-    $self->init_lemmatizer();
-
-    if( $self->is_common( $node ) ) {
-	# Do nothing, it's a common node.
-	return;
-    } 
-    
-    my $pos = $self->{'positions'}->node_position( $node );
-    my $old_state = $self->{'positions'}->state( $pos );
-    my @nodes_off;
-    if( $old_state && $old_state eq $node ) {
-	# Turn off the node. We turn on no others by default.
-	push( @nodes_off, $node );
-    } else {
-	# Turn on the node.
-	$self->{'positions'}->set_state( $pos, $node );
-	# Any other 'on' nodes in the same position should be off.
-	push( @nodes_off, $self->colocated_nodes( $node ) );
-	# Any node that is an identical transposed one should be off.
-	push( @nodes_off, $self->identical_nodes( $node ) )
-	    if $self->identical_nodes( $node );
-    }
-    @nodes_off = unique_list( @nodes_off );
-
-    # Turn off the nodes that need to be turned off.
-    my @nodes_turned_off;
-    foreach my $n ( @nodes_off ) {
-	my $npos = $self->{'positions'}->node_position( $n );
-	my $state = $self->{'positions'}->state( $npos );
-	if( $state && $state eq $n ) { 
-	    # this node is still on
-	    push( @nodes_turned_off, $n );
-	    my $new_state = undef;
-	    if( $n eq $node ) {
-		# This is the node that was clicked, so if there are no
-		# other nodes there, turn off the position.  In all other
-		# cases, restore the ellipsis.
-		my @all_n = $self->{'positions'}->nodes_at_position( $pos );
-		$new_state = 0 if scalar( @all_n ) == 1;
-	    }
-	    $self->{'positions'}->set_state( $npos, $new_state );
-	} elsif( $old_state && $old_state eq $n ) { 
-	    # another node has already been turned on here
-	    push( @nodes_turned_off, $n );
-	} # else some other node was on anyway, so pass.
-    }
-    return @nodes_turned_off;
-}
-
 sub colocated_nodes {
     my $self = shift;
     return $self->{'positions'}->colocated_nodes( @_ );
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm
index b9242c3..59f5a59 100644
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -66,7 +66,7 @@ sub merge_base {
 	# DEBUG with a short graph
 	# last if $line > 2;
 	# DEBUG for problematic entries
-	# my $scrutinize = "21.8";
+	my $scrutinize = "";
 	my $first_line_node = $base_line_starts[ $line ];
 	my $too_far = $base_line_starts[ $line+1 ];
 	
@@ -138,6 +138,10 @@ sub merge_base {
 	
 	# Now we have our lemma nodes; we add the variant nodes to the graph.
 	
+	# Keep track of the start and end point of each reading for later
+	# node collapse.
+	my @readings = ( $lemma_start, $lemma_end );
+
 	# For each reading that is not rdg_0, we make a chain of nodes
 	# and connect them to the anchor.  Edges are named after the mss
 	# that are relevant.
@@ -172,11 +176,13 @@ sub merge_base {
 	    $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
 					$edge_name );
 	    
-	    # Now collate and collapse the identical nodes within the graph.
-	    collate_variant( $graph, $lemma_start, $lemma_end, 
-			     $var_start, $last_node );
-	    
+	    if( $var_start ) { # if it wasn't an empty reading
+		push( @readings, $var_start, $last_node );
+	    }
 	}
+
+	# Now collate and collapse the identical nodes within the graph.
+	collate_variants( $graph, @readings );
     }
 
     ## Now in theory I have a graph.  I want to make it a little easier to
@@ -237,7 +243,8 @@ sub read_base {
 		$started = 1;
 	    }
 	    if( $last_node ) {
-		$graph->add_edge( $last_node, $node, "base text" );
+		my $edge = $graph->add_edge( $last_node, $node, "base text" );
+		$edge->set_attribute( 'class', 'basetext' );
 		$last_node = $node;
 	    } # TODO there should be no else here...
 	}
@@ -251,66 +258,86 @@ sub read_base {
     return( @$lineref_array );
 }
 
-=item B<collate_variant>
+=item B<collate_variants>
 
-collate_variant( $graph, $lemma_start, $lemma_end, $var_start, $var_end );
+collate_variants( $graph, @readings )
 
-Given a lemma and a variant as start- and endpoints on the graph,
+Given a set of readings in the form 
+( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 walks through each to identify those nodes that are identical.  The
-graph is a Text::Tradition::Graph object; the other arguments are
+graph is a Text::Tradition::Graph object; the elements of @readings are
 Graph::Easy::Node objects that appear on the graph.
 
 TODO: Handle collapsed and non-collapsed transpositions.
 
 =cut
 
-sub collate_variant {
-    my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
-    # If var_start is undef, then the variant is an omission and
-    # there's nothing to collate. Return.
-    return unless $var_start;
-
-    # I want to look at the nodes in the variant and lemma, and
-    # collapse nodes that are the same word.  This is mini-collation.
-    my %collapsed = ();
-    # There will only be one outgoing edge at first, so this is safe.
-    my @out = $var_start->outgoing();
-    my $var_label = $out[0]->label();
+sub collate_variants {
+    my( $graph, @readings ) = @_;
+    my $lemma_start = shift @readings;
+    my $lemma_end = shift @readings;
+    my $detranspose = 1;
 
-    my @lemma_nodes;
+    # Start the list of distinct nodes with those nodes in the lemma.
+    my @distinct_nodes;
     while( $lemma_start ne $lemma_end ) {
-	push( @lemma_nodes, $lemma_start );
+	push( @distinct_nodes, [ $lemma_start, 'base text' ] );
 	$lemma_start = $graph->next_word( $lemma_start );
     } 
-    push( @lemma_nodes, $lemma_end );
+    push( @distinct_nodes, [ $lemma_end, 'base text' ] );
     
-    my @variant_nodes;
-    while( $var_start ne $var_end ) {
-	push( @variant_nodes, $var_start );
-	$var_start = $graph->next_word( $var_start, $var_label );
-    }
-    push( @variant_nodes, $var_end );
-
-    # Go through the variant nodes, and if we find a lemma node that
-    # hasn't yet been collapsed with a node, equate them.
-
-    foreach my $w ( @variant_nodes ) {
-	my $word = $w->label();
-	foreach my $l ( @lemma_nodes ) {
-	    if( $word eq cmp_str( $l ) ) {
-		next if exists( $collapsed{ $l->label } )
-		    && $collapsed{ $l->label } eq $l;
-		# Collapse the nodes.
-		printf STDERR "Merging nodes %s/%s and %s/%s\n", 
-		    $l->name, $l->label, $w->name, $w->label;
-		$graph->merge_nodes( $l, $w );
-		$collapsed{ $l->label } = $l;
-		# Now collapse any multiple edges to and from the node.
-		# Rely on the presence of the 'base text' edge.
-		remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
-		remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
+
+    while( scalar @readings ) {
+	my( $var_start, $var_end ) = splice( @readings, 0, 2 );
+
+	# I want to look at the nodes in the variant and lemma, and
+	# collapse nodes that are the same word.  This is mini-collation.
+	# Each word in the 'main' list can only be collapsed once with a
+	# word from the current reading.
+	my %collapsed = ();
+
+	# Get the label. There will only be one outgoing edge to start
+	# with, so this is safe.
+	my @out = $var_start->outgoing();
+	my $var_label = $out[0]->label();
+
+	my @variant_nodes;
+	while( $var_start ne $var_end ) {
+	    push( @variant_nodes, $var_start );
+	    $var_start = $graph->next_word( $var_start, $var_label );
+	}
+	push( @variant_nodes, $var_end );
+
+	# Go through the variant nodes, and if we find a lemma node that
+	# hasn't yet been collapsed with a node, equate them.  If we do
+	# not, keep them to push onto the end of all_nodes.
+	my @remaining_nodes;
+	my $last_index = 0;
+	foreach my $w ( @variant_nodes ) {
+	    my $word = $w->label();
+	    my $matched = 0;
+	    foreach my $idx ( $last_index .. $#distinct_nodes ) {
+		my( $l, $edgelabel ) = @{$distinct_nodes[$idx]};
+		if( $word eq cmp_str( $l ) ) {
+		    next if exists( $collapsed{ $l->label } )
+			&& $collapsed{ $l->label } eq $l;
+		    $matched = 1;
+		    $last_index = $idx if $detranspose;
+		    # Collapse the nodes.
+		    printf STDERR "Merging nodes %s/%s and %s/%s\n", 
+		        $l->name, $l->label, $w->name, $w->label;
+		    $graph->merge_nodes( $l, $w );
+		    $collapsed{ $l->label } = $l;
+		    # Now collapse any multiple edges to and from the node.
+ 		    remove_duplicate_edges( $graph, 
+ 				    $graph->prior_word( $l, $edgelabel ), $l );
+ 		    remove_duplicate_edges( $graph, $l, 
+ 				    $graph->next_word( $l, $edgelabel ) );
+		}
 	    }
+	    push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
 	}
+	push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
     }
 }