From: Tara L Andrews Date: Mon, 2 May 2011 22:42:25 +0000 (+0200) Subject: fixed node matching against many variants X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e49731d785cb2cd70ffccdcf1fabe61f0c419022;p=scpubgit%2Fstemmatology.git fixed node matching against many variants --- diff --git a/lib/Text/Tradition/Graph.pm b/lib/Text/Tradition/Graph.pm index f49db42..be16099 100644 --- a/lib/Text/Tradition/Graph.pm +++ b/lib/Text/Tradition/Graph.pm @@ -280,18 +280,8 @@ path. TODO These are badly named. sub next_word { # Return the successor via the corresponding edge. - my( $self, $node, $edge ) = @_; - $edge = "base text" unless $edge; - my @next_edges = $node->outgoing(); - return undef unless scalar( @next_edges ); - - foreach my $e ( @next_edges ) { - next unless $e->label() eq $edge; - return $e->to(); - } - - warn "Could not find node connected to edge $edge"; - return undef; + my $self = shift; + return $self->_find_linked_word( 'next', @_ ); } =item B @@ -305,20 +295,42 @@ path. TODO These are badly named. sub prior_word { # Return the predecessor via the corresponding edge. - my( $self, $node, $edge ) = @_; - $edge = "base text" unless $edge; - my @prior_edges = $node->incoming(); - return undef unless scalar( @prior_edges ); + my $self = shift; + return $self->_find_linked_word( 'prior', @_ ); +} + +sub _find_linked_word { + my( $self, $direction, $node, $edge ) = @_; + $edge = 'base text' unless $edge; + my @linked_edges = $direction eq 'next' + ? $node->outgoing() : $node->incoming(); + return undef unless scalar( @linked_edges ); - foreach my $e ( @prior_edges ) { - next unless $e->label() eq $edge; - return $e->from(); + # We have to find the linked edge that contains all of the + # witnesses supplied in $edge. + my @edge_wits = split( /, /, $edge ); + foreach my $le ( @linked_edges ) { + my @le_wits = split( /, /, $le->name() ); + if( _is_within( \@edge_wits, \@le_wits ) ) { + # This is the right edge. + return $direction eq 'next' ? $le->to() : $le->from(); + } } - - warn "Could not find node connected from edge $edge"; + warn "Could not find $direction node from " . $node->label + . " along edge $edge"; return undef; } +# Some set logic. +sub _is_within { + my( $set1, $set2 ) = @_; + my $ret = 1; + foreach my $el ( @$set1 ) { + $ret = 0 unless grep { /^\Q$el\E$/ } @$set2; + } + return $ret; +} + =item B my @nodes = $graph->node_sequence( $first, $last, $path ); @@ -487,13 +499,95 @@ sub init_lemmatizer { } -# Takes a list of nodes that have just been turned off, and returns a -# set of tuples of the form ['node', 'state'] that indicates what -# changes need to be made to the graph. -# A state of 1 means 'turn on this node' -# A state of 0 means 'turn off this node' -# A state of undef means 'an ellipsis belongs in the text here because -# no decision has been made' +=item B + +my @nodes_turned_off = $graph->toggle_node( $node ); + +Takes a node name, and either lemmatizes or de-lemmatizes it. Returns +a list of all nodes that are de-lemmatized as a result of the toggle. + +=cut + +sub toggle_node { + my( $self, $node ) = @_; + + # In case this is being called for the first time. + $self->init_lemmatizer(); + + if( $self->is_common( $node ) ) { + # Do nothing, it's a common node. + return; + } + + my $pos = $self->{'positions'}->node_position( $node ); + my $old_state = $self->{'positions'}->state( $pos ); + my @nodes_off; + if( $old_state && $old_state eq $node ) { + # Turn off the node. We turn on no others by default. + push( @nodes_off, $node ); + } else { + # Turn on the node. + $self->{'positions'}->set_state( $pos, $node ); + # Any other 'on' nodes in the same position should be off. + push( @nodes_off, $self->colocated_nodes( $node ) ); + # Any node that is an identical transposed one should be off. + push( @nodes_off, $self->identical_nodes( $node ) ) + if $self->identical_nodes( $node ); + } + @nodes_off = unique_list( @nodes_off ); + + # Turn off the nodes that need to be turned off. + my @nodes_turned_off; + foreach my $n ( @nodes_off ) { + my $npos = $self->{'positions'}->node_position( $n ); + my $state = $self->{'positions'}->state( $npos ); + if( $state && $state eq $n ) { + # this node is still on + push( @nodes_turned_off, $n ); + my $new_state = undef; + if( $n eq $node ) { + # This is the node that was clicked, so if there are no + # other nodes there, turn off the position. In all other + # cases, restore the ellipsis. + my @all_n = $self->{'positions'}->nodes_at_position( $pos ); + $new_state = 0 if scalar( @all_n ) == 1; + } + $self->{'positions'}->set_state( $npos, $new_state ); + } elsif( $old_state && $old_state eq $n ) { + # another node has already been turned on here + push( @nodes_turned_off, $n ); + } # else some other node was on anyway, so pass. + } + return @nodes_turned_off; +} + +=item B + +my @state = $graph->active_nodes( @nodes_turned_off ); + +Takes a list of nodes that have just been turned off, and returns a +set of tuples of the form ['node', 'state'] that indicates what +changes need to be made to the graph. + +=over + +=item * + +A state of 1 means 'turn on this node' + +=item * + +A state of 0 means 'turn off this node' + +=item * + +A state of undef means 'an ellipsis belongs in the text here because +no decision has been made' + +=back + +=cut + sub active_nodes { my( $self, @toggled_off_nodes ) = @_; @@ -558,62 +652,6 @@ sub _nodeobj { return $node; } -# toggle_node takes a node name, and either lemmatizes or de-lemmatizes it. -# Returns a list of nodes that are de-lemmatized as a result of the toggle. - -sub toggle_node { - my( $self, $node ) = @_; - - # In case this is being called for the first time. - $self->init_lemmatizer(); - - if( $self->is_common( $node ) ) { - # Do nothing, it's a common node. - return; - } - - my $pos = $self->{'positions'}->node_position( $node ); - my $old_state = $self->{'positions'}->state( $pos ); - my @nodes_off; - if( $old_state && $old_state eq $node ) { - # Turn off the node. We turn on no others by default. - push( @nodes_off, $node ); - } else { - # Turn on the node. - $self->{'positions'}->set_state( $pos, $node ); - # Any other 'on' nodes in the same position should be off. - push( @nodes_off, $self->colocated_nodes( $node ) ); - # Any node that is an identical transposed one should be off. - push( @nodes_off, $self->identical_nodes( $node ) ) - if $self->identical_nodes( $node ); - } - @nodes_off = unique_list( @nodes_off ); - - # Turn off the nodes that need to be turned off. - my @nodes_turned_off; - foreach my $n ( @nodes_off ) { - my $npos = $self->{'positions'}->node_position( $n ); - my $state = $self->{'positions'}->state( $npos ); - if( $state && $state eq $n ) { - # this node is still on - push( @nodes_turned_off, $n ); - my $new_state = undef; - if( $n eq $node ) { - # This is the node that was clicked, so if there are no - # other nodes there, turn off the position. In all other - # cases, restore the ellipsis. - my @all_n = $self->{'positions'}->nodes_at_position( $pos ); - $new_state = 0 if scalar( @all_n ) == 1; - } - $self->{'positions'}->set_state( $npos, $new_state ); - } elsif( $old_state && $old_state eq $n ) { - # another node has already been turned on here - push( @nodes_turned_off, $n ); - } # else some other node was on anyway, so pass. - } - return @nodes_turned_off; -} - sub colocated_nodes { my $self = shift; return $self->{'positions'}->colocated_nodes( @_ ); diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index b9242c3..59f5a59 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -66,7 +66,7 @@ sub merge_base { # DEBUG with a short graph # last if $line > 2; # DEBUG for problematic entries - # my $scrutinize = "21.8"; + my $scrutinize = ""; my $first_line_node = $base_line_starts[ $line ]; my $too_far = $base_line_starts[ $line+1 ]; @@ -138,6 +138,10 @@ sub merge_base { # Now we have our lemma nodes; we add the variant nodes to the graph. + # Keep track of the start and end point of each reading for later + # node collapse. + my @readings = ( $lemma_start, $lemma_end ); + # For each reading that is not rdg_0, we make a chain of nodes # and connect them to the anchor. Edges are named after the mss # that are relevant. @@ -172,11 +176,13 @@ sub merge_base { $graph->add_edge( $last_node, $graph->next_word( $lemma_end ), $edge_name ); - # Now collate and collapse the identical nodes within the graph. - collate_variant( $graph, $lemma_start, $lemma_end, - $var_start, $last_node ); - + if( $var_start ) { # if it wasn't an empty reading + push( @readings, $var_start, $last_node ); + } } + + # Now collate and collapse the identical nodes within the graph. + collate_variants( $graph, @readings ); } ## Now in theory I have a graph. I want to make it a little easier to @@ -237,7 +243,8 @@ sub read_base { $started = 1; } if( $last_node ) { - $graph->add_edge( $last_node, $node, "base text" ); + my $edge = $graph->add_edge( $last_node, $node, "base text" ); + $edge->set_attribute( 'class', 'basetext' ); $last_node = $node; } # TODO there should be no else here... } @@ -251,66 +258,86 @@ sub read_base { return( @$lineref_array ); } -=item B +=item B -collate_variant( $graph, $lemma_start, $lemma_end, $var_start, $var_end ); +collate_variants( $graph, @readings ) -Given a lemma and a variant as start- and endpoints on the graph, +Given a set of readings in the form +( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) walks through each to identify those nodes that are identical. The -graph is a Text::Tradition::Graph object; the other arguments are +graph is a Text::Tradition::Graph object; the elements of @readings are Graph::Easy::Node objects that appear on the graph. TODO: Handle collapsed and non-collapsed transpositions. =cut -sub collate_variant { - my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_; - # If var_start is undef, then the variant is an omission and - # there's nothing to collate. Return. - return unless $var_start; - - # I want to look at the nodes in the variant and lemma, and - # collapse nodes that are the same word. This is mini-collation. - my %collapsed = (); - # There will only be one outgoing edge at first, so this is safe. - my @out = $var_start->outgoing(); - my $var_label = $out[0]->label(); +sub collate_variants { + my( $graph, @readings ) = @_; + my $lemma_start = shift @readings; + my $lemma_end = shift @readings; + my $detranspose = 1; - my @lemma_nodes; + # Start the list of distinct nodes with those nodes in the lemma. + my @distinct_nodes; while( $lemma_start ne $lemma_end ) { - push( @lemma_nodes, $lemma_start ); + push( @distinct_nodes, [ $lemma_start, 'base text' ] ); $lemma_start = $graph->next_word( $lemma_start ); } - push( @lemma_nodes, $lemma_end ); + push( @distinct_nodes, [ $lemma_end, 'base text' ] ); - my @variant_nodes; - while( $var_start ne $var_end ) { - push( @variant_nodes, $var_start ); - $var_start = $graph->next_word( $var_start, $var_label ); - } - push( @variant_nodes, $var_end ); - - # Go through the variant nodes, and if we find a lemma node that - # hasn't yet been collapsed with a node, equate them. - - foreach my $w ( @variant_nodes ) { - my $word = $w->label(); - foreach my $l ( @lemma_nodes ) { - if( $word eq cmp_str( $l ) ) { - next if exists( $collapsed{ $l->label } ) - && $collapsed{ $l->label } eq $l; - # Collapse the nodes. - printf STDERR "Merging nodes %s/%s and %s/%s\n", - $l->name, $l->label, $w->name, $w->label; - $graph->merge_nodes( $l, $w ); - $collapsed{ $l->label } = $l; - # Now collapse any multiple edges to and from the node. - # Rely on the presence of the 'base text' edge. - remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l ); - remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) ); + + while( scalar @readings ) { + my( $var_start, $var_end ) = splice( @readings, 0, 2 ); + + # I want to look at the nodes in the variant and lemma, and + # collapse nodes that are the same word. This is mini-collation. + # Each word in the 'main' list can only be collapsed once with a + # word from the current reading. + my %collapsed = (); + + # Get the label. There will only be one outgoing edge to start + # with, so this is safe. + my @out = $var_start->outgoing(); + my $var_label = $out[0]->label(); + + my @variant_nodes; + while( $var_start ne $var_end ) { + push( @variant_nodes, $var_start ); + $var_start = $graph->next_word( $var_start, $var_label ); + } + push( @variant_nodes, $var_end ); + + # Go through the variant nodes, and if we find a lemma node that + # hasn't yet been collapsed with a node, equate them. If we do + # not, keep them to push onto the end of all_nodes. + my @remaining_nodes; + my $last_index = 0; + foreach my $w ( @variant_nodes ) { + my $word = $w->label(); + my $matched = 0; + foreach my $idx ( $last_index .. $#distinct_nodes ) { + my( $l, $edgelabel ) = @{$distinct_nodes[$idx]}; + if( $word eq cmp_str( $l ) ) { + next if exists( $collapsed{ $l->label } ) + && $collapsed{ $l->label } eq $l; + $matched = 1; + $last_index = $idx if $detranspose; + # Collapse the nodes. + printf STDERR "Merging nodes %s/%s and %s/%s\n", + $l->name, $l->label, $w->name, $w->label; + $graph->merge_nodes( $l, $w ); + $collapsed{ $l->label } = $l; + # Now collapse any multiple edges to and from the node. + remove_duplicate_edges( $graph, + $graph->prior_word( $l, $edgelabel ), $l ); + remove_duplicate_edges( $graph, $l, + $graph->next_word( $l, $edgelabel ) ); + } } + push( @remaining_nodes, [ $w, $var_label ] ) unless $matched; } + push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes ); } }