new position logic for the lemmatizer and toggler; still need non-linear positions

[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm

index 3d43a84..aa0680a 100644 (file)
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -3,6 +3,7 @@ package Text::Tradition::Collation;
 use Graph::Easy;
 use IPC::Run qw( run binary );
 use Text::Tradition::Collation::Path;
+use Text::Tradition::Collation::Position;
 use Text::Tradition::Collation::Reading;
 use Text::Tradition::Collation::Relationship;
 use Text::Tradition::Collation::Segment;
@@ -195,7 +196,8 @@ sub add_segment {
 }
 
 ## Dealing with relationships between readings.  This is a different
-## sort of graph edge.
+## sort of graph edge.  Return a success/failure value and a list of
+## node pairs that have been linked.
 
 sub add_relationship {
     my( $self, $source, $target, $options ) = @_;
@@ -206,30 +208,46 @@ sub add_relationship {
        unless ref( $source ) && $source->isa( 'Graph::Easy::Node' );
     $target = $self->reading( $target )
        unless ref( $target ) && $target->isa( 'Graph::Easy::Node' );
-    foreach my $rel ( $source->edges_to( $target ) ) {
-       if( $rel->label eq $options->{'type'} && $rel->class eq 'edge.relationship' ) {
-           return;
+    foreach my $rel ( $source->edges_to( $target ), $target->edges_to( $source ) ) {
+       if( $rel->class eq 'edge.relationship' ) {
+           return ( undef, "Relationship already exists between these readings" );
+       } else {
+           return ( undef, "There is a witness path between these readings" );
+       }
+    }
+
+    if( $source->has_position && $target->has_position ) {
+       unless( grep { $_ eq $target } $self->same_position_as( $source ) ) {
+           return( undef, "Cannot set relationship at different positions" );
        }
     }
-    $options->{'orig_relation'} = [ $source, $target ];
 
+    my @joined = ( [ $source->name, $target->name ] );  # Keep track of the nodes we join.
+    
+    $options->{'this_relation'} = [ $source, $target ];
     my $rel = Text::Tradition::Collation::Relationship->new( %$options );
     $self->graph->add_edge( $source, $target, $rel );
     if( $options->{'global'} ) {
        # Look for all readings with the source label, and if there are
        # colocated readings with the target label, join them too.
-       foreach my $r ( $self->readings() ) {
-           next unless $r->label eq $source->label;
+       foreach my $r ( grep { $_->label eq $source->label } $self->readings() ) {
+           next if $r->name eq $source->name;
            my @colocated = grep { $_->label eq $target->label }
                $self->same_position_as( $r );
            if( @colocated ) {
                warn "Multiple readings with same label at same position!"
                    if @colocated > 1;
+               my $colo = $colocated[0];
+               next if $colo->edges_to( $r ) || $r->edges_to( $colo );
+               $options->{'primary_relation'} = $options->{'this_relation'};
+               $options->{'this_relation'} = [ $r, $colocated[0] ];
                my $dup_rel = Text::Tradition::Collation::Relationship->new( %$options );
                $self->graph->add_edge( $r, $colocated[0], $dup_rel );
+               push( @joined, [ $r->name, $colocated[0]->name ] );
            }
        }
     }
+    return( 1, @joined );
 }
 
 =head2 Output method(s)
@@ -751,7 +769,7 @@ sub make_witness_path {
 sub common_readings {
     my $self = shift;
     my @common = grep { $_->is_common } $self->readings();
-    return sort { _cmp_position( $a->position, $b->position ) } @common;
+    return sort { $a->position->cmp_with( $b->position ) } @common;
 }
 
 # Calculate the relative positions of nodes in the graph, if they
@@ -759,103 +777,102 @@ sub common_readings {
 sub calculate_positions {
     my( $self, @ordered_common ) = @_;
 
-    # We have to calculate the position identifiers for each word,
-    # keyed on the common nodes.  This will be 'fun'.  The end result
-    # is a hash per witness, whose key is the word node and whose
-    # value is its position in the text.  Common nodes are always N,1
-    # so have identical positions in each text.
-
-    my $node_pos = {};
-    foreach my $wit ( @{$self->tradition->witnesses} ) {
-       print STDERR "Calculating positions in " . $wit->sigil . "\n";
-       _update_positions_from_path( $wit->path, @ordered_common );
-       _update_positions_from_path( $wit->uncorrected_path, @ordered_common )
-           if $wit->has_ante_corr;
-    }
-    
-    # DEBUG
-    foreach my $r ( $self->readings() ) {
-       print STDERR "Reading " . $r->name . "/" . $r->label . " has no position\n"
-           unless( $r->has_position );
+    # First assign positions to all the common nodes.
+    my $l = 1;
+    foreach my $oc ( @ordered_common ) {
+       $oc->position( $l++, 1 );
     }
 
-    $self->init_lemmata();
-}
-
-sub _update_positions_from_path {
-    my( $path, @ordered_common ) = @_;
-
-    # First we walk the given path, making a matrix for the witness
-    # that corresponds to its eventual position identifier.  Common
-    # nodes always start a new row, and are thus always in the first
-    # column.
-    my $wit_matrix = [];
-    my $cn = 0;  # We should hit the common readings in order.
-    my $row = [];
-    foreach my $wn ( @{$path} ) {
-       if( $wn eq $ordered_common[$cn] ) {
-           # Set up to look for the next common node, and
-           # start a new row of words.
-           $cn++;
-           push( @$wit_matrix, $row ) if scalar( @$row );
-           $row = [];
-       }
-       push( @$row, $wn );
-    }
-    push( @$wit_matrix, $row );  # Push the last row onto the matrix
-
-    # Now we have a matrix per witness, so that each row in the
-    # matrix begins with a common node, and continues with all the
-    # variant words that appear in the witness.  We turn this into
-    # real positions in row,cell format.  But we need some
-    # trickery in order to make sure that each node gets assigned
-    # to only one position.
-    
-    foreach my $li ( 1..scalar(@$wit_matrix) ) {
-       foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
-           my $reading = $wit_matrix->[$li-1]->[$di-1];
-           my $position = "$li,$di";
-
-           # If we have seen this node before, we need to compare
-           # its position with what went before.
-           unless( $reading->has_position &&
-                   _cmp_position( $position, $reading->position ) < 1 ) {
-               # The new position ID replaces the old one.
-               $reading->position( $position );
-           } # otherwise, the old position needs to stay.
+    if( $self->linear ) {
+       # For the space between each common node, we have to find all the chains
+       # from all the witnesses.  The longest chain gives us our max, and the
+       # others get min/max ranges to fit.
+       my $first = shift @ordered_common;
+       while( @ordered_common ) {
+           my %paths;
+           my $next = shift @ordered_common;
+           my $longest = 0;
+           foreach my $wit ( @{$self->tradition->witnesses} ) {
+               # Key to the path is not important; we just have to get
+               # all unique paths.
+               my $length = $self->_track_paths( \%paths, $first, $next, $wit->sigil );
+               $longest = $length unless $longest > $length;
+               if( $wit->has_ante_corr ) {
+                   my $length = $self->_track_paths( \%paths, $first, $next, 
+                                                     $wit->sigil.$self->ac_label, $wit->sigil );
+                   $longest = $length unless $longest > $length;
+               }
+           }
+           
+           # Transform the path values from unique strings to arrays.
+           foreach my $k ( keys %paths ) {
+               my @v = split( /\s+/, $paths{$k} );
+               $paths{$k} = \@v;
+           }
+           
+           # Now %paths has all the unique paths, and we know how long the
+           # longest of these is.  Assign positions, starting with the
+           # longest.  All non-common positions start at 2.
+           foreach my $path ( sort { scalar @$b <=> scalar @$a } values %paths  ) {
+               my $range = $longest - scalar @$path;
+               foreach my $i ( 0 .. $#{$path} ) {
+                   my $min = $i+2;
+                   my $rdg = $self->reading( $path->[$i] );
+                   unless( $rdg->has_position ) {
+                       $rdg->position( $first->position->common, $min, $min+$range );
+                   }
+               }
+           }
+           
+           $first = $next;
        }
-    }
-}
+    } else {
+
+       # Non-linear positions are pretty much impossible to pin down.
+       # Any reading might appear anywhere in the graph.  I guess we
+       # can do positions where there aren't transpositions...
 
-sub _cmp_position {
-    my( $a, $b ) = @_;
-    if ( $a && $b ) {
-       my @pos_a = split(/,/, $a );
-       my @pos_b = split(/,/, $b );
-
-       my $big_cmp = $pos_a[0] <=> $pos_b[0];
-       return $big_cmp if $big_cmp;
-       # else 
-       return $pos_a[1] <=> $pos_b[1];
-    } elsif ( $b ) { # a is undefined
-       return -1;
-    } elsif ( $a ) { # b is undefined
-       return 1;
     }
-    return 0; # they are both undefined
+               
+    $self->init_lemmata();
 }
 
-sub all_positions {
+# Helper function for the guts of calculate_positions.
+sub _track_paths {
+    my $self = shift;
+    my $track_hash = shift;
+    # Args are first, last, wit, backup
+    my @path = $self->reading_sequence( @_ );
+    # Top and tail the array
+    shift @path;
+    pop @path;
+    $track_hash->{$_[2]} = join( ' ', map { $_->name } @path )
+       if @path;
+    return @path;
+}
+ 
+sub possible_positions {
     my $self = shift;
+    my @answer;
     my %positions = ();
-    map { $positions{$_->position} = 1 } $self->readings;
-    my @answer = sort { _cmp_position( $a, $b ) } keys( %positions );
+    foreach my $r ( $self->readings ) {
+       next unless $r->has_position;
+       $positions{$r->position->maxref} = 1;
+    }
+    @answer = keys %positions;
     return @answer;
 }
 
+# TODO think about indexing this.
 sub readings_at_position {
-    my( $self, $pos ) = @_;
-    my @answer = grep { $_->position eq $pos } $self->readings;
+    my( $self, $position, $strict ) = @_;
+    unless( ref( $position ) eq 'Text::Tradition::Collation::Position' ) {
+       $position = Text::Tradition::Collation::Position->new( $position );
+    }
+    my @answer;
+    foreach my $r ( $self->readings ) {
+       push( @answer, $r ) if $r->is_at_position( $position, $strict );
+    }
     return @answer;
 }
 
@@ -863,13 +880,13 @@ sub readings_at_position {
 
 sub init_lemmata {
     my $self = shift;
-    
-    foreach my $position ( $self->all_positions ) {
+
+    foreach my $position ( $self->possible_positions ) {
        $self->lemmata->{$position} = undef;
     }
 
     foreach my $cr ( $self->common_readings ) {
-       $self->lemmata->{$cr->position} = $cr->name;
+       $self->lemmata->{$cr->position->maxref} = $cr->name;
     }
 }
     
@@ -906,16 +923,25 @@ sub lemma_readings {
     # First get the positions of those nodes which have been
     # toggled off.
     my $positions_off = {};
-    map { $positions_off->{ $_->position } = $_->name } @toggled_off_nodes;
+    map { $positions_off->{ $_->position->reference } = $_->name } 
+        @toggled_off_nodes;
 
     # Now for each position, we have to see if a node is on, and we
-    # have to see if a node has been turned off.
+    # have to see if a node has been turned off.  The lemmata hash
+    # should contain fixed positions, range positions whose node was
+    # just turned off, and range positions whose node is on.
     my @answer;
-    foreach my $pos ( $self->all_positions() ) {
+    my %fixed_positions;
+    # TODO One of these is probably redundant.
+    map { $fixed_positions{$_} = 0 } keys %{$self->lemmata};
+    map { $fixed_positions{$_} = 0 } keys %{$positions_off};
+    map { $fixed_positions{$_} = 1 } $self->possible_positions;
+    foreach my $pos ( sort { Text::Tradition::Collation::Position::str_cmp( $a, $b ) } keys %fixed_positions ) {
        # Find the state of this position.  If there is an active node,
        # its name will be the state; otherwise the state will be 0 
        # (nothing at this position) or undef (ellipsis at this position)
-       my $active = $self->lemmata->{$pos};
+       my $active = undef;
+       $active = $self->lemmata->{$pos} if exists $self->lemmata->{$pos};
        
        # Is there a formerly active node that was toggled off?
        if( exists( $positions_off->{$pos} ) ) {
@@ -923,6 +949,10 @@ sub lemma_readings {
            if( $active && $active ne $off_node) {
                push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
            } else {
+               unless( $fixed_positions{$pos} ) {
+                   $active = 0;
+                   delete $self->lemmata->{$pos};
+               }
                push( @answer, [ $off_node, $active ] );
            }
 
@@ -936,9 +966,10 @@ sub lemma_readings {
            # at that position.
            my @pos_nodes = $self->readings_at_position( $pos );
            push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] );
+           delete $self->lemmata->{$pos} unless $fixed_positions{$pos};
        }
     }
-    
+
     return @answer;
 }
 
@@ -963,37 +994,44 @@ sub toggle_reading {
     } 
     
     my $pos = $reading->position;
-    my $old_state = $self->lemmata->{$pos};
+    my $fixed = $reading->position->fixed;
+    my $old_state = $self->lemmata->{$pos->reference};
+
     my @readings_off;
     if( $old_state && $old_state eq $rname ) {
        # Turn off the node. We turn on no others by default.
        push( @readings_off, $reading );
     } else {
        # Turn on the node.
-       $self->lemmata->{$pos} = $rname;
-       # Any other 'on' readings in the same position should be off.
-       push( @readings_off, $self->same_position_as( $reading ) );
+       $self->lemmata->{$pos->reference} = $rname;
+       # Any other 'on' readings in the same position should be off
+       # if we have a fixed position.
+       push( @readings_off, $self->same_position_as( $reading, 1 ) )
+           if $pos->fixed;
        # Any node that is an identical transposed one should be off.
        push( @readings_off, $reading->identical_readings );
     }
     @readings_off = unique_list( @readings_off );
-
+       
     # Turn off the readings that need to be turned off.
     my @readings_delemmatized;
     foreach my $n ( @readings_off ) {
-       my $state = $self->lemmata->{$n->position};
+       my $npos = $n->position;
+       my $state = undef;
+       $state = $self->lemmata->{$npos->reference}
+           if defined $self->lemmata->{$npos->reference};
        if( $state && $state eq $n->name ) { 
            # this reading is still on, so turn it off
            push( @readings_delemmatized, $n );
            my $new_state = undef;
-           if( $n eq $reading ) {
+           if( $npos->fixed && $n eq $reading ) {
                # This is the reading that was clicked, so if there are no
-               # other readings there, turn off the position.  In all other
-               # cases, restore the ellipsis.
-               my @other_n = $self->same_position_as( $n );
+               # other readings there and this is a fixed position, turn off 
+               # the position.  In all other cases, restore the ellipsis.
+               my @other_n = $self->same_position_as( $n ); # TODO do we need strict?
                $new_state = 0 unless @other_n;
            }
-           $self->lemmata->{$n->position} = $new_state;
+           $self->lemmata->{$npos->reference} = $new_state;
        } elsif( $old_state && $old_state eq $n->name ) { 
            # another reading has already been turned on here
            push( @readings_delemmatized, $n );
@@ -1003,9 +1041,15 @@ sub toggle_reading {
 }
 
 sub same_position_as {
-    my( $self, $reading ) = @_;
+    my( $self, $reading, $strict ) = @_;
     my $pos = $reading->position;
-    my @same = grep { $_ ne $reading } $self->readings_at_position( $reading->position );
+    my %onpath = ( $reading->name => 1 );
+    # TODO This might not always be sufficient.  We really want to
+    # exclude all readings on this one's path between its two
+    # common points.
+    map { $onpath{$_->name} = 1 } $reading->neighbor_readings;
+    my @same = grep { !$onpath{$_->name} } 
+        $self->readings_at_position( $reading->position, $strict );
     return @same;
 }