get rid of erroneous double-decode

[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Collation.pm
diff --git a/base/lib/Text/Tradition/Collation.pm b/base/lib/Text/Tradition/Collation.pm

index f1c9227..ae36da5 100644 (file)
--- a/base/lib/Text/Tradition/Collation.pm
+++ b/base/lib/Text/Tradition/Collation.pm
@@ -159,15 +159,6 @@ See L<Text::Tradition::Collation::Reading> for the available arguments.
 Removes the given reading from the collation, implicitly removing its
 paths and relationships.
 
-=head2 merge_readings( $main, $second, $concatenate, $with_str )
-
-Merges the $second reading into the $main one. If $concatenate is true, then
-the merged node will carry the text of both readings, concatenated with either
-$with_str (if specified) or a sensible default (the empty string if the
-appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
-
-The first two arguments may be either readings or reading IDs.
-
 =head2 has_reading( $id )
 
 Predicate to see whether a given reading ID is in the graph.
@@ -290,6 +281,15 @@ around del_reading => sub {
        $self->$orig( $arg );
 };
 
+=head2 merge_readings( $main, $second, $concatenate, $with_str )
+
+Merges the $second reading into the $main one. If $concatenate is true, then
+the merged node will carry the text of both readings, concatenated with either
+$with_str (if specified) or a sensible default (the empty string if the
+appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
+
+The first two arguments may be either readings or reading IDs.
+
 =begin testing
 
 use Text::Tradition;
@@ -303,13 +303,14 @@ my $t = Text::Tradition->new(
 my $c = $t->collation;
 
 my $rno = scalar $c->readings;
-# Split n21 for testing purposes
+# Split n21 ('unto') for testing purposes
 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 my $old_r = $c->reading( 'n21' );
 $old_r->alter_text( 'to' );
 $c->del_path( 'n20', 'n21', 'A' );
 $c->add_path( 'n20', 'n21p0', 'A' );
 $c->add_path( 'n21p0', 'n21', 'A' );
+$c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 $c->flatten_ranks();
 ok( $c->reading( 'n21p0' ), "New reading exists" );
 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
@@ -475,21 +476,27 @@ is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 
 # Detach the erroneously collated reading
-my $newr = $sc->duplicate_reading( 'n131', 'Ba96' );
+my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 ok( $newr, "New reading was created" );
 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
+my $csucc = $sc->common_successor( 'n131', 'n131_0' );
+is( $csucc->id, 'n136', "Found correct common successor to duped reading" ); 
 
 # Check that the bad transposition is gone
+is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 
+# The collation should not be fixed
+my @pairs = $sc->identical_readings();
+is( scalar @pairs, 0, "Not re-collated yet" );
 # Fix the collation
-ok( $sc->add_relationship( 'n124', 'n131_0', { type => 'collated', scope => 'local' } ),
-       "Collated the readings correctly" );
-$sc->calculate_ranks();
-$sc->flatten_ranks();
+ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
+@pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
+is( scalar @pairs, 3, "Found three more identical readings" );
 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
+$sc->flatten_ranks();
 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 
 =end testing
@@ -545,24 +552,25 @@ sub duplicate_reading {
                $self->add_path( $newr, $next, $wit );
        }
        
-       # Hash the reading ranks and find the closest common successor to our
-       # two readings
-       my %rrk;
+       # If the graph is ranked, we need to look for relationships that are now
+       # invalid (i.e. 'non-colocation' types that might now be colocated) and
+       # remove them. If not, we can skip it.
        my $succ;
+       my %rrk;
+       my @deleted_relations;
        if( $self->end->has_rank ) {
+               # Find the point where we can stop checking
                $succ = $self->common_successor( $r, $newr );
+               
+               # Hash the existing ranks
                foreach my $rdg ( $self->readings ) {
                        $rrk{$rdg->id} = $rdg->rank;
                }
-       }
-
-       # Rebuild the equivalence graph and calculate the new ranks     
-       $self->relations->rebuild_equivalence();
-       $self->calculate_ranks();
+               # Calculate the new ranks       
+               $self->calculate_ranks();
        
-       # Check for invalid non-colocated relationships among changed-rank readings
-       # from where the ranks start changing up to $succ
-       if( $self->end->has_rank ) {
+               # Check for invalid non-colocated relationships among changed-rank readings
+               # from where the ranks start changing up to $succ
                my $lastrank = $succ->rank;
                foreach my $rdg ( $self->readings ) {
                        next if $rdg->rank > $lastrank;
@@ -570,11 +578,13 @@ sub duplicate_reading {
                        my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
                        next unless @noncolo;
                        foreach my $nc ( @noncolo ) {
-                               $self->relations->verify_or_delete( $rdg, $nc );
+                               unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
+                                       push( @deleted_relations, [ $rdg->id, $nc->id ] );
+                               }
                        }
                }
        }
-       return $newr;
+       return ( $newr, @deleted_relations );
 }
 
 sub _generate_dup_id {
@@ -693,8 +703,8 @@ around qw/ get_relationship del_relationship / => sub {
        if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
                @args = @{$_[0]};
        }
-       my( $source, $target ) = $self->_stringify_args( @args );
-       $self->$orig( $source, $target );
+       my @stringargs = $self->_stringify_args( @args );
+       $self->$orig( @stringargs );
 };
 
 =head2 reading_witnesses( $reading )
@@ -857,7 +867,8 @@ sub as_dot {
     foreach my $edge ( @edges ) {
        # Do we need to output this edge?
        if( $used{$edge->[0]} && $used{$edge->[1]} ) {
-               my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
+               my $label = $self->_path_display_label( $opts,
+                       $self->path_witnesses( $edge ) );
                        my $variables = { %edge_attrs, 'label' => $label };
                        
                        # Account for the rank gap if necessary
@@ -917,7 +928,8 @@ sub as_dot {
     
     # Add substitute start and end edges if necessary
     foreach my $node ( keys %substart ) {
-       my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
+       my $witstr = $self->_path_display_label( $opts, 
+               $self->path_witnesses( $substart{$node}, $node ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
        my $nrdg = $self->reading( $node );
        if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
@@ -928,7 +940,8 @@ sub as_dot {
         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
        }
     foreach my $node ( keys %subend ) {
-       my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
+       my $witstr = $self->_path_display_label( $opts,
+               $self->path_witnesses( $node, $subend{$node} ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
         my $varopts = _dot_attr_string( $variables );
         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
@@ -1001,6 +1014,7 @@ sub path_witnesses {
 # witnesses only where the main witness is not also in the list.
 sub _path_display_label {
        my $self = shift;
+       my $opts = shift;
        my %wits;
        map { $wits{$_} = 1 } @_;
 
@@ -1018,14 +1032,18 @@ sub _path_display_label {
                }
        }
        
-       # See if we are in a majority situation.
-       my $maj = scalar( $self->tradition->witnesses ) * 0.6;
-       $maj = $maj > 5 ? $maj : 5;
-       if( scalar keys %wits > $maj ) {
-               unshift( @disp_ac, 'majority' );
-               return join( ', ', @disp_ac );
-       } else {
+       if( $opts->{'explicit_wits'} ) {
                return join( ', ', sort keys %wits );
+       } else {
+               # See if we are in a majority situation.
+               my $maj = scalar( $self->tradition->witnesses ) * 0.6;
+               $maj = $maj > 5 ? $maj : 5;
+               if( scalar keys %wits > $maj ) {
+                       unshift( @disp_ac, 'majority' );
+                       return join( ', ', @disp_ac );
+               } else {
+                       return join( ', ', sort keys %wits );
+               }
        }
 }
 
@@ -1380,13 +1398,13 @@ sub as_csv {
     my @result;
     # Make the header row
     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
-       push( @result, decode_utf8( $csv->string ) );
+       push( @result, $csv->string );
     # Make the rest of the rows
     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
        my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
        my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
         $csv->combine( @row );
-        push( @result, decode_utf8( $csv->string ) );
+        push( @result, $csv->string );
     }
     return join( "\n", @result );
 }
@@ -1785,29 +1803,71 @@ with the same text at the same rank, and merges any that are found.
 =cut
 
 sub flatten_ranks {
-    my $self = shift;
+    my ( $self, %args ) = shift;
     my %unique_rank_rdg;
     my $changed;
+    foreach my $p ( $self->identical_readings( %args ) ) {
+               # say STDERR "Combining readings at same rank: @$p";
+               $changed = 1;
+               $self->merge_readings( @$p );
+               # TODO see if this now makes a common point.
+    }
+    # If we merged readings, the ranks are still fine but the alignment
+    # table is wrong. Wipe it.
+    $self->wipe_table() if $changed;
+}
+
+=head2 identical_readings
+=head2 identical_readings( start => $startnode, end => $endnode )
+=head2 identical_readings( startrank => $startrank, endrank => $endrank )
+
+Goes through the graph identifying all pairs of readings that appear to be
+identical, and therefore able to be merged into a single reading. Returns the 
+relevant identical pairs. Can be restricted to run over only a part of the 
+graph, specified either by node or by rank.
+
+=cut
+
+sub identical_readings {
+       my ( $self, %args ) = @_;
+    # Find where we should start and end.
+    my $startrank = $args{startrank} || 0;
+    if( $args{start} ) {
+       throw( "Starting reading has no rank" ) unless $self->reading( $args{start} ) 
+               && $self->reading( $args{start} )->has_rank;
+       $startrank = $self->reading( $args{start} )->rank;
+    }
+    my $endrank = $args{endrank} || $self->end->rank;
+    if( $args{end} ) {
+       throw( "Ending reading has no rank" ) unless $self->reading( $args{end} ) 
+               && $self->reading( $args{end} )->has_rank;
+       $endrank = $self->reading( $args{end} )->rank;
+    }
+    
+    # Make sure the ranks are correct.
+    unless( $self->_graphcalc_done ) {
+       $self->calculate_ranks;
+    }
+    # Go through the readings looking for duplicates.
+    my %unique_rank_rdg;
+    my @pairs;
     foreach my $rdg ( $self->readings ) {
         next unless $rdg->has_rank;
-        my $key = $rdg->rank . "||" . $rdg->text;
+        my $rk = $rdg->rank;
+        next if $rk > $endrank || $rk < $startrank;
+        my $key = $rk . "||" . $rdg->text;
         if( exists $unique_rank_rdg{$key} ) {
                # Make sure they don't have different grammatical forms
                        my $ur = $unique_rank_rdg{$key};
                if( $rdg->is_identical( $ur ) ) {
-                               # Combine!
-                               #say STDERR "Combining readings at same rank: $key";
-                               $changed = 1;
-                               $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
-                               # TODO see if this now makes a common point.
+                               push( @pairs, [ $ur, $rdg ] );
                        }
         } else {
             $unique_rank_rdg{$key} = $rdg;
         }
-    }
-    # If we merged readings, the ranks are still fine but the alignment
-    # table is wrong. Wipe it.
-    $self->wipe_table() if $changed;
+    }  
+    
+    return @pairs;
 }