Merge branch 'master' of github.com:tla/stemmatology

[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm

index 0f6e657..bd3b58d 100644 (file)
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -1,5 +1,6 @@
 package Text::Tradition::Collation;
 
+use feature 'say';
 use Encode qw( decode_utf8 );
 use File::Temp;
 use File::Which;
@@ -274,8 +275,10 @@ See L<Text::Tradition::Collation::Relationship> for the available options.
 sub BUILD {
     my $self = shift;
     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
-    $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
-    $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
+    $self->_set_start( $self->add_reading( 
+       { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
+    $self->_set_end( $self->add_reading( 
+       { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 }
 
 ### Reading construct/destruct functions
@@ -284,7 +287,11 @@ sub add_reading {
        my( $self, $reading ) = @_;
        unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
                my %args = %$reading;
-               if( $self->tradition->has_language && !exists $args{'language'} ) {
+               if( $args{'init'} ) {
+                       # If we are initializing an empty collation, don't assume that we
+                       # have set a tradition.
+                       delete $args{'init'};
+               } elsif( $self->tradition->has_language && !exists $args{'language'} ) {
                        $args{'language'} = $self->tradition->language;
                }
                $reading = Text::Tradition::Collation::Reading->new( 
@@ -299,10 +306,7 @@ sub add_reading {
        $self->_add_reading( $reading->id => $reading );
        # Once the reading has been added, put it in both graphs.
        $self->sequence->add_vertex( $reading->id );
-       # All meta readings save 'start' and 'end' get disregarded for relationships.
-       unless( $reading->is_nonrel ) {
-               $self->relations->add_reading( $reading->id );
-       }
+       $self->relations->add_reading( $reading->id );
        return $reading;
 };
 
@@ -311,19 +315,17 @@ around del_reading => sub {
        my $self = shift;
        my $arg = shift;
        
-       unless( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
-               $arg = $self->reading( $arg )
+       if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
+               $arg = $arg->id;
        }
-       my $argid = $arg->id;
        # Remove the reading from the graphs.
        $self->_graphcalc_done(0);
        $self->_clear_cache; # Explicitly clear caches to GC the reading
-       $self->sequence->delete_vertex( $argid );
-       $self->relations->delete_reading( $argid )
-               unless $arg->is_nonrel;
+       $self->sequence->delete_vertex( $arg );
+       $self->relations->delete_reading( $arg );
        
        # Carry on.
-       $self->$orig( $argid );
+       $self->$orig( $arg );
 };
 
 =begin testing
@@ -404,21 +406,80 @@ sub merge_readings {
                @wits{keys %$fwits} = values %$fwits;
                $self->sequence->set_edge_attributes( @vector, \%wits );
        }
-       $self->relations->merge_readings( $kept, $deleted, $combine )
-               unless $mergemeta;
+       $self->relations->merge_readings( $kept, $deleted, $combine );
        
        # Do the deletion deed.
        if( $combine ) {
+               # Combine the text of the readings
                my $joinstr = $combine_char;
                unless( defined $joinstr ) {
                        $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
                        $joinstr = $self->wordsep unless defined $joinstr;
                }
                $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
+               $kept_obj->normal_form( 
+                       join( $joinstr, $kept_obj->normal_form, $del_obj->normal_form ) );
+               # Combine the lexemes present in the readings
+               if( $kept_obj->has_lexemes && $del_obj->has_lexemes ) {
+                       $kept_obj->add_lexeme( $del_obj->lexemes );
+               }
        }
        $self->del_reading( $deleted );
 }
 
+=head2 compress_readings
+
+Where possible in the graph, compresses plain sequences of readings into a
+single reading. The sequences must consist of readings with no
+relationships to other readings, with only a single witness path between
+them and no other witness paths from either that would skip the other. The
+readings must also not be marked as nonsense or bad grammar.
+
+WARNING: This operation cannot be undone.
+
+=cut
+
+sub compress_readings {
+       my $self = shift;
+       # Anywhere in the graph that there is a reading that joins only to a single
+       # successor, and neither of these have any relationships, just join the two
+       # readings.
+       my %gobbled;
+       foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
+               next if $rdg->is_meta;
+               next if $gobbled{$rdg->id};
+               next if $rdg->grammar_invalid || $rdg->is_nonsense;
+               next if $rdg->related_readings();
+               my %seen;
+               while( $self->sequence->successors( $rdg ) == 1 ) {
+                       my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
+                       throw( "Infinite loop" ) if $seen{$next->id};
+                       $seen{$next->id} = 1;
+                       last if $self->sequence->predecessors( $next ) > 1;
+                       last if $next->is_meta;
+                       last if $next->grammar_invalid || $next->is_nonsense;
+                       last if $next->related_readings();
+                       say "Joining readings $rdg and $next";
+                       $self->merge_readings( $rdg, $next, 1 );
+               }
+       }
+       # Make sure we haven't screwed anything up
+       foreach my $wit ( $self->tradition->witnesses ) {
+               my $pathtext = $self->path_text( $wit->sigil );
+               my $origtext = join( ' ', @{$wit->text} );
+               throw( "Text differs for witness " . $wit->sigil )
+                       unless $pathtext eq $origtext;
+               if( $wit->is_layered ) {
+                       $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
+                       $origtext = join( ' ', @{$wit->layertext} );
+                       throw( "Ante-corr text differs for witness " . $wit->sigil )
+                               unless $pathtext eq $origtext;
+               }
+       }
+
+       $self->relations->rebuild_equivalence();
+       $self->calculate_ranks();
+}
 
 # Helper function for manipulating the graph.
 sub _stringify_args {
@@ -446,7 +507,7 @@ sub add_path {
 
        # We only need the IDs for adding paths to the graph, not the reading
        # objects themselves.
-    my( $source, $target, $wit ) = $self->_objectify_args( @_ );
+    my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 
        $self->_graphcalc_done(0);
        # Connect the readings
@@ -729,23 +790,28 @@ sub as_dot {
                        $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n", 
                                $edge->[0], $edge->[1], $varopts );
         } elsif( $used{$edge->[0]} ) {
-               $subend{$edge->[0]} = 1;
+               $subend{$edge->[0]} = $edge->[1];
         } elsif( $used{$edge->[1]} ) {
-               $substart{$edge->[1]} = 1;
+               $substart{$edge->[1]} = $edge->[0];
         }
     }
     # Add substitute start and end edges if necessary
     foreach my $node ( keys %substart ) {
-       my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
+       my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
+       my $nrdg = $self->reading( $node );
+       if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
+               # Substart is actually one lower than $startrank
+               $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
+       }       
         my $varopts = _dot_attr_string( $variables );
-        $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;";
+        $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
        }
     foreach my $node ( keys %subend ) {
-       my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
+       my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
         my $varopts = _dot_attr_string( $variables );
-        $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;";
+        $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
        }
        # HACK part 2
        if( $STRAIGHTENHACK ) {
@@ -1222,7 +1288,7 @@ sub alignment_table {
     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
     my @all_pos = ( 1 .. $self->end->rank - 1 );
     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
-        # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
+        # say STDERR "Making witness row(s) for " . $wit->sigil;
         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
         my @row = _make_witness_row( \@wit_path, \@all_pos );
         push( @{$table->{'alignment'}}, 
@@ -1247,8 +1313,8 @@ sub _make_witness_row {
     foreach my $rdg ( @$path ) {
         my $rtext = $rdg->text;
         $rtext = '#LACUNA#' if $rdg->is_lacuna;
-        print STDERR "rank " . $rdg->rank . "\n" if $debug;
-        # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
+        say STDERR "rank " . $rdg->rank if $debug;
+        # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
         $char_hash{$rdg->rank} = { 't' => $rdg };
     }
     my @row = map { $char_hash{$_} } @$positions;
@@ -1472,7 +1538,7 @@ Call make_witness_path for all witnesses in the tradition.
 sub make_witness_paths {
     my( $self ) = @_;
     foreach my $wit ( $self->tradition->witnesses ) {
-        # print STDERR "Making path for " . $wit->sigil . "\n";
+        # say STDERR "Making path for " . $wit->sigil;
         $self->make_witness_path( $wit );
     }
 }
@@ -1544,17 +1610,7 @@ sub calculate_ranks {
 
     # Transfer our rankings from the topological graph to the real one.
     foreach my $r ( $self->readings ) {
-        if( $r->is_nonrel ) {
-               # These are not in the equivalence graph.  Grab the rank of the highest
-               # predecessor + 1.
-               my @preds = $self->sequence->predecessors( $r );
-               my $mrank = 0;
-               map { my $rk = $node_ranks->{$self->equivalence( $_ )} + 1;
-                       $mrank = $rk > $mrank ? $rk : $mrank; } 
-                       $self->sequence->predecessors( $r );
-               throw( "All predecessors of $r unranked!" ) unless $mrank;
-               $r->rank( $mrank );
-        } elsif( defined $node_ranks->{$self->equivalence( $r->id )} ) {
+        if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
         } else {
                # Die. Find the last rank we calculated.
@@ -1613,7 +1669,7 @@ sub flatten_ranks {
                                next;
                        }
             # Combine!
-               print STDERR "Combining readings at same rank: $key\n";
+               #say STDERR "Combining readings at same rank: $key";
                $changed = 1;
             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
             # TODO see if this now makes a common point.
@@ -1775,7 +1831,7 @@ sub _common_in_path {
        my @last_r2 = ( $r2 );
        # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
        my %all_seen;
-       # print STDERR "Finding common $dir for $r1, $r2\n";
+       # say STDERR "Finding common $dir for $r1, $r2";
        while( !@candidates ) {
                last unless $iter--;  # Avoid looping infinitely
                # Iterate separately down the graph from r1 and r2
@@ -1783,7 +1839,7 @@ sub _common_in_path {
                foreach my $lc ( @last_r1 ) {
                        foreach my $p ( $lc->$dir ) {
                                if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
-                                       # print STDERR "Path candidate $p from $lc\n";
+                                       # say STDERR "Path candidate $p from $lc";
                                        push( @candidates, $p );
                                } elsif( !$all_seen{$p->id} ) {
                                        $all_seen{$p->id} = 'r1';
@@ -1794,7 +1850,7 @@ sub _common_in_path {
                foreach my $lc ( @last_r2 ) {
                        foreach my $p ( $lc->$dir ) {
                                if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
-                                       # print STDERR "Path candidate $p from $lc\n";
+                                       # say STDERR "Path candidate $p from $lc";
                                        push( @candidates, $p );
                                } elsif( !$all_seen{$p->id} ) {
                                        $all_seen{$p->id} = 'r2';