migrate lexemes and normal forms when readings are combined; script to merge readings...
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
index d500390..d4395a7 100644 (file)
@@ -274,8 +274,10 @@ See L<Text::Tradition::Collation::Relationship> for the available options.
 sub BUILD {
     my $self = shift;
     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
-    $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
-    $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
+    $self->_set_start( $self->add_reading( 
+       { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
+    $self->_set_end( $self->add_reading( 
+       { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 }
 
 ### Reading construct/destruct functions
@@ -284,7 +286,11 @@ sub add_reading {
        my( $self, $reading ) = @_;
        unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
                my %args = %$reading;
-               if( $self->tradition->has_language && !exists $args{'language'} ) {
+               if( $args{'init'} ) {
+                       # If we are initializing an empty collation, don't assume that we
+                       # have set a tradition.
+                       delete $args{'init'};
+               } elsif( $self->tradition->has_language && !exists $args{'language'} ) {
                        $args{'language'} = $self->tradition->language;
                }
                $reading = Text::Tradition::Collation::Reading->new( 
@@ -299,10 +305,7 @@ sub add_reading {
        $self->_add_reading( $reading->id => $reading );
        # Once the reading has been added, put it in both graphs.
        $self->sequence->add_vertex( $reading->id );
-       # All meta readings save 'start' and 'end' get disregarded for relationships.
-       unless( $reading->is_nonrel ) {
-               $self->relations->add_reading( $reading->id );
-       }
+       $self->relations->add_reading( $reading->id );
        return $reading;
 };
 
@@ -311,19 +314,17 @@ around del_reading => sub {
        my $self = shift;
        my $arg = shift;
        
-       unless( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
-               $arg = $self->reading( $arg )
+       if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
+               $arg = $arg->id;
        }
-       my $argid = $arg->id;
        # Remove the reading from the graphs.
        $self->_graphcalc_done(0);
        $self->_clear_cache; # Explicitly clear caches to GC the reading
-       $self->sequence->delete_vertex( $argid );
-       $self->relations->delete_reading( $argid )
-               unless $arg->is_nonrel;
+       $self->sequence->delete_vertex( $arg );
+       $self->relations->delete_reading( $arg );
        
        # Carry on.
-       $self->$orig( $argid );
+       $self->$orig( $arg );
 };
 
 =begin testing
@@ -404,17 +405,23 @@ sub merge_readings {
                @wits{keys %$fwits} = values %$fwits;
                $self->sequence->set_edge_attributes( @vector, \%wits );
        }
-       $self->relations->merge_readings( $kept, $deleted, $combine )
-               unless $mergemeta;
+       $self->relations->merge_readings( $kept, $deleted, $combine );
        
        # Do the deletion deed.
        if( $combine ) {
+               # Combine the text of the readings
                my $joinstr = $combine_char;
                unless( defined $joinstr ) {
                        $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
                        $joinstr = $self->wordsep unless defined $joinstr;
                }
                $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
+               $kept_obj->normal_form( 
+                       join( $joinstr, $kept_obj->normal_form, $del_obj->normal_form ) );
+               # Combine the lexemes present in the readings
+               if( $kept_obj->has_lexemes && $del_obj->has_lexemes ) {
+                       $kept_obj->add_lexeme( $del_obj->lexemes );
+               }
        }
        $self->del_reading( $deleted );
 }
@@ -446,7 +453,7 @@ sub add_path {
 
        # We only need the IDs for adding paths to the graph, not the reading
        # objects themselves.
-    my( $source, $target, $wit ) = $self->_objectify_args( @_ );
+    my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 
        $self->_graphcalc_done(0);
        # Connect the readings
@@ -729,23 +736,28 @@ sub as_dot {
                        $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n", 
                                $edge->[0], $edge->[1], $varopts );
         } elsif( $used{$edge->[0]} ) {
-               $subend{$edge->[0]} = 1;
+               $subend{$edge->[0]} = $edge->[1];
         } elsif( $used{$edge->[1]} ) {
-               $substart{$edge->[1]} = 1;
+               $substart{$edge->[1]} = $edge->[0];
         }
     }
     # Add substitute start and end edges if necessary
     foreach my $node ( keys %substart ) {
-       my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
+       my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
+       my $nrdg = $self->reading( $node );
+       if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
+               # __SUBSTART__ sits at rank $startrank - 1, so measure minlen from there
+               $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
+       }       
         my $varopts = _dot_attr_string( $variables );
-        $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;";
+        $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
        }
     foreach my $node ( keys %subend ) {
-       my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
+       my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
         my $varopts = _dot_attr_string( $variables );
-        $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;";
+        $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
        }
        # HACK part 2
        if( $STRAIGHTENHACK ) {
@@ -1095,6 +1107,8 @@ sub as_graphml {
                                # serialize them. Otherwise set nval to undef so that the
                                # key is excluded from this reading.
                        $nval = $nval ? $n->_serialize_lexemes : undef;
+               } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
+                       $nval = undef;
                }
                if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
                        # Adjust the ranks within the subgraph.
@@ -1542,17 +1556,7 @@ sub calculate_ranks {
 
     # Transfer our rankings from the topological graph to the real one.
     foreach my $r ( $self->readings ) {
-        if( $r->is_nonrel ) {
-               # These are not in the equivalence graph.  Grab the rank of the highest
-               # predecessor + 1.
-               my @preds = $self->sequence->predecessors( $r );
-               my $mrank = 0;
-               map { my $rk = $node_ranks->{$self->equivalence( $_ )} + 1;
-                       $mrank = $rk > $mrank ? $rk : $mrank; } 
-                       $self->sequence->predecessors( $r );
-               throw( "All predecessors of $r unranked!" ) unless $mrank;
-               $r->rank( $mrank );
-        } elsif( defined $node_ranks->{$self->equivalence( $r->id )} ) {
+        if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
         } else {
                # Die. Find the last rank we calculated.
@@ -1601,8 +1605,17 @@ sub flatten_ranks {
         next unless $rdg->has_rank;
         my $key = $rdg->rank . "||" . $rdg->text;
         if( exists $unique_rank_rdg{$key} ) {
+               # Make sure they don't have different grammatical forms
+                       my $ur = $unique_rank_rdg{$key};
+                       if( $rdg->disambiguated && $ur->disambiguated ) {
+                               my $rform = join( '//', map { $_->form->to_string } $rdg->lexemes );
+                               my $uform = join( '//', map { $_->form->to_string } $ur->lexemes );
+                               next unless $rform eq $uform;
+                       } elsif( $rdg->disambiguated xor $ur->disambiguated ) {
+                               next;
+                       }
             # Combine!
-               # print STDERR "Combining readings at same rank: $key\n";
+               #print STDERR "Combining readings at same rank: $key\n";
                $changed = 1;
             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
             # TODO see if this now makes a common point.