load XML::LibXML only when required; handle global relationships more correctly;...

[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm

index d500390..dda8485 100644 (file)
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -1,5 +1,6 @@
 package Text::Tradition::Collation;
 
+use feature 'say';
 use Encode qw( decode_utf8 );
 use File::Temp;
 use File::Which;
@@ -10,8 +11,6 @@ use Text::Tradition::Collation::Reading;
 use Text::Tradition::Collation::RelationshipStore;
 use Text::Tradition::Error;
 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
-use XML::LibXML;
-use XML::LibXML::XPathContext;
 use Moose;
 
 has 'sequence' => (
@@ -274,8 +273,10 @@ See L<Text::Tradition::Collation::Relationship> for the available options.
 sub BUILD {
     my $self = shift;
     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
-    $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
-    $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
+    $self->_set_start( $self->add_reading( 
+       { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
+    $self->_set_end( $self->add_reading( 
+       { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 }
 
 ### Reading construct/destruct functions
@@ -284,7 +285,11 @@ sub add_reading {
        my( $self, $reading ) = @_;
        unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
                my %args = %$reading;
-               if( $self->tradition->has_language && !exists $args{'language'} ) {
+               if( $args{'init'} ) {
+                       # If we are initializing an empty collation, don't assume that we
+                       # have set a tradition.
+                       delete $args{'init'};
+               } elsif( $self->tradition->has_language && !exists $args{'language'} ) {
                        $args{'language'} = $self->tradition->language;
                }
                $reading = Text::Tradition::Collation::Reading->new( 
@@ -299,10 +304,7 @@ sub add_reading {
        $self->_add_reading( $reading->id => $reading );
        # Once the reading has been added, put it in both graphs.
        $self->sequence->add_vertex( $reading->id );
-       # All meta readings save 'start' and 'end' get disregarded for relationships.
-       unless( $reading->is_nonrel ) {
-               $self->relations->add_reading( $reading->id );
-       }
+       $self->relations->add_reading( $reading->id );
        return $reading;
 };
 
@@ -311,19 +313,17 @@ around del_reading => sub {
        my $self = shift;
        my $arg = shift;
        
-       unless( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
-               $arg = $self->reading( $arg )
+       if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
+               $arg = $arg->id;
        }
-       my $argid = $arg->id;
        # Remove the reading from the graphs.
        $self->_graphcalc_done(0);
        $self->_clear_cache; # Explicitly clear caches to GC the reading
-       $self->sequence->delete_vertex( $argid );
-       $self->relations->delete_reading( $argid )
-               unless $arg->is_nonrel;
+       $self->sequence->delete_vertex( $arg );
+       $self->relations->delete_reading( $arg );
        
        # Carry on.
-       $self->$orig( $argid );
+       $self->$orig( $arg );
 };
 
 =begin testing
@@ -404,21 +404,91 @@ sub merge_readings {
                @wits{keys %$fwits} = values %$fwits;
                $self->sequence->set_edge_attributes( @vector, \%wits );
        }
-       $self->relations->merge_readings( $kept, $deleted, $combine )
-               unless $mergemeta;
+       $self->relations->merge_readings( $kept, $deleted, $combine );
        
        # Do the deletion deed.
        if( $combine ) {
+               # Combine the text of the readings
                my $joinstr = $combine_char;
                unless( defined $joinstr ) {
                        $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
                        $joinstr = $self->wordsep unless defined $joinstr;
                }
                $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
+               # Change this reading to a joining one if necessary
+               $kept_obj->_set_join_next( $del_obj->join_next );
+               $kept_obj->normal_form( 
+                       join( $joinstr, $kept_obj->normal_form, $del_obj->normal_form ) );
+               # Combine the lexemes present in the readings
+               if( $kept_obj->has_lexemes && $del_obj->has_lexemes ) {
+                       $kept_obj->add_lexeme( $del_obj->lexemes );
+               }
        }
        $self->del_reading( $deleted );
 }
 
+=head2 compress_readings
+
+Where possible in the graph, compresses plain sequences of readings into a
+single reading. The sequences must consist of readings with no
+relationships to other readings, with only a single witness path between
+them and no other witness paths from either that would skip the other. The
+readings must also not be marked as nonsense or bad grammar.
+
+WARNING: This operation cannot be undone.
+
+=cut
+
+sub compress_readings {
+       my $self = shift;
+       # Anywhere in the graph that there is a reading that joins only to a single
+       # successor, and neither of these have any relationships, just join the two
+       # readings.
+       my %gobbled;
+       foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
+               # While we are here, get rid of any extra wordforms from a disambiguated
+               # reading.
+               if( $rdg->disambiguated ) {
+                       foreach my $lex ( $rdg->lexemes ) {
+                               $lex->clear_matching_forms();
+                               $lex->add_matching_form( $lex->form );
+                       }
+               }
+               # Now look for readings that can be joined to their successors.
+               next if $rdg->is_meta;
+               next if $gobbled{$rdg->id};
+               next if $rdg->grammar_invalid || $rdg->is_nonsense;
+               next if $rdg->related_readings();
+               my %seen;
+               while( $self->sequence->successors( $rdg ) == 1 ) {
+                       my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
+                       throw( "Infinite loop" ) if $seen{$next->id};
+                       $seen{$next->id} = 1;
+                       last if $self->sequence->predecessors( $next ) > 1;
+                       last if $next->is_meta;
+                       last if $next->grammar_invalid || $next->is_nonsense;
+                       last if $next->related_readings();
+                       say "Joining readings $rdg and $next";
+                       $self->merge_readings( $rdg, $next, 1 );
+               }
+       }
+       # Make sure we haven't screwed anything up
+       foreach my $wit ( $self->tradition->witnesses ) {
+               my $pathtext = $self->path_text( $wit->sigil );
+               my $origtext = join( ' ', @{$wit->text} );
+               throw( "Text differs for witness " . $wit->sigil )
+                       unless $pathtext eq $origtext;
+               if( $wit->is_layered ) {
+                       $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
+                       $origtext = join( ' ', @{$wit->layertext} );
+                       throw( "Ante-corr text differs for witness " . $wit->sigil )
+                               unless $pathtext eq $origtext;
+               }
+       }
+
+       $self->relations->rebuild_equivalence();
+       $self->calculate_ranks();
+}
 
 # Helper function for manipulating the graph.
 sub _stringify_args {
@@ -446,7 +516,7 @@ sub add_path {
 
        # We only need the IDs for adding paths to the graph, not the reading
        # objects themselves.
-    my( $source, $target, $wit ) = $self->_objectify_args( @_ );
+    my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 
        $self->_graphcalc_done(0);
        # Connect the readings
@@ -729,23 +799,28 @@ sub as_dot {
                        $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n", 
                                $edge->[0], $edge->[1], $varopts );
         } elsif( $used{$edge->[0]} ) {
-               $subend{$edge->[0]} = 1;
+               $subend{$edge->[0]} = $edge->[1];
         } elsif( $used{$edge->[1]} ) {
-               $substart{$edge->[1]} = 1;
+               $substart{$edge->[1]} = $edge->[0];
         }
     }
     # Add substitute start and end edges if necessary
     foreach my $node ( keys %substart ) {
-       my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
+       my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
+       my $nrdg = $self->reading( $node );
+       if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
+               # Substart is actually one lower than $startrank
+               $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
+       }       
         my $varopts = _dot_attr_string( $variables );
-        $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;";
+        $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
        }
     foreach my $node ( keys %subend ) {
-       my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
+       my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
         my $varopts = _dot_attr_string( $variables );
-        $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;";
+        $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
        }
        # HACK part 2
        if( $STRAIGHTENHACK ) {
@@ -905,13 +980,23 @@ is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all read
 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
 
-# Now add a stemma, write to GraphML, and parse again.
+# Now add a stemma, write to GraphML, and look at the output.
 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
 $graphml = $c->as_graphml;
 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
 
+# Now add a user, write to GraphML, and look at the output.
+unlike( $graphml, qr/testuser/, "Test user name does not exist in GraphML yet" );
+my $testuser = Text::Tradition::User->new( 
+       id => 'testuser', password => 'testpass' );
+is( ref( $testuser ), 'Text::Tradition::User', "Created test user object" );
+$testuser->add_tradition( $tradition );
+is( $tradition->user->id, $testuser->id, "Tradition assigned to test user" );
+$graphml = $c->as_graphml;
+like( $graphml, qr/testuser/, "Test user name now exists in GraphML" );
+
 =end testing
 
 =cut
@@ -943,6 +1028,7 @@ sub as_graphml {
         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 
     # Create the document and root node
+    require XML::LibXML;
     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
     $graphml->setDocumentElement( $root );
@@ -982,14 +1068,25 @@ sub as_graphml {
                next unless $save_types{$attr->type_constraint->name};
                $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
        }
-    # Extra custom key for the tradition stemma(ta)
-    $graph_attributes{'stemmata'} = 'string';
+    # Extra custom keys for complex objects that should be saved in some form.
+    # The subroutine should return a string, or undef/empty.
+    $graph_attributes{'stemmata'} = sub { 
+       my @stemstrs;
+               map { push( @stemstrs, $_->editable( {linesep => ''} ) ) } 
+                       $self->tradition->stemmata;
+               join( "\n", @stemstrs );
+       };
+    $graph_attributes{'user'} = sub { 
+       $self->tradition->user ? $self->tradition->user->id : undef 
+    };
        
     foreach my $datum ( sort keys %graph_attributes ) {
        $graph_data_keys{$datum} = 'dg'.$gdi++;
         my $key = $root->addNewChild( $graphml_ns, 'key' );
+        my $dtype = ref( $graph_attributes{$datum} ) ? 'string' 
+               : $graph_attributes{$datum};
         $key->setAttribute( 'attr.name', $datum );
-        $key->setAttribute( 'attr.type', $graph_attributes{$datum} );
+        $key->setAttribute( 'attr.type', $dtype );
         $key->setAttribute( 'for', 'graph' );
         $key->setAttribute( 'id', $graph_data_keys{$datum} );          
     }
@@ -1062,11 +1159,9 @@ sub as_graphml {
        my $value;
        if( $datum eq 'version' ) {
                $value = '3.2';
-       } elsif( $datum eq 'stemmata' ) {
-               my @stemstrs;
-               map { push( @stemstrs, $_->editable( {linesep => ''} ) ) } 
-                       $self->tradition->stemmata;
-               $value = join( "\n", @stemstrs );
+       } elsif( ref( $graph_attributes{$datum} ) ) {
+               my $sub = $graph_attributes{$datum};
+               $value = &$sub();
        } elsif( $gattr_from{$datum} eq 'Tradition' ) {
                $value = $self->tradition->$datum;
        } else {
@@ -1095,6 +1190,8 @@ sub as_graphml {
                                # serialize them. Otherwise set nval to undef so that the
                                # key is excluded from this reading.
                        $nval = $nval ? $n->_serialize_lexemes : undef;
+               } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
+                       $nval = undef;
                }
                if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
                        # Adjust the ranks within the subgraph.
@@ -1220,7 +1317,7 @@ sub alignment_table {
     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
     my @all_pos = ( 1 .. $self->end->rank - 1 );
     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
-        # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
+        # say STDERR "Making witness row(s) for " . $wit->sigil;
         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
         my @row = _make_witness_row( \@wit_path, \@all_pos );
         push( @{$table->{'alignment'}}, 
@@ -1245,8 +1342,8 @@ sub _make_witness_row {
     foreach my $rdg ( @$path ) {
         my $rtext = $rdg->text;
         $rtext = '#LACUNA#' if $rdg->is_lacuna;
-        print STDERR "rank " . $rdg->rank . "\n" if $debug;
-        # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
+        say STDERR "rank " . $rdg->rank if $debug;
+        # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
         $char_hash{$rdg->rank} = { 't' => $rdg };
     }
     my @row = map { $char_hash{$_} } @$positions;
@@ -1470,7 +1567,7 @@ Call make_witness_path for all witnesses in the tradition.
 sub make_witness_paths {
     my( $self ) = @_;
     foreach my $wit ( $self->tradition->witnesses ) {
-        # print STDERR "Making path for " . $wit->sigil . "\n";
+        # say STDERR "Making path for " . $wit->sigil;
         $self->make_witness_path( $wit );
     }
 }
@@ -1542,17 +1639,7 @@ sub calculate_ranks {
 
     # Transfer our rankings from the topological graph to the real one.
     foreach my $r ( $self->readings ) {
-        if( $r->is_nonrel ) {
-               # These are not in the equivalence graph.  Grab the rank of the highest
-               # predecessor + 1.
-               my @preds = $self->sequence->predecessors( $r );
-               my $mrank = 0;
-               map { my $rk = $node_ranks->{$self->equivalence( $_ )} + 1;
-                       $mrank = $rk > $mrank ? $rk : $mrank; } 
-                       $self->sequence->predecessors( $r );
-               throw( "All predecessors of $r unranked!" ) unless $mrank;
-               $r->rank( $mrank );
-        } elsif( defined $node_ranks->{$self->equivalence( $r->id )} ) {
+        if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
         } else {
                # Die. Find the last rank we calculated.
@@ -1601,8 +1688,17 @@ sub flatten_ranks {
         next unless $rdg->has_rank;
         my $key = $rdg->rank . "||" . $rdg->text;
         if( exists $unique_rank_rdg{$key} ) {
+               # Make sure they don't have different grammatical forms
+                       my $ur = $unique_rank_rdg{$key};
+                       if( $rdg->disambiguated && $ur->disambiguated ) {
+                               my $rform = join( '//', map { $_->form->to_string } $rdg->lexemes );
+                               my $uform = join( '//', map { $_->form->to_string } $ur->lexemes );
+                               next unless $rform eq $uform;
+                       } elsif( $rdg->disambiguated xor $ur->disambiguated ) {
+                               next;
+                       }
             # Combine!
-               # print STDERR "Combining readings at same rank: $key\n";
+               #say STDERR "Combining readings at same rank: $key";
                $changed = 1;
             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
             # TODO see if this now makes a common point.
@@ -1764,7 +1860,7 @@ sub _common_in_path {
        my @last_r2 = ( $r2 );
        # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
        my %all_seen;
-       # print STDERR "Finding common $dir for $r1, $r2\n";
+       # say STDERR "Finding common $dir for $r1, $r2";
        while( !@candidates ) {
                last unless $iter--;  # Avoid looping infinitely
                # Iterate separately down the graph from r1 and r2
@@ -1772,7 +1868,7 @@ sub _common_in_path {
                foreach my $lc ( @last_r1 ) {
                        foreach my $p ( $lc->$dir ) {
                                if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
-                                       # print STDERR "Path candidate $p from $lc\n";
+                                       # say STDERR "Path candidate $p from $lc";
                                        push( @candidates, $p );
                                } elsif( !$all_seen{$p->id} ) {
                                        $all_seen{$p->id} = 'r1';
@@ -1783,7 +1879,7 @@ sub _common_in_path {
                foreach my $lc ( @last_r2 ) {
                        foreach my $p ( $lc->$dir ) {
                                if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
-                                       # print STDERR "Path candidate $p from $lc\n";
+                                       # say STDERR "Path candidate $p from $lc";
                                        push( @candidates, $p );
                                } elsif( !$all_seen{$p->id} ) {
                                        $all_seen{$p->id} = 'r2';