X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FCollation.pm;h=7b261359f263c341f8b561cc6d7be6417e203d2c;hb=d3e7842a9402304b1b701c2a72db001b324f1f2f;hp=786e86274459c01ced6a18ac7af908140d47017f;hpb=5f08271655e5441574dcb9aae795e239ebf229e3;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 786e862..7b26135 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -5,10 +5,11 @@ use File::Temp; use File::Which; use Graph; use IPC::Run qw( run binary ); -use Text::CSV_XS; +use Text::CSV; use Text::Tradition::Collation::Reading; use Text::Tradition::Collation::RelationshipStore; use Text::Tradition::Error; +use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx ); use XML::LibXML; use XML::LibXML::XPathContext; use Moose; @@ -30,6 +31,8 @@ has 'relations' => ( related_readings => 'related_readings', get_relationship => 'get_relationship', del_relationship => 'del_relationship', + equivalence => 'equivalence', + equivalence_graph => 'equivalence_graph', }, writer => '_set_relations', ); @@ -37,6 +40,7 @@ has 'relations' => ( has 'tradition' => ( is => 'ro', isa => 'Text::Tradition', + writer => '_set_tradition', weak_ref => 1, ); @@ -427,10 +431,13 @@ sub add_path { $self->_graphcalc_done(0); # Connect the readings - $self->sequence->add_edge( $source, $target ); + unless( $self->sequence->has_edge( $source, $target ) ) { + $self->sequence->add_edge( $source, $target ); + $self->relations->add_equivalence_edge( $source, $target ); + } # Note the witness in question $self->sequence->set_edge_attribute( $source, $target, $wit, 1 ); -}; +} sub del_path { my $self = shift; @@ -452,6 +459,7 @@ sub del_path { } unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) { $self->sequence->delete_edge( $source, $target ); + $self->relations->delete_equivalence_edge( $source, $target ); } } @@ -494,8 +502,7 @@ sub clear_witness { sub add_relationship { my $self = shift; my( $source, $target, $opts ) = $self->_stringify_args( @_ ); - my( @vectors ) = $self->relations->add_relationship( $source, - $self->reading( $source ), $target, $self->reading( $target ), $opts ); + my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts ); $self->_graphcalc_done(0); return @vectors; } @@ -881,9 +888,25 @@ is( scalar $st->collation->relationships, 3, "Reparsed collation has new relatio =cut sub as_graphml { - my( $self ) = @_; + my( $self, $options ) = @_; $self->calculate_ranks unless $self->_graphcalc_done; + my $start = $options->{'from'} + ? $self->reading( $options->{'from'} ) : $self->start; + my $end = $options->{'to'} + ? $self->reading( $options->{'to'} ) : $self->end; + if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) { + throw( 'Start node must be before end node' ); + } + # The readings need to be ranked for this to work. + $start = $self->start unless $start->has_rank; + $end = $self->end unless $end->has_rank; + my $rankoffset = 0; + unless( $start eq $self->start ) { + $rankoffset = $start->rank - 1; + } + my %use_readings; + # Some namespaces my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns'; my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance'; @@ -984,14 +1007,19 @@ sub as_graphml { $key->setAttribute( 'id', $edge_data_keys{$datum} ); } - # Add the collation graph itself + # Add the collation graph itself. First, sanitize the name to a valid XML ID. + my $xmlidname = $self->tradition->name; + $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g; + if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) { + $xmlidname = '_'.$xmlidname; + } my $sgraph = $root->addNewChild( $graphml_ns, 'graph' ); $sgraph->setAttribute( 'edgedefault', 'directed' ); - $sgraph->setAttribute( 'id', $self->tradition->name ); + $sgraph->setAttribute( 'id', $xmlidname ); $sgraph->setAttribute( 'parse.edgeids', 'canonical' ); - $sgraph->setAttribute( 'parse.edges', scalar($self->paths) ); + $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later $sgraph->setAttribute( 'parse.nodeids', 'canonical' ); - $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) ); + $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later $sgraph->setAttribute( 'parse.order', 'nodesfirst' ); # Collation attribute data @@ -1011,6 +1039,9 @@ sub as_graphml { my %node_hash; # Add our readings to the graph foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) { + next if $n->has_rank && $n ne $self->start && $n ne $self->end && + ( $n->rank < $start->rank || $n->rank > $end->rank ); + $use_readings{$n->id} = 1; # Add to the main graph my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' ); my $node_xmlid = 'n' . $node_ctr++; @@ -1018,6 +1049,11 @@ sub as_graphml { $node_el->setAttribute( 'id', $node_xmlid ); foreach my $d ( keys %reading_attributes ) { my $nval = $n->$d; + if( $rankoffset && $d eq 'rank' && $n ne $self->start ) { + # Adjust the ranks within the subgraph. + $nval = $n eq $self->end ? $end->rank - $rankoffset + 1 + : $nval - $rankoffset; + } _add_graphml_data( $node_el, $node_data_keys{$d}, $nval ) if defined $nval; } @@ -1027,7 +1063,13 @@ sub as_graphml { my $edge_ctr = 0; foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) { # We add an edge in the graphml for every witness in $e. - foreach my $wit ( sort $self->path_witnesses( $e ) ) { + next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} ); + my @edge_wits = sort $self->path_witnesses( $e ); + $e->[0] = $self->start->id unless $use_readings{$e->[0]}; + $e->[1] = $self->end->id unless $use_readings{$e->[1]}; + # Skip any path from start to end; that witness is not in the subgraph. + next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id ); + foreach my $wit ( @edge_wits ) { my( $id, $from, $to ) = ( 'e'.$edge_ctr++, $node_hash{ $e->[0] }, $node_hash{ $e->[1] } ); @@ -1051,6 +1093,10 @@ sub as_graphml { } } + # Report the actual number of nodes and edges that went in + $sgraph->setAttribute( 'parse.edges', $edge_ctr ); + $sgraph->setAttribute( 'parse.nodes', $node_ctr ); + # Add the relationship graph to the XML map { delete $edge_data_keys{$_} } @path_attributes; $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash, @@ -1079,7 +1125,7 @@ row per witness (or witness uncorrected.) sub as_csv { my( $self ) = @_; my $table = $self->alignment_table; - my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); + my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } ); my @result; # Make the header row $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} ); @@ -1325,7 +1371,7 @@ sub common_readings { return @common; } -=head2 path_text( $sigil, $mainsigil [, $start, $end ] ) +=head2 path_text( $sigil, [, $start, $end ] ) Returns the text of a witness (plus its backup, if we are using a layer) as stored in the collation. The text is returned as a string, where the @@ -1340,6 +1386,13 @@ sub path_text { $start = $self->start unless $start; $end = $self->end unless $end; my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit ); + return _text_from_path( @path ); +} + +# Utility function so that we can cheat and use it when we need both the path +# and its text. +sub _text_from_path { + my( $self, @path ) = @_; my $pathtext = ''; my $last; foreach my $r ( @path ) { @@ -1387,11 +1440,16 @@ sub make_witness_path { my( $self, $wit ) = @_; my @chain = @{$wit->path}; my $sig = $wit->sigil; + # Add start and end if necessary + unshift( @chain, $self->start ) unless $chain[0] eq $self->start; + push( @chain, $self->end ) unless $chain[-1] eq $self->end; foreach my $idx ( 0 .. $#chain-1 ) { $self->add_path( $chain[$idx], $chain[$idx+1], $sig ); } if( $wit->is_layered ) { @chain = @{$wit->uncorrected_path}; + unshift( @chain, $self->start ) unless $chain[0] eq $self->start; + push( @chain, $self->end ) unless $chain[-1] eq $self->end; foreach my $idx( 0 .. $#chain-1 ) { my $source = $chain[$idx]; my $target = $chain[$idx+1]; @@ -1437,58 +1495,26 @@ sub calculate_ranks { my $self = shift; # Save the existing ranks, in case we need to invalidate the cached SVG. my %existing_ranks; - # Walk a version of the graph where every node linked by a relationship - # edge is fundamentally the same node, and do a topological ranking on - # the nodes in this graph. - my $topo_graph = Graph->new(); - my %rel_containers; - my $rel_ctr = 0; - # Add the nodes - foreach my $r ( $self->readings ) { - next if exists $rel_containers{$r->id}; - my @rels = $r->related_readings( 'colocated' ); - if( @rels ) { - # Make a relationship container. - push( @rels, $r ); - my $rn = 'rel_container_' . $rel_ctr++; - $topo_graph->add_vertex( $rn ); - foreach( @rels ) { - $rel_containers{$_->id} = $rn; - } - } else { - # Add a new node to mirror the old node. - $rel_containers{$r->id} = $r->id; - $topo_graph->add_vertex( $r->id ); - } - } + map { $existing_ranks{$_} = $_->rank } $self->readings; - # Add the edges. - foreach my $r ( $self->readings ) { - $existing_ranks{$r} = $r->rank; - foreach my $n ( $self->sequence->successors( $r->id ) ) { - my( $tfrom, $tto ) = ( $rel_containers{$r->id}, - $rel_containers{$n} ); - # $DB::single = 1 unless $tfrom && $tto; - $topo_graph->add_edge( $tfrom, $tto ); - } - } - - # Now do the rankings, starting with the start node. - my $topo_start = $rel_containers{$self->start->id}; + # Do the rankings based on the relationship equivalence graph, starting + # with the start node. + my $topo_start = $self->equivalence( $self->start->id ); my $node_ranks = { $topo_start => 0 }; my @curr_origin = ( $topo_start ); # A little iterative function. while( @curr_origin ) { - @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin ); + @curr_origin = _assign_rank( $self->equivalence_graph, + $node_ranks, @curr_origin ); } # Transfer our rankings from the topological graph to the real one. foreach my $r ( $self->readings ) { - if( defined $node_ranks->{$rel_containers{$r->id}} ) { - $r->rank( $node_ranks->{$rel_containers{$r->id}} ); + if( defined $node_ranks->{$self->equivalence( $r->id )} ) { + $r->rank( $node_ranks->{$self->equivalence( $r->id )} ); } else { # Die. Find the last rank we calculated. - my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}} - <=> $node_ranks->{$rel_containers{$b->id}} } + my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 ) + <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) } $self->readings; my $last = pop @all_defined; throw( "Ranks not calculated after $last - do you have a cycle in the graph?" ); @@ -1674,10 +1700,12 @@ sub text_from_paths { =head2 common_predecessor( $reading_a, $reading_b ) Find the last reading that occurs in sequence before both the given readings. +At the very least this should be $self->start. =head2 common_successor( $reading_a, $reading_b ) Find the first reading that occurs in sequence after both the given readings. +At the very least this should be $self->end. =begin testing @@ -1718,26 +1746,45 @@ sub common_successor { return $self->_common_in_path( $r1, $r2, 'successors' ); } + +# TODO think about how to do this without ranks... sub _common_in_path { my( $self, $r1, $r2, $dir ) = @_; - my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank; - $iter = $self->end->rank - $iter if $dir eq 'successors'; + my $iter = $self->end->rank; my @candidates; - my @last_checked = ( $r1, $r2 ); + my @last_r1 = ( $r1 ); + my @last_r2 = ( $r2 ); + # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' ); my %all_seen; + # print STDERR "Finding common $dir for $r1, $r2\n"; while( !@candidates ) { - my @new_lc; - foreach my $lc ( @last_checked ) { + last unless $iter--; # Avoid looping infinitely + # Iterate separately down the graph from r1 and r2 + my( @new_lc1, @new_lc2 ); + foreach my $lc ( @last_r1 ) { + foreach my $p ( $lc->$dir ) { + if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) { + # print STDERR "Path candidate $p from $lc\n"; + push( @candidates, $p ); + } elsif( !$all_seen{$p->id} ) { + $all_seen{$p->id} = 'r1'; + push( @new_lc1, $p ); + } + } + } + foreach my $lc ( @last_r2 ) { foreach my $p ( $lc->$dir ) { - if( $all_seen{$p->id} ) { + if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) { + # print STDERR "Path candidate $p from $lc\n"; push( @candidates, $p ); - } else { - $all_seen{$p->id} = 1; - push( @new_lc, $p ); + } elsif( !$all_seen{$p->id} ) { + $all_seen{$p->id} = 'r2'; + push( @new_lc2, $p ); } } } - @last_checked = @new_lc; + @last_r1 = @new_lc1; + @last_r2 = @new_lc2; } my @answer = sort { $a->rank <=> $b->rank } @candidates; return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );