X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FCollation.pm;h=b53e05c91d444abe8ff196257e6d6888136a5d99;hb=b365fbaeead920ab613f87b331a393e72c5d033e;hp=9f40e07951b9d8bf10b0ad6aa4217d8e4048f0ab;hpb=508fd430d4411b209743d91556d5bca5ca89a8b3;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 9f40e07..b53e05c 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -9,6 +9,7 @@ use Text::Tradition::Collation::Reading; use Text::Tradition::Collation::RelationshipStore; use Text::Tradition::Error; use XML::LibXML; +use XML::LibXML::XPathContext; use Moose; has 'sequence' => ( @@ -86,6 +87,13 @@ has 'end' => ( writer => '_set_end', weak_ref => 1, ); + +has 'cached_svg' => ( + is => 'rw', + isa => 'Str', + predicate => 'has_cached_svg', + clearer => 'wipe_svg', + ); =head1 NAME @@ -431,68 +439,56 @@ sub reading_witnesses { =head1 OUTPUT METHODS -=head2 as_svg +=head2 as_svg( \%options ) Returns an SVG string that represents the graph, via as_dot and graphviz. +See as_dot for a list of options. =cut sub as_svg { - my( $self ) = @_; - - my @cmd = qw/dot -Tsvg/; - my( $svg, $err ); - my $dotfile = File::Temp->new(); - ## TODO REMOVE - # $dotfile->unlink_on_destroy(0); - binmode $dotfile, ':utf8'; - print $dotfile $self->as_dot(); - push( @cmd, $dotfile->filename ); - run( \@cmd, ">", binary(), \$svg ); - $svg = decode_utf8( $svg ); - return $svg; + my( $self, $opts ) = @_; + if( !$self->has_cached_svg || $opts->{'recalc'} ) { + my @cmd = qw/dot -Tsvg/; + my( $svg, $err ); + my $dotfile = File::Temp->new(); + ## USE FOR DEBUGGING + # $dotfile->unlink_on_destroy(0); + binmode $dotfile, ':utf8'; + print $dotfile $self->as_dot( $opts ); + push( @cmd, $dotfile->filename ); + run( \@cmd, ">", binary(), \$svg ); + $self->cached_svg( decode_utf8( $svg ) ); + } + return $self->cached_svg; } -=head2 svg_subgraph( $from, $to ) -Returns an SVG string that represents the portion of the graph given by the -specified range. The $from and $to variables refer to ranks within the graph. +=head2 as_dot( \%options ) -=cut +Returns a string that is the collation graph expressed in dot +(i.e. GraphViz) format. Options include: -sub svg_subgraph { - my( $self, $from, $to ) = @_; - - my $dot = $self->as_dot( $from, $to ); - unless( $dot ) { - throw( "Could not output a graph with range $from - $to" ); - } - - my @cmd = qw/dot -Tsvg/; - my( $svg, $err ); - my $dotfile = File::Temp->new(); - ## TODO REMOVE - # $dotfile->unlink_on_destroy(0); - binmode $dotfile, ':utf8'; - print $dotfile $dot; - push( @cmd, $dotfile->filename ); - run( \@cmd, ">", binary(), \$svg ); - $svg = decode_utf8( $svg ); - return $svg; -} +=over 4 +=item * from -=head2 as_dot( $from, $to ) +=item * to -Returns a string that is the collation graph expressed in dot -(i.e. GraphViz) format. If $from or $to is passed, as_dot creates -a subgraph rather than the entire graph. +=item * color_common + +=back =cut sub as_dot { - my( $self, $startrank, $endrank ) = @_; - + my( $self, $opts ) = @_; + my $startrank = $opts->{'from'} if $opts; + my $endrank = $opts->{'to'} if $opts; + my $color_common = $opts->{'color_common'} if $opts; + my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank + && $self->end->rank > 100; + # Check the arguments if( $startrank ) { return if $endrank && $startrank > $endrank; @@ -503,7 +499,6 @@ sub as_dot { $endrank = undef if $endrank == $self->end->rank; } - # TODO consider making some of these things configurable my $graph_name = $self->tradition->name; $graph_name =~ s/[^\w\s]//g; $graph_name = join( '_', split( /\s+/, $graph_name ) ); @@ -513,7 +508,7 @@ sub as_dot { 'bgcolor' => 'none', ); my %node_attrs = ( - 'fontsize' => 11, + 'fontsize' => 14, 'fillcolor' => 'white', 'style' => 'filled', 'shape' => 'ellipse' @@ -535,35 +530,59 @@ sub as_dot { if( $endrank ) { $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n"; } + if( $STRAIGHTENHACK ) { + ## HACK part 1 + $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n"; + $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];" + } my %used; # Keep track of the readings that actually appear in the graph - foreach my $reading ( $self->readings ) { + # Sort the readings by rank if we have ranks; this speeds layout. + my @all_readings = $self->end->has_rank + ? sort { $a->rank <=> $b->rank } $self->readings + : $self->readings; + # TODO Refrain from outputting lacuna nodes - just grey out the edges. + foreach my $reading ( @all_readings ) { # Only output readings within our rank range. next if $startrank && $reading->rank < $startrank; next if $endrank && $reading->rank > $endrank; $used{$reading->id} = 1; # Need not output nodes without separate labels next if $reading->id eq $reading->text; + my $rattrs; my $label = $reading->text; $label =~ s/\"/\\\"/g; - $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label ); + $rattrs->{'label'} = $label; + $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common; + $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) ); } - # Add the real edges + # Add the real edges. Need to weight one edge per rank jump, in a + # continuous line. + # my $weighted = $self->_add_edge_weights; my @edges = $self->paths; my( %substart, %subend ); foreach my $edge ( @edges ) { # Do we need to output this edge? if( $used{$edge->[0]} && $used{$edge->[1]} ) { - my $label = $self->path_display_label( $self->path_witnesses( $edge ) ); + my $label = $self->_path_display_label( $self->path_witnesses( $edge ) ); my $variables = { %edge_attrs, 'label' => $label }; + # Account for the rank gap if necessary - if( $self->reading( $edge->[1] )->has_rank - && $self->reading( $edge->[0] )->has_rank - && $self->reading( $edge->[1] )->rank - - $self->reading( $edge->[0] )->rank > 1 ) { - $variables->{'minlen'} = $self->reading( $edge->[1] )->rank - - $self->reading( $edge->[0] )->rank; + my $rank0 = $self->reading( $edge->[0] )->rank + if $self->reading( $edge->[0] )->has_rank; + my $rank1 = $self->reading( $edge->[1] )->rank + if $self->reading( $edge->[1] )->has_rank; + if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) { + $variables->{'minlen'} = $rank1 - $rank0; } + + # Add the calculated edge weights + # if( exists $weighted->{$edge->[0]} +# && $weighted->{$edge->[0]} eq $edge->[1] ) { +# # $variables->{'color'} = 'red'; +# $variables->{'weight'} = 3.0; +# } + # EXPERIMENTAL: make edge width reflect no. of witnesses my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2; $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit @@ -579,18 +598,22 @@ sub as_dot { } # Add substitute start and end edges if necessary foreach my $node ( keys %substart ) { - my $witstr = $self->path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) ); + my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) ); my $variables = { %edge_attrs, 'label' => $witstr }; my $varopts = _dot_attr_string( $variables ); $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;"; } foreach my $node ( keys %subend ) { - my $witstr = $self->path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) ); + my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) ); my $variables = { %edge_attrs, 'label' => $witstr }; my $varopts = _dot_attr_string( $variables ); $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;"; } - + # HACK part 2 + if( $STRAIGHTENHACK ) { + $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n"; + } + $dot .= "}\n"; return $dot; } @@ -605,6 +628,39 @@ sub _dot_attr_string { return( '[ ' . join( ', ', @attrs ) . ' ]' ); } +sub _add_edge_weights { + my $self = shift; + # Walk the graph from START to END, choosing the successor node with + # the largest number of witness paths each time. + my $weighted = {}; + my $curr = $self->start->id; + my $ranked = $self->end->has_rank; + while( $curr ne $self->end->id ) { + my $rank = $ranked ? $self->reading( $curr )->rank : 0; + my @succ = sort { $self->path_witnesses( $curr, $a ) + <=> $self->path_witnesses( $curr, $b ) } + $self->sequence->successors( $curr ); + my $next = pop @succ; + my $nextrank = $ranked ? $self->reading( $next )->rank : 0; + # Try to avoid lacunae in the weighted path. + while( @succ && + ( $self->reading( $next )->is_lacuna || + $nextrank - $rank > 1 ) ){ + $next = pop @succ; + } + $weighted->{$curr} = $next; + $curr = $next; + } + return $weighted; +} + +=head2 path_witnesses( $edge ) + +Returns the list of sigils whose witnesses are associated with the given edge. +The edge can be passed as either an array or an arrayref of ( $source, $target ). + +=cut + sub path_witnesses { my( $self, @edge ) = @_; # If edge is an arrayref, cope. @@ -616,7 +672,7 @@ sub path_witnesses { return @wits; } -sub path_display_label { +sub _path_display_label { my $self = shift; my @wits = sort @_; my $maj = scalar( $self->tradition->witnesses ) * 0.6; @@ -807,7 +863,7 @@ sub as_graphml { } # Add the relationship graph to the XML - $self->relations->as_graphml( $graphml_ns, $root, \%node_hash, + $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash, $node_data_keys{'id'}, \%edge_data_keys ); # Save and return the thing @@ -870,9 +926,11 @@ keys have a true hash value will be included. sub make_alignment_table { my( $self, $noderefs, $include ) = @_; - unless( $self->linear ) { - throw( "Need a linear graph in order to make an alignment table" ); - } + # Make sure we can do this + throw( "Need a linear graph in order to make an alignment table" ) + unless $self->linear; + $self->calculate_ranks unless $self->end->has_rank; + my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 }; my @all_pos = ( 1 .. $self->end->rank - 1 ); foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) { @@ -1088,6 +1146,19 @@ sub _witnesses_of_label { return @answer; } +=head2 common_readings + +Returns the list of common readings in the graph (i.e. those readings that are +shared by all non-lacunose witnesses.) + +=cut + +sub common_readings { + my $self = shift; + my @common = grep { $_->is_common } $self->readings; + return @common; +} + =head2 path_text( $sigil, $mainsigil [, $start, $end ] ) Returns the text of a witness (plus its backup, if we are using a layer) @@ -1161,10 +1232,36 @@ sub make_witness_path { Calculate the reading ranks (that is, their aligned positions relative to each other) for the graph. This can only be called on linear collations. +=begin testing + +use Text::Tradition; + +my $cxfile = 't/data/Collatex-16.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'CollateX', + 'file' => $cxfile, + ); +my $c = $t->collation; + +# Make an svg +my $svg = $c->as_svg; +is( substr( $svg, 0, 5 ), 'has_cached_svg, "SVG was cached" ); +is( $c->as_svg, $svg, "Cached SVG returned upon second call" ); +$c->calculate_ranks; +is( $c->as_svg, $svg, "Cached SVG retained with no rank change" ); +$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } ); +isnt( $c->as_svg, $svg, "SVG changed after relationship add" ); + +=end testing + =cut sub calculate_ranks { my $self = shift; + # Save the existing ranks, in case we need to invalidate the cached SVG. + my %existing_ranks; # Walk a version of the graph where every node linked by a relationship # edge is fundamentally the same node, and do a topological ranking on # the nodes in this graph. @@ -1192,6 +1289,7 @@ sub calculate_ranks { # Add the edges. foreach my $r ( $self->readings ) { + $existing_ranks{$r} = $r->rank; foreach my $n ( $self->sequence->successors( $r->id ) ) { my( $tfrom, $tto ) = ( $rel_containers{$r->id}, $rel_containers{$n} ); @@ -1221,6 +1319,14 @@ sub calculate_ranks { throw( "Ranks not calculated after $last - do you have a cycle in the graph?" ); } } + # Do we need to invalidate the cached SVG? + if( $self->has_cached_svg ) { + foreach my $r ( $self->readings ) { + next if $existing_ranks{$r} == $r->rank; + $self->wipe_svg; + last; + } + } } sub _assign_rank { @@ -1275,12 +1381,104 @@ sub flatten_ranks { # Combine! # print STDERR "Combining readings at same rank: $key\n"; $self->merge_readings( $unique_rank_rdg{$key}, $rdg ); + # TODO see if this now makes a common point. } else { $unique_rank_rdg{$key} = $rdg; } } } +=head2 remove_collations + +Another convenience method for parsing. Removes all 'collation' relationships +that were defined in order to get the reading ranks to be correct. + +=begin testing + +use Text::Tradition; + +my $cxfile = 't/data/Collatex-16.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'CollateX', + 'file' => $cxfile, + ); +my $c = $t->collation; + +isnt( $c->reading('n23')->rank, $c->reading('n9')->rank, "Rank skew exists" ); +$c->add_relationship( 'n23', 'n9', { 'type' => 'collated', 'scope' => 'local' } ); +is( scalar $c->relationships, 4, "Found all expected relationships" ); +$c->remove_collations; +is( scalar $c->relationships, 3, "Collated relationships now gone" ); +is( $c->reading('n23')->rank, $c->reading('n9')->rank, "Aligned ranks were preserved" ); + +=end testing + +=cut + +sub remove_collations { + my $self = shift; + foreach my $reledge ( $self->relationships ) { + my $relobj = $self->relations->get_relationship( $reledge ); + if( $relobj && $relobj->type eq 'collated' ) { + $self->relations->delete_relationship( $reledge ); + } + } +} + + +=head2 calculate_common_readings + +Goes through the graph identifying the readings that appear in every witness +(apart from those with lacunae at that spot.) Marks them as common and returns +the list. + +=begin testing + +use Text::Tradition; + +my $cxfile = 't/data/Collatex-16.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'CollateX', + 'file' => $cxfile, + ); +my $c = $t->collation; + +my @common = $c->calculate_common_readings(); +is( scalar @common, 8, "Found correct number of common readings" ); +my @marked = sort $c->common_readings(); +is( scalar @common, 8, "All common readings got marked as such" ); +my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /; +is_deeply( \@marked, \@expected, "Found correct list of common readings" ); + +=end testing + +=cut + +sub calculate_common_readings { + my $self = shift; + my @common; + my $table = $self->make_alignment_table( 1 ); + foreach my $idx ( 0 .. $table->{'length'} - 1 ) { + my @row = map { $_->{'tokens'}->[$idx]->{'t'} } @{$table->{'alignment'}}; + my %hash; + foreach my $r ( @row ) { + if( $r ) { + $hash{$r->id} = $r unless $r->is_meta; + } else { + $hash{'UNDEF'} = $r; + } + } + if( keys %hash == 1 && !exists $hash{'UNDEF'} ) { + my( $r ) = values %hash; + $r->is_common( 1 ); + push( @common, $r ); + } + } + return @common; +} + =head2 text_from_paths Calculate the text array for all witnesses from the path, for later consistency @@ -1344,16 +1542,16 @@ is( $c->common_successor( 'n21', 'n26' )->id, sub common_predecessor { my $self = shift; my( $r1, $r2 ) = $self->_objectify_args( @_ ); - return $self->common_in_path( $r1, $r2, 'predecessors' ); + return $self->_common_in_path( $r1, $r2, 'predecessors' ); } sub common_successor { my $self = shift; my( $r1, $r2 ) = $self->_objectify_args( @_ ); - return $self->common_in_path( $r1, $r2, 'successors' ); + return $self->_common_in_path( $r1, $r2, 'successors' ); } -sub common_in_path { +sub _common_in_path { my( $self, $r1, $r2, $dir ) = @_; my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank; $iter = $self->end->rank - $iter if $dir eq 'successors'; @@ -1388,10 +1586,12 @@ sub throw { no Moose; __PACKAGE__->meta->make_immutable; -=head1 BUGS / TODO +=head1 LICENSE -=over +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. -=item * Get rid of $backup in reading_sequence +=head1 AUTHOR -=back +Tara L Andrews Eaurum@cpan.orgE