From: Tara L Andrews Date: Fri, 30 Sep 2011 23:00:29 +0000 (+0200) Subject: add facility in Stemma for distance tree alongside hypothesis; use this for group_var... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=40f19742338169e47056cf0d3d4caec3892dabaa;p=scpubgit%2Fstemmatology.git add facility in Stemma for distance tree alongside hypothesis; use this for group_vars analysis --- diff --git a/group_vars.pl b/group_vars.pl index ecde4de..5ffeefd 100644 --- a/group_vars.pl +++ b/group_vars.pl @@ -48,7 +48,7 @@ my $col_wits = shift @$all_wits_table; # For each column in the table, group the readings by witness. -my $useful_vars = 0; +my $used_vars = 0; foreach my $i ( 0 .. $#$all_wits_table ) { my $rdg_wits = {}; my $col_rdgs = shift @$all_wits_table; @@ -65,34 +65,61 @@ foreach my $i ( 0 .. $#$all_wits_table ) { my( $groups, $readings ) = useful_variant( $rdg_wits ); next unless $groups && $readings; - # For all the groups with more than one member, make a group that contains - # all contiguous vertices to connect them. - # TODO Need to do pairwise comparison of groups - a variant location can - # have both coincidental and genealogical variants! + # We can look up witnesses for a reading; we also want to look up readings + # for a given witness. + my $group_readings = {}; + foreach my $x ( 0 .. $#$groups ) { + $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x]; + } + + # For all the groups with more than one member, collect the list of all + # contiguous vertices needed to connect them. + # TODO: deal with a.c. reading logic + my $conflict = analyze_variant_location( $group_readings, $groups, $stemma->apsp ); + print wit_stringify( $groups ) . ' - ' . join( " / ", @$readings ) . "\n"; + foreach my $rdg ( keys %$conflict ) { + my $var = $conflict->{$rdg}; + print "\tReadings '$rdg' and '$var' are not genealogical\n"; + } + + # Now run the same analysis given a distance tree. + my $distance_apsp = $stemma->distance_trees->[0]->APSP_Floyd_Warshall(); + $conflict = analyze_variant_location( $group_readings, $groups, $distance_apsp ); + foreach my $rdg ( keys %$conflict ) { + my $var = $conflict->{$rdg}; + print "\tReadings '$rdg' and '$var' disregarded by parsimony\n"; + } + + # Record that we used this variant in an analysis + $used_vars++; + +} +print "Found $used_vars useful variants in this analysis\n"; + +sub analyze_variant_location { + my( $group_readings, $groups, $apsp ) = @_; my %contig; - my $conflict; - foreach my $g ( @$groups ) { - my @members = split( /,/, $g ); + my $conflict = {}; + foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) { + my @members = @$g; + my $gst = wit_stringify( $g ); + map { $contig{$_} = $gst } @members; # The witnesses need themselves to be + # in their collection. next unless @members > 1; - map { $contig{$_} = $g } @members; my $curr = pop @members; foreach my $m ( @members ) { - foreach my $v ( $stemma->apsp->path_vertices( $curr, $m ) ) { - $contig{$v} = $g unless exists $contig{$v}; - next if $contig{$v} eq $g; - # print STDERR "Conflict at $v between group $g and group " + foreach my $v ( $apsp->path_vertices( $curr, $m ) ) { + $contig{$v} = $gst unless exists $contig{$v}; + next if $contig{$v} eq $gst; + # print STDERR "Conflict at $v between group $gst and group " # . $contig{$v} . "\n"; - $conflict = 1; + # Record what is conflicting. + $conflict->{$group_readings->{$gst}} = $group_readings->{$contig{$v}}; } } } - print join( " / ", @$groups ) . ' - ' . join( " / ", @$readings ) . ' - '; - print $conflict ? "coincidental" : "genealogical"; - print "\n"; - $useful_vars++; - + return $conflict; } -print "Found $useful_vars useful variants\n"; # Add the variant, subject to a.c. representation logic. # This assumes that we will see the 'main' version before the a.c. version. @@ -118,8 +145,27 @@ sub useful_variant { return( undef, undef ) if $total <= 1; my( $groups, $text ); foreach my $var ( keys %$readings ) { - push( @$groups, join( ',', @{$readings->{$var}} ) ); + push( @$groups, $readings->{$var} ); push( @$text, $var ); } return( $groups, $text ); } + +# Take an array of witness groupings and produce a string like +# A,B / C,D,E / F + +sub wit_stringify { + my $groups = shift; + my @gst; + # If we were passed an array of witnesses instead of an array of + # groupings, then "group" the witnesses first. + unless( ref( $groups->[0] ) ) { + my $mkgrp = [ $groups ]; + $groups = $mkgrp; + } + foreach my $g ( @$groups ) { + push( @gst, join( ',', @$g ) ); + } + return join( ' / ', @gst ); +} + \ No newline at end of file diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm index df8cf4a..a90d863 100644 --- a/lib/Text/Tradition/Stemma.pm +++ b/lib/Text/Tradition/Stemma.pm @@ -1,12 +1,13 @@ package Text::Tradition::Stemma; +use Bio::Phylo::IO; use File::chdir; use File::Temp; -use IPC::Run qw/ run /; -use Moose; -use Text::Tradition::Collation::Position; use Graph; use Graph::Reader::Dot; +use IPC::Run qw/ run /; +use Moose; +use Text::Balanced qw/ extract_bracketed /; has collation => ( is => 'ro', @@ -31,6 +32,13 @@ has apsp => ( is => 'rw', isa => 'Graph', ); + +has distance_trees => ( + is => 'ro', + isa => 'ArrayRef[Graph]', + writer => '_save_distance_trees', + predicate => 'has_distance_trees', + ); sub BUILD { my( $self, $args ) = @_; @@ -62,7 +70,21 @@ sub BUILD { $self->apsp( $undirected->APSP_Floyd_Warshall() ); } } - + +before 'distance_trees' => sub { + my $self = shift; + my %args = @_; + # TODO allow specification of method for calculating distance tree + if( $args{'recalc'} || !$self->has_distance_trees ) { + # We need to make a tree before we can return it. + my( $ok, $result ) = $self->run_phylip_pars(); + if( $ok ) { + $self->_save_distance_trees( _parse_newick( $result ) ); + } else { + warn "Failed to calculate distance tree: $result"; + } + } +}; sub make_character_matrix { my $self = shift; @@ -110,13 +132,13 @@ sub convert_characters { } } if( scalar( keys %unique ) > 8 ) { - warn "Have more than 8 variants on this location; pars will break"; + warn "Have more than 8 variants on this location; phylip will break"; } my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row; return @chars; } -sub pars_input { +sub phylip_pars_input { my $self = shift; $self->make_character_matrix unless $self->has_character_matrix; my $matrix = ''; @@ -129,16 +151,15 @@ sub pars_input { return $matrix; } -sub run_pars { +sub run_phylip_pars { my $self = shift; # Set up a temporary directory for all the default Phylip files. my $phylip_dir = File::Temp->newdir(); - print STDERR $phylip_dir . "\n"; # $phylip_dir->unlink_on_destroy(0); # We need an infile, and we need a command input file. open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile"; - print MATRIX $self->pars_input(); + print MATRIX $self->phylip_pars_input(); close MATRIX; open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile"; @@ -201,6 +222,46 @@ sub run_pars { return( undef, join( '', @error ) ); } +sub _parse_newick { + my $newick = shift; + my @trees; + # Parse the result into a tree + my $forest = Bio::Phylo::IO->parse( + -format => 'newick', + -string => $newick, + ); + # Turn the tree into a graph, starting with the root node + foreach my $tree ( @{$forest->get_entities} ) { + push( @trees, _graph_from_bio( $tree ) ); + } + return \@trees; +} + +sub _graph_from_bio { + my $tree = shift; + my $graph = Graph->new( 'undirected' => 1 ); + # Give all the intermediate anonymous nodes a name. + my $i = 0; + foreach my $n ( @{$tree->get_entities} ) { + next if $n->get_name; + $n->set_name( $i++ ); + } + my $root = $tree->get_root->get_name; + $graph->add_vertex( $root ); + _add_tree_children( $graph, $root, $tree->get_root->get_children() ); + return $graph; +} + +sub _add_tree_children { + my( $graph, $parent, $tree_children ) = @_; + foreach my $c ( @$tree_children ) { + my $child = $c->get_name; + $graph->add_vertex( $child ); + $graph->add_path( $parent, $child ); + _add_tree_children( $graph, $child, $c->get_children() ); + } +} + no Moose; __PACKAGE__->meta->make_immutable; diff --git a/make_tradition.pl b/make_tradition.pl index 6828e71..22b50df 100644 --- a/make_tradition.pl +++ b/make_tradition.pl @@ -81,7 +81,7 @@ if( $HACK ) { if( $outformat eq 'stemma' ) { my $stemma = Text::Tradition::Stemma->new( 'collation' => $tradition->collation ); - my( $result, $tree ) = $stemma->run_pars(); + my( $result, $tree ) = $stemma->run_phylip_pars(); if( $result ) { print $tree; } else {