From: Tara L Andrews Date: Tue, 11 Sep 2012 18:58:11 +0000 (+0200) Subject: initial phylogeny generation work X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ea45d2a63a485f3f4fec1240c6fc14327d29bc31;p=scpubgit%2Fstemmatology.git initial phylogeny generation work --- diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm index 085e6b0..86ceb0e 100644 --- a/lib/Text/Tradition/Stemma.pm +++ b/lib/Text/Tradition/Stemma.pm @@ -7,7 +7,6 @@ use Graph; use Graph::Reader::Dot; use IPC::Run qw/ run binary /; use Text::Tradition::Error; -use Text::Tradition::StemmaUtil qw/ character_input phylip_pars parse_newick /; use Moose; =head1 NAME @@ -88,6 +87,13 @@ if called directly it takes the following options: =item * dot - A filehandle open to a DOT representation of the stemma graph. +=item * graph - If no DOT specification is given, you can pass a Graph object +instead. The vertices of the graph should have an attribute 'class' set to +either of the values 'extant' or 'hypothetical'. + +=item * is_undirected - If the graph specification (or graph object) is for an +undirected graph (e.g. a phylogenetic tree), this should be set. + =back =begin testing @@ -97,13 +103,16 @@ use TryCatch; use_ok( 'Text::Tradition::Stemma' ); # Try to create a bad graph +TODO: { + local $TODO = "cannot use stdout redirection trick with FastCGI"; my $baddotfh; -open( $baddotfh, 't/data/besoin_bad.dot' ) or die "Could not open test dotfile"; -try { - my $stemma = Text::Tradition::Stemma->new( dot => $baddotfh ); - ok( 0, "Created broken stemma from dotfile with syntax error" ); -} catch( Text::Tradition::Error $e ) { - like( $e->message, qr/^Error trying to parse/, "Syntax error in dot threw exception" ); + open( $baddotfh, 't/data/besoin_bad.dot' ) or die "Could not open test dotfile"; + try { + my $stemma = Text::Tradition::Stemma->new( dot => $baddotfh ); + ok( 0, "Created broken stemma from dotfile with syntax error" ); + } catch( Text::Tradition::Error $e ) { + like( $e->message, qr/^Error trying to parse/, "Syntax error in dot threw exception" ); + } } # Create a good graph @@ -120,6 +129,9 @@ foreach my $h ( $stemma->hypotheticals ) { } ok( $found_unicode_sigil, "Found a correctly encoded Unicode sigil" ); +# TODO Create stemma from graph, create stemma from undirected graph, +# create stemma from incompletely-specified graph + =end testing =cut @@ -127,7 +139,7 @@ ok( $found_unicode_sigil, "Found a correctly encoded Unicode sigil" ); has collation => ( is => 'ro', isa => 'Text::Tradition::Collation', - clearer => 'clear_collation', + clearer => 'clear_collation', # interim measure to remove refs in DB weak_ref => 1, ); @@ -136,15 +148,39 @@ has graph => ( isa => 'Graph', predicate => 'has_graph', ); + +has is_undirected => ( + is => 'ro', + isa => 'Bool', + default => undef, + writer => 'set_undirected', + ); sub BUILD { my( $self, $args ) = @_; # If we have been handed a dotfile, initialize it into a graph. if( exists $args->{'dot'} ) { $self->_graph_from_dot( $args->{'dot'} ); - } + } else { + } } +before 'graph' => sub { + my $self = shift; + if( @_ ) { + # Make sure all unclassed graph nodes are marked extant. + my $g = $_[0]; + throw( "Cannot set graph to a non-Graph object" ) + unless ref( $g ) eq 'Graph'; + foreach my $v ( $g->vertices ) { + unless( $g->has_vertex_attribute( $v, 'class' ) ) { + $g->set_vertex_attribute( $v, 'class', 'extant' ); + } + } + $self->set_undirected( $g->is_undirected ); + } +}; + sub _graph_from_dot { my( $self, $dotfh ) = @_; my $reader = Graph::Reader::Dot->new(); @@ -164,11 +200,6 @@ sub _graph_from_dot { throw( "Failed to create graph from dot" ); } $self->graph( $graph ); - # Go through the nodes and set any non-hypothetical node to extant. - foreach my $v ( $self->graph->vertices ) { - $self->graph->set_vertex_attribute( $v, 'class', 'extant' ) - unless $self->graph->has_vertex_attribute( $v, 'class' ); - } } =head1 METHODS @@ -224,8 +255,9 @@ sub as_dot { @edgeopts{ keys %{$opts->{'edge'}} } = values %{$opts->{'edge'}} if $opts->{'edge'}; + my $gdecl = $graph->is_directed ? 'digraph' : 'graph'; my @dotlines; - push( @dotlines, 'digraph stemma {' ); + push( @dotlines, "$gdecl stemma {" ); ## Print out the global attributes push( @dotlines, _make_dotline( 'graph', %graphopts ) ) if keys %graphopts; push( @dotlines, _make_dotline( 'edge', %edgeopts ) ) if keys %edgeopts; @@ -245,7 +277,8 @@ sub as_dot { # Add each of our edges. foreach my $e ( $graph->edges ) { my( $from, $to ) = map { _dotquote( $_ ) } @$e; - push( @dotlines, " $from -> $to;" ); + my $connector = $graph->is_directed ? '->' : '--'; + push( @dotlines, " $from $connector $to;" ); } push( @dotlines, '}' ); @@ -297,8 +330,9 @@ sub editable_graph { # Create the graph my $join = ( $opts && exists $opts->{'linesep'} ) ? $opts->{'linesep'} : "\n"; + my $gdecl = $graph->is_undirected ? 'graph' : 'digraph'; my @dotlines; - push( @dotlines, 'digraph stemma {' ); + push( @dotlines, "$gdecl stemma {" ); my @real; # A cheap sort foreach my $n ( sort $graph->vertices ) { my $c = $graph->get_vertex_attribute( $n, 'class' ); @@ -315,7 +349,8 @@ sub editable_graph { } foreach my $e ( sort _by_vertex $graph->edges ) { my( $from, $to ) = map { _dotquote( $_ ) } @$e; - push( @dotlines, " $from -> $to;" ); + my $conn = $graph->is_undirected ? '--' : '->'; + push( @dotlines, " $from $conn $to;" ); } push( @dotlines, '}' ); return join( $join, @dotlines ); @@ -345,11 +380,11 @@ sub _by_vertex { =head2 situation_graph( $extant, $layered ) -Returns a graph which is the original stemma with all witnesses not in the -%$extant hash marked as hypothetical, and witness layers added to the graph -according to the list in @$layered. A layered (a.c.) witness is added as a -parent of its main version, and additionally shares all other parents and -children with that version. +Returns a graph which is the original stemma graph with all witnesses not +in the %$extant hash marked as hypothetical, and witness layers added to +the graph according to the list in @$layered. A layered (a.c.) witness is +added as a parent of its main version, and additionally shares all other +parents and children with that version. =cut @@ -412,7 +447,8 @@ Returns an SVG representation of the graph, calling as_dot first. sub as_svg { my( $self, $opts ) = @_; my $dot = $self->as_dot( $opts ); - my @cmd = qw/dot -Tsvg/; + my @cmd = ( '-Tsvg' ); + unshift( @cmd, $self->is_undirected ? 'neato' : 'dot' ); my $svg; my $dotfile = File::Temp->new(); ## TODO REMOVE @@ -435,6 +471,7 @@ sub as_svg { my( $ew, $eh ) = @{$opts->{'size'}}; # If the graph is wider than it is tall, set width to ew and remove height. # Otherwise set height to eh and remove width. + # TODO Also scale the viewbox my $width = $svgdoc->documentElement->getAttribute('width'); my $height = $svgdoc->documentElement->getAttribute('height'); $width =~ s/\D+//g; @@ -490,6 +527,48 @@ sub hypotheticals { return @wits; } +=head2 root( $root_vertex ) { + +If the stemma graph is undirected, make it directed with $root_vertex at the root. +If it is directed, re-root it. + +=cut + +sub root_graph { + my( $self, $rootvertex ) = @_; + my $graph; + if( $self->is_undirected ) { + $graph = $self->graph; + } else { + # Make an undirected version of this graph. + $graph = $self->graph->undirected_copy(); + } + my $rooted = Graph->new(); + $rooted->add_vertex( $rootvertex ); + my @next = ( $rootvertex ); + while( @next ) { + my @children; + foreach my $v ( @next ) { + # Place its not-placed neighbors (ergo children) in the tree + # and connect them + foreach my $n ( grep { !$rooted->has_vertex( $_ ) } + $graph->neighbors( $v ) ) { + $rooted->add_vertex( $n ); + $rooted->add_edge( $v, $n ); + push( @children, $n ); + } + } + @next = @children; + } + # Set the vertex classes + map { $rooted->set_vertex_attribute( $_, 'class', 'hypothetical' ) } + $self->graph->hypotheticals; + map { $rooted->set_vertex_class( $_, 'class', 'extant' ) } + $self->graph->witnesses; + return $rooted; +} + + sub throw { Text::Tradition::Error->throw( 'ident' => 'Stemma error', diff --git a/lib/Text/Tradition/StemmaUtil.pm b/lib/Text/Tradition/StemmaUtil.pm index 295110d..5c3f848 100644 --- a/lib/Text/Tradition/StemmaUtil.pm +++ b/lib/Text/Tradition/StemmaUtil.pm @@ -13,6 +13,7 @@ use Graph; use Graph::Reader::Dot; use IPC::Run qw/ run binary /; use Text::Tradition::Error; +use Text::Tradition::Stemma; @EXPORT_OK = qw/ character_input phylip_pars parse_newick newick_to_svg /; =head1 NAME @@ -26,17 +27,39 @@ text collations. =head1 SUBROUTINES -=head2 character_input( $alignment_table ) +=head2 character_input( $tradition, $opts ) Returns a character matrix string suitable for Phylip programs, which corresponds to the given alignment table. See Text::Tradition::Collation -for a description of the alignment table format. +for a description of the alignment table format. Options include: + +=over + +=item * exclude_layer - Exclude layered witnesses from the character input, +using only the 'main' text of the witnesses in the tradition. + +=item * collapse - A reference to an array of relationship names that should +be treated as equivalent for the purposes of generating the character matrix. + +=back =cut sub character_input { - my $table = shift; - my $character_matrix = _make_character_matrix( $table ); + my ( $tradition, $opts ) = @_; + my $table = $tradition->collation->alignment_table; + if( $opts->{exclude_layer} ) { + # Filter out all alignment table rows that do not correspond + # to a named witness - these are the layered witnesses. + my $newtable = { alignment => [] }; + foreach my $row ( $table->{alignment} ) { + if( $tradition->has_witness( $row->{witness} ) ) { + push( @{$newtable->{alignment}}, $row ); + } + } + $table = $newtable; + } + my $character_matrix = _make_character_matrix( $table, $opts ); my $input = ''; my $rows = scalar @{$character_matrix}; my $columns = scalar @{$character_matrix->[0]} - 1; @@ -48,17 +71,15 @@ sub character_input { } sub _make_character_matrix { - my( $table ) = @_; + my( $table, $opts ) = @_; # Push the names of the witnesses to initialize the rows of the matrix. my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] } @{$table->{'alignment'}}; foreach my $token_index ( 0 .. $table->{'length'} - 1) { - # First implementation: make dumb alignment table, caring about - # nothing except which reading is in which position. - my @pos_readings = map { $_->{'tokens'}->[$token_index] } + my @pos_tokens = map { $_->{'tokens'}->[$token_index] } @{$table->{'alignment'}}; - my @pos_text = map { $_ ? $_->{'t'} : $_ } @pos_readings; - my @chars = _convert_characters( \@pos_text ); + my @pos_readings = map { $_ ? $_->{'t'} : $_ } @pos_tokens; + my @chars = _convert_characters( \@pos_readings, $opts ); foreach my $idx ( 0 .. $#matrix ) { push( @{$matrix[$idx]}, $chars[$idx] ); } @@ -77,21 +98,32 @@ sub _normalize_witname { } sub _convert_characters { - my $row = shift; + my( $row, $opts ) = @_; # This is a simple algorithm that treats every reading as different. # Eventually we will want to be able to specify how relationships # affect the character matrix. my %unique = ( '__UNDEF__' => 'X', '#LACUNA#' => '?', ); + my %equivalent; my %count; my $ctr = 0; - foreach my $word ( @$row ) { - if( $word && !exists $unique{$word} ) { - $unique{$word} = chr( 65 + $ctr ); - $ctr++; - } - $count{$word}++ if $word; + foreach my $rdg ( @$row ) { + next unless $rdg; + next if $rdg->is_lacuna; + next if exists $unique{$rdg->text}; + if( ref( $opts->{'collapse'} ) eq 'ARRAY' ) { + my @exclude_types = @{$opts->{'collapse'}}; + my @set = $rdg->related_readings( sub { my $rel = shift; + $rel->colocated && grep { $rel->type eq $_ } @exclude_types } ); + push( @set, $rdg ); + my $char = chr( 65 + $ctr++ ); + map { $unique{$_->text} = $char } @set; + $count{$rdg->text} += scalar @set; + } else { + $unique{$rdg->text} = chr( 65 + $ctr++ ); + $count{$rdg->text}++; + } } # Try to keep variants under 8 by lacunizing any singletons. if( scalar( keys %unique ) > 8 ) { @@ -105,7 +137,7 @@ sub _convert_characters { if( scalar( keys %u ) > 8 ) { warn "Have more than 8 variants on this location; phylip will break"; } - my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row; + my @chars = map { $_ ? $unique{$_->text} : $unique{'__UNDEF__' } } @$row; return @chars; } @@ -184,13 +216,14 @@ sub phylip_pars { =head2 parse_newick( $newick_string ) -Parses the given Newick tree(s) into one or more undirected Graph objects. +Parses the given Newick tree(s) into one or more Stemma objects with +undirected graphs. =cut sub parse_newick { my $newick = shift; - my @trees; + my @stemmata; # Parse the result into a tree my $forest = Bio::Phylo::IO->parse( -format => 'newick', @@ -198,9 +231,47 @@ sub parse_newick { ); # Turn the tree into a graph, starting with the root node foreach my $tree ( @{$forest->get_entities} ) { - push( @trees, _graph_from_bio( $tree ) ); + my $stemma = Text::Tradition::Stemma->new( + graph => _graph_from_bio( $tree ), + is_undirected => 1 ); + push( @stemmata, $stemma ); + } + return \@stemmata; +} + +sub _graph_from_bio { + my $tree = shift; + my $graph = Graph->new( 'undirected' => 1 ); + # Give all the intermediate anonymous nodes a name. + my $i = 0; + my $classes = {}; + foreach my $n ( @{$tree->get_terminals} ) { + # The terminal nodes are our named witnesses. + $classes->{$n->get_name} = 'extant'; + } + foreach my $n ( @{$tree->get_internals} ) { + unless( defined $n->get_name && $n->get_name ne '' ) { + # Get an integer, make sure it's a unique name + while( exists $classes->{$i} ) { + $i++; + } + $n->set_name( $i++ ); + } + $classes->{$n->get_name} = 'hypothetical'; + } + _add_tree_children( $graph, $classes, undef, [ $tree->get_root ]); + return $graph; +} + +sub _add_tree_children { + my( $graph, $classes, $parent, $tree_children ) = @_; + foreach my $c ( @$tree_children ) { + my $child = $c->get_name; + $graph->add_vertex( $child ); + $graph->set_vertex_attribute( $child, 'class', $classes->{$child} ); + $graph->add_path( $parent, $child ) if defined $parent; + _add_tree_children( $graph, $classes, $child, $c->get_children() ); } - return \@trees; } =head2 newick_to_svg( $newick_string ) @@ -225,31 +296,6 @@ sub newick_to_svg { return decode_utf8( $svg ); } -sub _graph_from_bio { - my $tree = shift; - my $graph = Graph->new( 'undirected' => 1 ); - # Give all the intermediate anonymous nodes a name. - my $i = 0; - foreach my $n ( @{$tree->get_entities} ) { - next if $n->get_name; - $n->set_name( $i++ ); - } - my $root = $tree->get_root->get_name; - $graph->add_vertex( $root ); - _add_tree_children( $graph, $root, $tree->get_root->get_children() ); - return $graph; -} - -sub _add_tree_children { - my( $graph, $parent, $tree_children ) = @_; - foreach my $c ( @$tree_children ) { - my $child = $c->get_name; - $graph->add_vertex( $child ); - $graph->add_path( $parent, $child ); - _add_tree_children( $graph, $child, $c->get_children() ); - } -} - sub throw { Text::Tradition::Error->throw( 'ident' => 'StemmaUtil error', diff --git a/stemmaweb/lib/stemmaweb/Controller/Root.pm b/stemmaweb/lib/stemmaweb/Controller/Root.pm index c3812e7..eb00b0c 100644 --- a/stemmaweb/lib/stemmaweb/Controller/Root.pm +++ b/stemmaweb/lib/stemmaweb/Controller/Root.pm @@ -2,6 +2,7 @@ package stemmaweb::Controller::Root; use Moose; use namespace::autoclean; use Text::Tradition::Analysis qw/ run_analysis /; +use Text::Tradition::StemmaUtil qw/ character_input phylip_pars parse_newick /; use TryCatch; @@ -396,6 +397,49 @@ sub stemmadot :Local :Args(2) { $c->forward('View::JSON'); } +=head2 phylotrees + + GET /phylotrees/$textid + + Calculates the phylogenetic tree(s) from the given text variants, and returns a + set of the results. The user may then select a tree, choose a root node, and add + that to the stemmata for the tradition (if s/he has edit rights to the tradition.) + +=cut + +sub phylotrees :Local :Args(1) { + my( $self, $c, $textid ) = @_; + my $tradition = $c->model('Directory')->tradition( $textid ); + unless( $tradition ) { + return _json_error( $c, 500, "No tradition with ID $textid" ); + } + my $ok = _check_permission( $c, $tradition ); + return unless $ok; + + ## Make the character matrix and run pars + ## TODO normalization options + my $charmatrix = character_input( $tradition ); + my $newick; + try { + $newick = phylip_pars( $charmatrix ); + } catch ( Text::Tradition::Error $e ) { + return _json_error( $c, 500, $e->message ); + } + ## If we got a result, stash it + $c->stash->{'stemmadot'} = []; + $c->stash->{'stemmasvg'} = []; + if( $newick ) { + my $stemmata = parse_newick( $newick ); + foreach my $st ( @$stemmata ) { + push( @{$c->stash->{'stemmadot'}}, $st->editable({ linesep => ' ' }) ); + my $svgstr = $st->as_svg( {size => [ 800, 600 ] }); + $svgstr =~ s/\n//mg; + push( @{$c->stash->{'stemmasvg'}}, $svgstr ); + } + } + $c->stash->{'template'} = 'phylotrees.tt'; +} + #################### ### Helper functions #################### diff --git a/stemmaweb/root/src/phylotrees.tt b/stemmaweb/root/src/phylotrees.tt new file mode 100644 index 0000000..d8628ae --- /dev/null +++ b/stemmaweb/root/src/phylotrees.tt @@ -0,0 +1,176 @@ +[% WRAPPER header.tt + pagetitle = "Stemmaweb - Phylogeny calculation tool" + applicationjs = c.uri_for( 'js/phylotools.js' ) +%] + + +[% END %] + +
+

Stemmaweb - [% algorithm_name %] calculation for [% text_name %]

+ [% IF c.user_exists %]Hello! [% c.user.get_object.email %] Sign out | [% ELSE %]Login | [% END %]About +
+
+
+

Text [% text_name %]

+
+
+
+
+
+
+
+
+
+ Choose a root for this stemma +
+
+
+
+ Cancel root selection +
+
+
+
+ Add this stemma to the tradition +
+
+
+
+
+ + +
+

Calculating text phylogeny, please wait...

+ Loading tradition list... +
+ +[% PROCESS footer.tt %] \ No newline at end of file diff --git a/t/text_tradition_stemma.t b/t/text_tradition_stemma.t index 63b834c..c327a62 100644 --- a/t/text_tradition_stemma.t +++ b/t/text_tradition_stemma.t @@ -13,13 +13,16 @@ use TryCatch; use_ok( 'Text::Tradition::Stemma' ); # Try to create a bad graph +TODO: { + local $TODO = "cannot use stdout redirection trick with FastCGI"; my $baddotfh; -open( $baddotfh, 't/data/besoin_bad.dot' ) or die "Could not open test dotfile"; -try { - my $stemma = Text::Tradition::Stemma->new( dot => $baddotfh ); - ok( 0, "Created broken stemma from dotfile with syntax error" ); -} catch( Text::Tradition::Error $e ) { - like( $e->message, qr/^Error trying to parse/, "Syntax error in dot threw exception" ); + open( $baddotfh, 't/data/besoin_bad.dot' ) or die "Could not open test dotfile"; + try { + my $stemma = Text::Tradition::Stemma->new( dot => $baddotfh ); + ok( 0, "Created broken stemma from dotfile with syntax error" ); + } catch( Text::Tradition::Error $e ) { + like( $e->message, qr/^Error trying to parse/, "Syntax error in dot threw exception" ); + } } # Create a good graph @@ -35,6 +38,9 @@ foreach my $h ( $stemma->hypotheticals ) { $found_unicode_sigil = 1 if $h eq "\x{3b1}"; } ok( $found_unicode_sigil, "Found a correctly encoded Unicode sigil" ); + +# TODO Create stemma from graph, create stemma from undirected graph, +# create stemma from incompletely-specified graph }