1 package Text::Tradition::StemmaUtil;
6 use vars qw/ @EXPORT_OK /;
8 use Encode qw( decode_utf8 );
13 use Graph::Reader::Dot;
14 use IPC::Run qw/ run binary /;
15 use Text::Tradition::Error;
16 use Text::Tradition::Stemma;
17 @EXPORT_OK = qw/ character_input phylip_pars parse_newick newick_to_svg /;
21 Text::Tradition::StemmaUtil - standalone utilities for distance tree calculations
25 This package contains a set of utilities for running phylogenetic analysis on
30 =head2 character_input( $tradition, $opts )
32 Returns a character matrix string suitable for Phylip programs, which
33 corresponds to the given alignment table. See Text::Tradition::Collation
34 for a description of the alignment table format. Options include:
38 =item * exclude_layer - Exclude layered witnesses from the character input,
39 using only the 'main' text of the witnesses in the tradition.
41 =item * collapse - A reference to an array of relationship names that should
42 be treated as equivalent for the purposes of generating the character matrix.
49 my ( $tradition, $opts ) = @_;
50 my $table = $tradition->collation->alignment_table;
51 if( $opts->{exclude_layer} ) {
52 # Filter out all alignment table rows that do not correspond
53 # to a named witness - these are the layered witnesses.
54 my $newtable = { alignment => [] };
55 foreach my $row ( $table->{alignment} ) {
56 if( $tradition->has_witness( $row->{witness} ) ) {
57 push( @{$newtable->{alignment}}, $row );
62 my $character_matrix = _make_character_matrix( $table, $opts );
64 my $rows = scalar @{$character_matrix};
65 my $columns = scalar @{$character_matrix->[0]} - 1;
66 $input .= "\t$rows\t$columns\n";
67 foreach my $row ( @{$character_matrix} ) {
68 $input .= join( '', @$row ) . "\n";
73 sub _make_character_matrix {
74 my( $table, $opts ) = @_;
75 # Push the names of the witnesses to initialize the rows of the matrix.
76 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
77 @{$table->{'alignment'}};
78 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
79 my @pos_tokens = map { $_->{'tokens'}->[$token_index] }
80 @{$table->{'alignment'}};
81 my @pos_readings = map { $_ ? $_->{'t'} : $_ } @pos_tokens;
82 my @chars = _convert_characters( \@pos_readings, $opts );
83 foreach my $idx ( 0 .. $#matrix ) {
84 push( @{$matrix[$idx]}, $chars[$idx] );
90 # Helper function to make the witness name something legal for pars
92 sub _normalize_witname {
94 $witname =~ s/\s+/ /g;
95 $witname =~ s/[\[\]\(\)\:;,]//g;
96 $witname = substr( $witname, 0, 10 );
97 return sprintf( "%-10s", $witname );
100 sub _convert_characters {
101 my( $row, $opts ) = @_;
102 # This is a simple algorithm that treats every reading as different.
103 # Eventually we will want to be able to specify how relationships
104 # affect the character matrix.
105 my %unique = ( '__UNDEF__' => 'X',
111 foreach my $rdg ( @$row ) {
113 next if $rdg->is_lacuna;
114 next if exists $unique{$rdg->text};
115 if( ref( $opts->{'collapse'} ) eq 'ARRAY' ) {
116 my @exclude_types = @{$opts->{'collapse'}};
117 my @set = $rdg->related_readings( sub { my $rel = shift;
118 $rel->colocated && grep { $rel->type eq $_ } @exclude_types } );
120 my $char = chr( 65 + $ctr++ );
121 map { $unique{$_->text} = $char } @set;
122 $count{$rdg->text} += scalar @set;
124 $unique{$rdg->text} = chr( 65 + $ctr++ );
125 $count{$rdg->text}++;
128 # Try to keep variants under 8 by lacunizing any singletons.
129 if( scalar( keys %unique ) > 8 ) {
130 foreach my $word ( keys %count ) {
131 if( $count{$word} == 1 ) {
132 $unique{$word} = '?';
136 my %u = reverse %unique;
137 if( scalar( keys %u ) > 8 ) {
138 warn "Have more than 8 variants on this location; phylip will break";
140 my @chars = map { $_ ? $unique{$_->text} : $unique{'__UNDEF__' } } @$row;
144 =head2 phylip_pars( $character_matrix )
146 Runs Phylip Pars on the given character matrix. Returns results in Newick format.
151 my( $charmatrix ) = @_;
152 # Set up a temporary directory for all the default Phylip files.
153 my $phylip_dir = File::Temp->newdir();
154 # $phylip_dir->unlink_on_destroy(0);
155 # We need an infile, and we need a command input file.
156 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
157 print MATRIX $charmatrix;
160 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
161 ## TODO any configuration parameters we want to set here
162 # U Search for best tree? Yes
163 # S Search option? More thorough search
164 # V Number of trees to save? 100
165 # J Randomize input order of species? No. Use input order
166 # O Outgroup root? No, use as outgroup species 1
167 # T Use Threshold parsimony? No, use ordinary parsimony
168 # W Sites weighted? No
169 # M Analyze multiple data sets? No
170 # I Input species interleaved? Yes
171 # 0 Terminal type (IBM PC, ANSI, none)? ANSI
172 # 1 Print out the data at start of run No
173 # 2 Print indications of progress of run Yes
174 # 3 Print out tree Yes
175 # 4 Print out steps in each site No
176 # 5 Print character at all nodes of tree No
177 # 6 Write out trees onto tree file? Yes
181 # And then we run the program.
182 my $program = File::Which::which( 'pars' );
183 unless( -x $program ) {
184 throw( "Phylip pars not found in path" );
188 # We need to run it in our temporary directory where we have created
189 # all the expected files.
190 local $CWD = $phylip_dir;
191 my @cmd = ( $program );
192 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
194 # Now our output should be in 'outfile' and our tree in 'outtree',
195 # both in the temp directory.
198 if( -f "$phylip_dir/outtree" ) {
199 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
203 return join( '', @outtree ) if @outtree;
205 # If we got this far, we are about to throw an error.
207 if( -f "$phylip_dir/outfile" ) {
208 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
212 push( @error, "Neither outtree nor output file was produced!" );
214 throw( join( '', @error ) );
217 =head2 parse_newick( $newick_string )
219 Parses the given Newick tree(s) into one or more Stemma objects with
227 # Parse the result into a tree
228 my $forest = Bio::Phylo::IO->parse(
232 # Turn the tree into a graph, starting with the root node
233 foreach my $tree ( @{$forest->get_entities} ) {
234 my $stemma = Text::Tradition::Stemma->new(
235 graph => _graph_from_bio( $tree ),
236 is_undirected => 1 );
237 push( @stemmata, $stemma );
242 sub _graph_from_bio {
244 my $graph = Graph->new( 'undirected' => 1 );
245 # Give all the intermediate anonymous nodes a name.
248 foreach my $n ( @{$tree->get_terminals} ) {
249 # The terminal nodes are our named witnesses.
250 $classes->{$n->get_name} = 'extant';
252 foreach my $n ( @{$tree->get_internals} ) {
253 unless( defined $n->get_name && $n->get_name ne '' ) {
254 # Get an integer, make sure it's a unique name
255 while( exists $classes->{$i} ) {
258 $n->set_name( $i++ );
260 $classes->{$n->get_name} = 'hypothetical';
262 _add_tree_children( $graph, $classes, undef, [ $tree->get_root ]);
266 sub _add_tree_children {
267 my( $graph, $classes, $parent, $tree_children ) = @_;
268 foreach my $c ( @$tree_children ) {
269 my $child = $c->get_name;
270 $graph->add_vertex( $child );
271 $graph->set_vertex_attribute( $child, 'class', $classes->{$child} );
272 $graph->add_path( $parent, $child ) if defined $parent;
273 _add_tree_children( $graph, $classes, $child, $c->get_children() );
277 =head2 newick_to_svg( $newick_string )
279 Uses the FigTree utility (if installed) to transform the given Newick tree(s)
280 into a graph visualization.
286 my $program = File::Which::which( 'figtree' );
287 unless( -x $program ) {
288 throw( "FigTree commandline utility not found in path" );
291 my $nfile = File::Temp->new();
292 print $nfile $newick;
294 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
295 run( \@cmd, ">", binary(), \$svg );
296 return decode_utf8( $svg );
300 Text::Tradition::Error->throw(
301 'ident' => 'StemmaUtil error',
310 This package is free software and is provided "as is" without express
311 or implied warranty. You can redistribute it and/or modify it under
312 the same terms as Perl itself.
316 Tara L Andrews E<lt>aurum@cpan.orgE<gt>