remove some debugging statements
[scpubgit/stemmatology.git] / lib / Text / Tradition / StemmaUtil.pm
CommitLineData
68454b71 1package Text::Tradition::StemmaUtil;
2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw/ @EXPORT_OK /;
b02332ca 7use Bio::Phylo::IO;
27e2a8fe 8use Encode qw( decode_utf8 );
b02332ca 9use File::chdir;
10use File::Temp;
11use File::Which;
12use Graph;
13use Graph::Reader::Dot;
14use IPC::Run qw/ run binary /;
edac47cc 15use Text::Tradition::Error;
9457207b 16@EXPORT_OK = qw/ character_input phylip_pars parse_newick newick_to_svg /;
68454b71 17
027d819c 18=head1 NAME
19
20Text::Tradition::StemmaUtil - standalone utilities for distance tree calculations
21
22=head1 DESCRIPTION
23
24This package contains a set of utilities for running phylogenetic analysis on
25text collations.
26
27=head1 SUBROUTINES
28
9457207b 29=head2 character_input( $alignment_table )
30
31Returns a character matrix string suitable for Phylip programs, which
32corresponds to the given alignment table. See Text::Tradition::Collation
33for a description of the alignment table format.
34
027d819c 35=cut
36
9457207b 37sub character_input {
38 my $table = shift;
39 my $character_matrix = _make_character_matrix( $table );
40 my $input = '';
41 my $rows = scalar @{$character_matrix};
42 my $columns = scalar @{$character_matrix->[0]} - 1;
43 $input .= "\t$rows\t$columns\n";
44 foreach my $row ( @{$character_matrix} ) {
45 $input .= join( '', @$row ) . "\n";
46 }
47 return $input;
48}
49
027d819c 50sub _make_character_matrix {
68454b71 51 my( $table ) = @_;
52 # Push the names of the witnesses to initialize the rows of the matrix.
53 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
54 @{$table->{'alignment'}};
55 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
56 # First implementation: make dumb alignment table, caring about
57 # nothing except which reading is in which position.
58 my @pos_readings = map { $_->{'tokens'}->[$token_index] }
59 @{$table->{'alignment'}};
60 my @pos_text = map { $_ ? $_->{'t'} : $_ } @pos_readings;
027d819c 61 my @chars = _convert_characters( \@pos_text );
68454b71 62 foreach my $idx ( 0 .. $#matrix ) {
63 push( @{$matrix[$idx]}, $chars[$idx] );
64 }
65 }
66 return \@matrix;
67}
68
69# Helper function to make the witness name something legal for pars
70
71sub _normalize_witname {
72 my( $witname ) = @_;
73 $witname =~ s/\s+/ /g;
74 $witname =~ s/[\[\]\(\)\:;,]//g;
75 $witname = substr( $witname, 0, 10 );
76 return sprintf( "%-10s", $witname );
77}
78
027d819c 79sub _convert_characters {
68454b71 80 my $row = shift;
81 # This is a simple algorithm that treats every reading as different.
82 # Eventually we will want to be able to specify how relationships
83 # affect the character matrix.
84 my %unique = ( '__UNDEF__' => 'X',
85 '#LACUNA#' => '?',
86 );
87 my %count;
88 my $ctr = 0;
89 foreach my $word ( @$row ) {
90 if( $word && !exists $unique{$word} ) {
91 $unique{$word} = chr( 65 + $ctr );
92 $ctr++;
93 }
94 $count{$word}++ if $word;
95 }
96 # Try to keep variants under 8 by lacunizing any singletons.
97 if( scalar( keys %unique ) > 8 ) {
98 foreach my $word ( keys %count ) {
99 if( $count{$word} == 1 ) {
100 $unique{$word} = '?';
101 }
102 }
103 }
104 my %u = reverse %unique;
105 if( scalar( keys %u ) > 8 ) {
106 warn "Have more than 8 variants on this location; phylip will break";
107 }
108 my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row;
109 return @chars;
110}
111
027d819c 112=head2 phylip_pars( $character_matrix )
113
114Runs Phylip Pars on the given character matrix. Returns results in Newick format.
115
116=cut
117
b02332ca 118sub phylip_pars {
119 my( $charmatrix ) = @_;
120 # Set up a temporary directory for all the default Phylip files.
121 my $phylip_dir = File::Temp->newdir();
122 # $phylip_dir->unlink_on_destroy(0);
123 # We need an infile, and we need a command input file.
124 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
125 print MATRIX $charmatrix;
126 close MATRIX;
127
128 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
129 ## TODO any configuration parameters we want to set here
130# U Search for best tree? Yes
131# S Search option? More thorough search
132# V Number of trees to save? 100
133# J Randomize input order of species? No. Use input order
134# O Outgroup root? No, use as outgroup species 1
135# T Use Threshold parsimony? No, use ordinary parsimony
136# W Sites weighted? No
137# M Analyze multiple data sets? No
138# I Input species interleaved? Yes
139# 0 Terminal type (IBM PC, ANSI, none)? ANSI
140# 1 Print out the data at start of run No
141# 2 Print indications of progress of run Yes
142# 3 Print out tree Yes
143# 4 Print out steps in each site No
144# 5 Print character at all nodes of tree No
145# 6 Write out trees onto tree file? Yes
146 print CMD "Y\n";
147 close CMD;
148
149 # And then we run the program.
150 my $program = File::Which::which( 'pars' );
151 unless( -x $program ) {
edac47cc 152 throw( "Phylip pars not found in path" );
b02332ca 153 }
154
155 {
156 # We need to run it in our temporary directory where we have created
157 # all the expected files.
158 local $CWD = $phylip_dir;
159 my @cmd = ( $program );
160 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
161 }
162 # Now our output should be in 'outfile' and our tree in 'outtree',
163 # both in the temp directory.
164
165 my @outtree;
166 if( -f "$phylip_dir/outtree" ) {
167 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
168 @outtree = <TREE>;
169 close TREE;
170 }
edac47cc 171 return join( '', @outtree ) if @outtree;
b02332ca 172
69403daa 173 # If we got this far, we are about to throw an error.
b02332ca 174 my @error;
175 if( -f "$phylip_dir/outfile" ) {
176 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
177 @error = <OUTPUT>;
178 close OUTPUT;
179 } else {
180 push( @error, "Neither outtree nor output file was produced!" );
181 }
edac47cc 182 throw( join( '', @error ) );
b02332ca 183}
184
027d819c 185=head2 parse_newick( $newick_string )
186
187Parses the given Newick tree(s) into one or more undirected Graph objects.
188
189=cut
190
b02332ca 191sub parse_newick {
192 my $newick = shift;
193 my @trees;
194 # Parse the result into a tree
195 my $forest = Bio::Phylo::IO->parse(
196 -format => 'newick',
197 -string => $newick,
198 );
199 # Turn the tree into a graph, starting with the root node
200 foreach my $tree ( @{$forest->get_entities} ) {
201 push( @trees, _graph_from_bio( $tree ) );
202 }
203 return \@trees;
204}
205
027d819c 206=head2 newick_to_svg( $newick_string )
207
208Uses the FigTree utility (if installed) to transform the given Newick tree(s)
209into a graph visualization.
210
211=cut
212
27e2a8fe 213sub newick_to_svg {
214 my $newick = shift;
215 my $program = File::Which::which( 'figtree' );
216 unless( -x $program ) {
edac47cc 217 throw( "FigTree commandline utility not found in path" );
27e2a8fe 218 }
219 my $svg;
220 my $nfile = File::Temp->new();
221 print $nfile $newick;
222 close $nfile;
223 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
224 run( \@cmd, ">", binary(), \$svg );
225 return decode_utf8( $svg );
226}
227
b02332ca 228sub _graph_from_bio {
229 my $tree = shift;
230 my $graph = Graph->new( 'undirected' => 1 );
231 # Give all the intermediate anonymous nodes a name.
232 my $i = 0;
233 foreach my $n ( @{$tree->get_entities} ) {
234 next if $n->get_name;
235 $n->set_name( $i++ );
236 }
237 my $root = $tree->get_root->get_name;
238 $graph->add_vertex( $root );
239 _add_tree_children( $graph, $root, $tree->get_root->get_children() );
240 return $graph;
241}
242
243sub _add_tree_children {
244 my( $graph, $parent, $tree_children ) = @_;
245 foreach my $c ( @$tree_children ) {
246 my $child = $c->get_name;
247 $graph->add_vertex( $child );
248 $graph->add_path( $parent, $child );
249 _add_tree_children( $graph, $child, $c->get_children() );
250 }
251}
edac47cc 252
253sub throw {
254 Text::Tradition::Error->throw(
255 'ident' => 'StemmaUtil error',
256 'message' => $_[0],
257 );
258}
259
027d819c 2601;
261
262=head1 LICENSE
263
264This package is free software and is provided "as is" without express
265or implied warranty. You can redistribute it and/or modify it under
266the same terms as Perl itself.
267
268=head1 AUTHOR
269
270Tara L Andrews E<lt>aurum@cpan.orgE<gt>