huge pile of pod updates
[scpubgit/stemmatology.git] / lib / Text / Tradition / StemmaUtil.pm
CommitLineData
68454b71 1package Text::Tradition::StemmaUtil;
2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw/ @EXPORT_OK /;
b02332ca 7use Bio::Phylo::IO;
27e2a8fe 8use Encode qw( decode_utf8 );
b02332ca 9use File::chdir;
10use File::Temp;
11use File::Which;
12use Graph;
13use Graph::Reader::Dot;
14use IPC::Run qw/ run binary /;
edac47cc 15use Text::Tradition::Error;
27e2a8fe 16@EXPORT_OK = qw/ make_character_matrix character_input phylip_pars
17 parse_newick newick_to_svg /;
68454b71 18
027d819c 19=head1 NAME
20
21Text::Tradition::StemmaUtil - standalone utilities for distance tree calculations
22
23=head1 DESCRIPTION
24
25This package contains a set of utilities for running phylogenetic analysis on
26text collations.
27
28=head1 SUBROUTINES
29
30=cut
31
32sub _make_character_matrix {
68454b71 33 my( $table ) = @_;
34 # Push the names of the witnesses to initialize the rows of the matrix.
35 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
36 @{$table->{'alignment'}};
37 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
38 # First implementation: make dumb alignment table, caring about
39 # nothing except which reading is in which position.
40 my @pos_readings = map { $_->{'tokens'}->[$token_index] }
41 @{$table->{'alignment'}};
42 my @pos_text = map { $_ ? $_->{'t'} : $_ } @pos_readings;
027d819c 43 my @chars = _convert_characters( \@pos_text );
68454b71 44 foreach my $idx ( 0 .. $#matrix ) {
45 push( @{$matrix[$idx]}, $chars[$idx] );
46 }
47 }
48 return \@matrix;
49}
50
51# Helper function to make the witness name something legal for pars
52
53sub _normalize_witname {
54 my( $witname ) = @_;
55 $witname =~ s/\s+/ /g;
56 $witname =~ s/[\[\]\(\)\:;,]//g;
57 $witname = substr( $witname, 0, 10 );
58 return sprintf( "%-10s", $witname );
59}
60
027d819c 61sub _convert_characters {
68454b71 62 my $row = shift;
63 # This is a simple algorithm that treats every reading as different.
64 # Eventually we will want to be able to specify how relationships
65 # affect the character matrix.
66 my %unique = ( '__UNDEF__' => 'X',
67 '#LACUNA#' => '?',
68 );
69 my %count;
70 my $ctr = 0;
71 foreach my $word ( @$row ) {
72 if( $word && !exists $unique{$word} ) {
73 $unique{$word} = chr( 65 + $ctr );
74 $ctr++;
75 }
76 $count{$word}++ if $word;
77 }
78 # Try to keep variants under 8 by lacunizing any singletons.
79 if( scalar( keys %unique ) > 8 ) {
80 foreach my $word ( keys %count ) {
81 if( $count{$word} == 1 ) {
82 $unique{$word} = '?';
83 }
84 }
85 }
86 my %u = reverse %unique;
87 if( scalar( keys %u ) > 8 ) {
88 warn "Have more than 8 variants on this location; phylip will break";
89 }
90 my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row;
91 return @chars;
92}
93
027d819c 94=head2 character_input( $alignment_table )
95
96Returns a character matrix string suitable for Phylip programs, which
97corresponds to the given alignment table. See Text::Tradition::Collation
98for a description of the alignment table format.
99
100=cut
101
b02332ca 102sub character_input {
68454b71 103 my $table = shift;
027d819c 104 my $character_matrix = _make_character_matrix( $table );
68454b71 105 my $input = '';
106 my $rows = scalar @{$character_matrix};
107 my $columns = scalar @{$character_matrix->[0]} - 1;
108 $input .= "\t$rows\t$columns\n";
109 foreach my $row ( @{$character_matrix} ) {
110 $input .= join( '', @$row ) . "\n";
111 }
112 return $input;
113}
114
027d819c 115=head2 phylip_pars( $character_matrix )
116
117Runs Phylip Pars on the given character matrix. Returns results in Newick format.
118
119=cut
120
b02332ca 121sub phylip_pars {
122 my( $charmatrix ) = @_;
123 # Set up a temporary directory for all the default Phylip files.
124 my $phylip_dir = File::Temp->newdir();
125 # $phylip_dir->unlink_on_destroy(0);
126 # We need an infile, and we need a command input file.
127 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
128 print MATRIX $charmatrix;
129 close MATRIX;
130
131 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
132 ## TODO any configuration parameters we want to set here
133# U Search for best tree? Yes
134# S Search option? More thorough search
135# V Number of trees to save? 100
136# J Randomize input order of species? No. Use input order
137# O Outgroup root? No, use as outgroup species 1
138# T Use Threshold parsimony? No, use ordinary parsimony
139# W Sites weighted? No
140# M Analyze multiple data sets? No
141# I Input species interleaved? Yes
142# 0 Terminal type (IBM PC, ANSI, none)? ANSI
143# 1 Print out the data at start of run No
144# 2 Print indications of progress of run Yes
145# 3 Print out tree Yes
146# 4 Print out steps in each site No
147# 5 Print character at all nodes of tree No
148# 6 Write out trees onto tree file? Yes
149 print CMD "Y\n";
150 close CMD;
151
152 # And then we run the program.
153 my $program = File::Which::which( 'pars' );
154 unless( -x $program ) {
edac47cc 155 throw( "Phylip pars not found in path" );
b02332ca 156 }
157
158 {
159 # We need to run it in our temporary directory where we have created
160 # all the expected files.
161 local $CWD = $phylip_dir;
162 my @cmd = ( $program );
163 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
164 }
165 # Now our output should be in 'outfile' and our tree in 'outtree',
166 # both in the temp directory.
167
168 my @outtree;
169 if( -f "$phylip_dir/outtree" ) {
170 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
171 @outtree = <TREE>;
172 close TREE;
173 }
edac47cc 174 return join( '', @outtree ) if @outtree;
b02332ca 175
69403daa 176 # If we got this far, we are about to throw an error.
b02332ca 177 my @error;
178 if( -f "$phylip_dir/outfile" ) {
179 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
180 @error = <OUTPUT>;
181 close OUTPUT;
182 } else {
183 push( @error, "Neither outtree nor output file was produced!" );
184 }
edac47cc 185 throw( join( '', @error ) );
b02332ca 186}
187
027d819c 188=head2 parse_newick( $newick_string )
189
190Parses the given Newick tree(s) into one or more undirected Graph objects.
191
192=cut
193
b02332ca 194sub parse_newick {
195 my $newick = shift;
196 my @trees;
197 # Parse the result into a tree
198 my $forest = Bio::Phylo::IO->parse(
199 -format => 'newick',
200 -string => $newick,
201 );
202 # Turn the tree into a graph, starting with the root node
203 foreach my $tree ( @{$forest->get_entities} ) {
204 push( @trees, _graph_from_bio( $tree ) );
205 }
206 return \@trees;
207}
208
027d819c 209=head2 newick_to_svg( $newick_string )
210
211Uses the FigTree utility (if installed) to transform the given Newick tree(s)
212into a graph visualization.
213
214=cut
215
27e2a8fe 216sub newick_to_svg {
217 my $newick = shift;
218 my $program = File::Which::which( 'figtree' );
219 unless( -x $program ) {
edac47cc 220 throw( "FigTree commandline utility not found in path" );
27e2a8fe 221 }
222 my $svg;
223 my $nfile = File::Temp->new();
224 print $nfile $newick;
225 close $nfile;
226 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
227 run( \@cmd, ">", binary(), \$svg );
228 return decode_utf8( $svg );
229}
230
b02332ca 231sub _graph_from_bio {
232 my $tree = shift;
233 my $graph = Graph->new( 'undirected' => 1 );
234 # Give all the intermediate anonymous nodes a name.
235 my $i = 0;
236 foreach my $n ( @{$tree->get_entities} ) {
237 next if $n->get_name;
238 $n->set_name( $i++ );
239 }
240 my $root = $tree->get_root->get_name;
241 $graph->add_vertex( $root );
242 _add_tree_children( $graph, $root, $tree->get_root->get_children() );
243 return $graph;
244}
245
246sub _add_tree_children {
247 my( $graph, $parent, $tree_children ) = @_;
248 foreach my $c ( @$tree_children ) {
249 my $child = $c->get_name;
250 $graph->add_vertex( $child );
251 $graph->add_path( $parent, $child );
252 _add_tree_children( $graph, $child, $c->get_children() );
253 }
254}
edac47cc 255
256sub throw {
257 Text::Tradition::Error->throw(
258 'ident' => 'StemmaUtil error',
259 'message' => $_[0],
260 );
261}
262
027d819c 2631;
264
265=head1 LICENSE
266
267This package is free software and is provided "as is" without express
268or implied warranty. You can redistribute it and/or modify it under
269the same terms as Perl itself.
270
271=head1 AUTHOR
272
273Tara L Andrews E<lt>aurum@cpan.orgE<gt>