change worker script to using 'say'; timestamp the log
[scpubgit/stemmatology.git] / analysis / lib / Text / Tradition / StemmaUtil.pm
CommitLineData
68454b71 1package Text::Tradition::StemmaUtil;
2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw/ @EXPORT_OK /;
b02332ca 7use Bio::Phylo::IO;
27e2a8fe 8use Encode qw( decode_utf8 );
b02332ca 9use File::chdir;
10use File::Temp;
11use File::Which;
12use Graph;
b02332ca 13use IPC::Run qw/ run binary /;
edac47cc 14use Text::Tradition::Error;
ea45d2a6 15use Text::Tradition::Stemma;
9457207b 16@EXPORT_OK = qw/ character_input phylip_pars parse_newick newick_to_svg /;
68454b71 17
027d819c 18=head1 NAME
19
20Text::Tradition::StemmaUtil - standalone utilities for distance tree calculations
21
22=head1 DESCRIPTION
23
24This package contains a set of utilities for running phylogenetic analysis on
25text collations.
26
27=head1 SUBROUTINES
28
ea45d2a6 29=head2 character_input( $tradition, $opts )
9457207b 30
31Returns a character matrix string suitable for Phylip programs, which
32corresponds to the given alignment table. See Text::Tradition::Collation
ea45d2a6 33for a description of the alignment table format. Options include:
34
35=over
36
37=item * exclude_layer - Exclude layered witnesses from the character input,
38using only the 'main' text of the witnesses in the tradition.
39
40=item * collapse - A reference to an array of relationship names that should
41be treated as equivalent for the purposes of generating the character matrix.
42
43=back
9457207b 44
027d819c 45=cut
46
9457207b 47sub character_input {
ea45d2a6 48 my ( $tradition, $opts ) = @_;
49 my $table = $tradition->collation->alignment_table;
50 if( $opts->{exclude_layer} ) {
51 # Filter out all alignment table rows that do not correspond
52 # to a named witness - these are the layered witnesses.
b39e7cb5 53 my $newtable = { alignment => [], length => $table->{length} };
54 foreach my $row ( @{$table->{alignment}} ) {
ea45d2a6 55 if( $tradition->has_witness( $row->{witness} ) ) {
56 push( @{$newtable->{alignment}}, $row );
57 }
58 }
59 $table = $newtable;
60 }
61 my $character_matrix = _make_character_matrix( $table, $opts );
9457207b 62 my $input = '';
63 my $rows = scalar @{$character_matrix};
64 my $columns = scalar @{$character_matrix->[0]} - 1;
65 $input .= "\t$rows\t$columns\n";
66 foreach my $row ( @{$character_matrix} ) {
67 $input .= join( '', @$row ) . "\n";
68 }
69 return $input;
70}
71
027d819c 72sub _make_character_matrix {
ea45d2a6 73 my( $table, $opts ) = @_;
68454b71 74 # Push the names of the witnesses to initialize the rows of the matrix.
75 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
76 @{$table->{'alignment'}};
77 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
ea45d2a6 78 my @pos_tokens = map { $_->{'tokens'}->[$token_index] }
68454b71 79 @{$table->{'alignment'}};
ea45d2a6 80 my @pos_readings = map { $_ ? $_->{'t'} : $_ } @pos_tokens;
81 my @chars = _convert_characters( \@pos_readings, $opts );
68454b71 82 foreach my $idx ( 0 .. $#matrix ) {
83 push( @{$matrix[$idx]}, $chars[$idx] );
84 }
85 }
86 return \@matrix;
87}
88
89# Helper function to make the witness name something legal for pars
90
91sub _normalize_witname {
92 my( $witname ) = @_;
93 $witname =~ s/\s+/ /g;
94 $witname =~ s/[\[\]\(\)\:;,]//g;
95 $witname = substr( $witname, 0, 10 );
96 return sprintf( "%-10s", $witname );
97}
98
027d819c 99sub _convert_characters {
ea45d2a6 100 my( $row, $opts ) = @_;
68454b71 101 # This is a simple algorithm that treats every reading as different.
102 # Eventually we will want to be able to specify how relationships
103 # affect the character matrix.
104 my %unique = ( '__UNDEF__' => 'X',
105 '#LACUNA#' => '?',
106 );
ea45d2a6 107 my %equivalent;
68454b71 108 my %count;
109 my $ctr = 0;
ea45d2a6 110 foreach my $rdg ( @$row ) {
111 next unless $rdg;
112 next if $rdg->is_lacuna;
113 next if exists $unique{$rdg->text};
114 if( ref( $opts->{'collapse'} ) eq 'ARRAY' ) {
115 my @exclude_types = @{$opts->{'collapse'}};
116 my @set = $rdg->related_readings( sub { my $rel = shift;
117 $rel->colocated && grep { $rel->type eq $_ } @exclude_types } );
118 push( @set, $rdg );
119 my $char = chr( 65 + $ctr++ );
120 map { $unique{$_->text} = $char } @set;
121 $count{$rdg->text} += scalar @set;
122 } else {
123 $unique{$rdg->text} = chr( 65 + $ctr++ );
124 $count{$rdg->text}++;
125 }
68454b71 126 }
127 # Try to keep variants under 8 by lacunizing any singletons.
128 if( scalar( keys %unique ) > 8 ) {
129 foreach my $word ( keys %count ) {
130 if( $count{$word} == 1 ) {
131 $unique{$word} = '?';
132 }
133 }
134 }
135 my %u = reverse %unique;
136 if( scalar( keys %u ) > 8 ) {
137 warn "Have more than 8 variants on this location; phylip will break";
138 }
ea45d2a6 139 my @chars = map { $_ ? $unique{$_->text} : $unique{'__UNDEF__' } } @$row;
68454b71 140 return @chars;
141}
142
027d819c 143=head2 phylip_pars( $character_matrix )
144
145Runs Phylip Pars on the given character matrix. Returns results in Newick format.
146
147=cut
148
b02332ca 149sub phylip_pars {
150 my( $charmatrix ) = @_;
151 # Set up a temporary directory for all the default Phylip files.
152 my $phylip_dir = File::Temp->newdir();
153 # $phylip_dir->unlink_on_destroy(0);
154 # We need an infile, and we need a command input file.
155 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
156 print MATRIX $charmatrix;
157 close MATRIX;
158
159 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
160 ## TODO any configuration parameters we want to set here
161# U Search for best tree? Yes
162# S Search option? More thorough search
163# V Number of trees to save? 100
164# J Randomize input order of species? No. Use input order
165# O Outgroup root? No, use as outgroup species 1
166# T Use Threshold parsimony? No, use ordinary parsimony
167# W Sites weighted? No
168# M Analyze multiple data sets? No
169# I Input species interleaved? Yes
170# 0 Terminal type (IBM PC, ANSI, none)? ANSI
171# 1 Print out the data at start of run No
172# 2 Print indications of progress of run Yes
173# 3 Print out tree Yes
174# 4 Print out steps in each site No
175# 5 Print character at all nodes of tree No
176# 6 Write out trees onto tree file? Yes
177 print CMD "Y\n";
178 close CMD;
179
180 # And then we run the program.
181 my $program = File::Which::which( 'pars' );
b39e7cb5 182 unless( $program && -x $program ) {
edac47cc 183 throw( "Phylip pars not found in path" );
b02332ca 184 }
185
186 {
187 # We need to run it in our temporary directory where we have created
188 # all the expected files.
189 local $CWD = $phylip_dir;
190 my @cmd = ( $program );
191 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
192 }
193 # Now our output should be in 'outfile' and our tree in 'outtree',
194 # both in the temp directory.
195
196 my @outtree;
197 if( -f "$phylip_dir/outtree" ) {
198 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
199 @outtree = <TREE>;
200 close TREE;
201 }
edac47cc 202 return join( '', @outtree ) if @outtree;
b02332ca 203
69403daa 204 # If we got this far, we are about to throw an error.
b02332ca 205 my @error;
206 if( -f "$phylip_dir/outfile" ) {
207 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
208 @error = <OUTPUT>;
209 close OUTPUT;
210 } else {
211 push( @error, "Neither outtree nor output file was produced!" );
212 }
edac47cc 213 throw( join( '', @error ) );
b02332ca 214}
215
027d819c 216=head2 parse_newick( $newick_string )
217
ea45d2a6 218Parses the given Newick tree(s) into one or more Stemma objects with
219undirected graphs.
027d819c 220
221=cut
222
b02332ca 223sub parse_newick {
224 my $newick = shift;
ea45d2a6 225 my @stemmata;
b02332ca 226 # Parse the result into a tree
227 my $forest = Bio::Phylo::IO->parse(
228 -format => 'newick',
229 -string => $newick,
230 );
231 # Turn the tree into a graph, starting with the root node
232 foreach my $tree ( @{$forest->get_entities} ) {
ea45d2a6 233 my $stemma = Text::Tradition::Stemma->new(
98f22390 234 graph => _graph_from_bio( $tree ) );
ea45d2a6 235 push( @stemmata, $stemma );
236 }
237 return \@stemmata;
238}
239
240sub _graph_from_bio {
241 my $tree = shift;
242 my $graph = Graph->new( 'undirected' => 1 );
243 # Give all the intermediate anonymous nodes a name.
244 my $i = 0;
245 my $classes = {};
246 foreach my $n ( @{$tree->get_terminals} ) {
247 # The terminal nodes are our named witnesses.
248 $classes->{$n->get_name} = 'extant';
249 }
250 foreach my $n ( @{$tree->get_internals} ) {
251 unless( defined $n->get_name && $n->get_name ne '' ) {
252 # Get an integer, make sure it's a unique name
253 while( exists $classes->{$i} ) {
254 $i++;
255 }
256 $n->set_name( $i++ );
257 }
258 $classes->{$n->get_name} = 'hypothetical';
259 }
260 _add_tree_children( $graph, $classes, undef, [ $tree->get_root ]);
261 return $graph;
262}
263
264sub _add_tree_children {
265 my( $graph, $classes, $parent, $tree_children ) = @_;
266 foreach my $c ( @$tree_children ) {
267 my $child = $c->get_name;
268 $graph->add_vertex( $child );
269 $graph->set_vertex_attribute( $child, 'class', $classes->{$child} );
270 $graph->add_path( $parent, $child ) if defined $parent;
271 _add_tree_children( $graph, $classes, $child, $c->get_children() );
b02332ca 272 }
b02332ca 273}
274
027d819c 275=head2 newick_to_svg( $newick_string )
276
277Uses the FigTree utility (if installed) to transform the given Newick tree(s)
278into a graph visualization.
279
280=cut
281
27e2a8fe 282sub newick_to_svg {
283 my $newick = shift;
284 my $program = File::Which::which( 'figtree' );
285 unless( -x $program ) {
edac47cc 286 throw( "FigTree commandline utility not found in path" );
27e2a8fe 287 }
288 my $svg;
289 my $nfile = File::Temp->new();
290 print $nfile $newick;
291 close $nfile;
292 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
293 run( \@cmd, ">", binary(), \$svg );
294 return decode_utf8( $svg );
295}
296
edac47cc 297sub throw {
298 Text::Tradition::Error->throw(
299 'ident' => 'StemmaUtil error',
300 'message' => $_[0],
301 );
302}
303
027d819c 3041;
305
306=head1 LICENSE
307
308This package is free software and is provided "as is" without express
309or implied warranty. You can redistribute it and/or modify it under
310the same terms as Perl itself.
311
312=head1 AUTHOR
313
314Tara L Andrews E<lt>aurum@cpan.orgE<gt>