fix return value mod in distance_trees
[scpubgit/stemmatology.git] / lib / Text / Tradition / StemmaUtil.pm
CommitLineData
68454b71 1package Text::Tradition::StemmaUtil;
2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw/ @EXPORT_OK /;
b02332ca 7use Bio::Phylo::IO;
27e2a8fe 8use Encode qw( decode_utf8 );
b02332ca 9use File::chdir;
10use File::Temp;
11use File::Which;
12use Graph;
13use Graph::Reader::Dot;
14use IPC::Run qw/ run binary /;
edac47cc 15use Text::Tradition::Error;
27e2a8fe 16@EXPORT_OK = qw/ make_character_matrix character_input phylip_pars
17 parse_newick newick_to_svg /;
68454b71 18
19sub make_character_matrix {
20 my( $table ) = @_;
21 # Push the names of the witnesses to initialize the rows of the matrix.
22 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
23 @{$table->{'alignment'}};
24 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
25 # First implementation: make dumb alignment table, caring about
26 # nothing except which reading is in which position.
27 my @pos_readings = map { $_->{'tokens'}->[$token_index] }
28 @{$table->{'alignment'}};
29 my @pos_text = map { $_ ? $_->{'t'} : $_ } @pos_readings;
30 my @chars = convert_characters( \@pos_text );
31 foreach my $idx ( 0 .. $#matrix ) {
32 push( @{$matrix[$idx]}, $chars[$idx] );
33 }
34 }
35 return \@matrix;
36}
37
38# Helper function to make the witness name something legal for pars
39
40sub _normalize_witname {
41 my( $witname ) = @_;
42 $witname =~ s/\s+/ /g;
43 $witname =~ s/[\[\]\(\)\:;,]//g;
44 $witname = substr( $witname, 0, 10 );
45 return sprintf( "%-10s", $witname );
46}
47
48sub convert_characters {
49 my $row = shift;
50 # This is a simple algorithm that treats every reading as different.
51 # Eventually we will want to be able to specify how relationships
52 # affect the character matrix.
53 my %unique = ( '__UNDEF__' => 'X',
54 '#LACUNA#' => '?',
55 );
56 my %count;
57 my $ctr = 0;
58 foreach my $word ( @$row ) {
59 if( $word && !exists $unique{$word} ) {
60 $unique{$word} = chr( 65 + $ctr );
61 $ctr++;
62 }
63 $count{$word}++ if $word;
64 }
65 # Try to keep variants under 8 by lacunizing any singletons.
66 if( scalar( keys %unique ) > 8 ) {
67 foreach my $word ( keys %count ) {
68 if( $count{$word} == 1 ) {
69 $unique{$word} = '?';
70 }
71 }
72 }
73 my %u = reverse %unique;
74 if( scalar( keys %u ) > 8 ) {
75 warn "Have more than 8 variants on this location; phylip will break";
76 }
77 my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row;
78 return @chars;
79}
80
b02332ca 81sub character_input {
68454b71 82 my $table = shift;
83 my $character_matrix = make_character_matrix( $table );
84 my $input = '';
85 my $rows = scalar @{$character_matrix};
86 my $columns = scalar @{$character_matrix->[0]} - 1;
87 $input .= "\t$rows\t$columns\n";
88 foreach my $row ( @{$character_matrix} ) {
89 $input .= join( '', @$row ) . "\n";
90 }
91 return $input;
92}
93
b02332ca 94sub phylip_pars {
95 my( $charmatrix ) = @_;
96 # Set up a temporary directory for all the default Phylip files.
97 my $phylip_dir = File::Temp->newdir();
98 # $phylip_dir->unlink_on_destroy(0);
99 # We need an infile, and we need a command input file.
100 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
101 print MATRIX $charmatrix;
102 close MATRIX;
103
104 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
105 ## TODO any configuration parameters we want to set here
106# U Search for best tree? Yes
107# S Search option? More thorough search
108# V Number of trees to save? 100
109# J Randomize input order of species? No. Use input order
110# O Outgroup root? No, use as outgroup species 1
111# T Use Threshold parsimony? No, use ordinary parsimony
112# W Sites weighted? No
113# M Analyze multiple data sets? No
114# I Input species interleaved? Yes
115# 0 Terminal type (IBM PC, ANSI, none)? ANSI
116# 1 Print out the data at start of run No
117# 2 Print indications of progress of run Yes
118# 3 Print out tree Yes
119# 4 Print out steps in each site No
120# 5 Print character at all nodes of tree No
121# 6 Write out trees onto tree file? Yes
122 print CMD "Y\n";
123 close CMD;
124
125 # And then we run the program.
126 my $program = File::Which::which( 'pars' );
127 unless( -x $program ) {
edac47cc 128 throw( "Phylip pars not found in path" );
b02332ca 129 }
130
131 {
132 # We need to run it in our temporary directory where we have created
133 # all the expected files.
134 local $CWD = $phylip_dir;
135 my @cmd = ( $program );
136 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
137 }
138 # Now our output should be in 'outfile' and our tree in 'outtree',
139 # both in the temp directory.
140
141 my @outtree;
142 if( -f "$phylip_dir/outtree" ) {
143 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
144 @outtree = <TREE>;
145 close TREE;
146 }
edac47cc 147 return join( '', @outtree ) if @outtree;
b02332ca 148
69403daa 149 # If we got this far, we are about to throw an error.
b02332ca 150 my @error;
151 if( -f "$phylip_dir/outfile" ) {
152 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
153 @error = <OUTPUT>;
154 close OUTPUT;
155 } else {
156 push( @error, "Neither outtree nor output file was produced!" );
157 }
edac47cc 158 throw( join( '', @error ) );
b02332ca 159}
160
161sub parse_newick {
162 my $newick = shift;
163 my @trees;
164 # Parse the result into a tree
165 my $forest = Bio::Phylo::IO->parse(
166 -format => 'newick',
167 -string => $newick,
168 );
169 # Turn the tree into a graph, starting with the root node
170 foreach my $tree ( @{$forest->get_entities} ) {
171 push( @trees, _graph_from_bio( $tree ) );
172 }
173 return \@trees;
174}
175
27e2a8fe 176sub newick_to_svg {
177 my $newick = shift;
178 my $program = File::Which::which( 'figtree' );
179 unless( -x $program ) {
edac47cc 180 throw( "FigTree commandline utility not found in path" );
27e2a8fe 181 }
182 my $svg;
183 my $nfile = File::Temp->new();
184 print $nfile $newick;
185 close $nfile;
186 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
187 run( \@cmd, ">", binary(), \$svg );
188 return decode_utf8( $svg );
189}
190
b02332ca 191sub _graph_from_bio {
192 my $tree = shift;
193 my $graph = Graph->new( 'undirected' => 1 );
194 # Give all the intermediate anonymous nodes a name.
195 my $i = 0;
196 foreach my $n ( @{$tree->get_entities} ) {
197 next if $n->get_name;
198 $n->set_name( $i++ );
199 }
200 my $root = $tree->get_root->get_name;
201 $graph->add_vertex( $root );
202 _add_tree_children( $graph, $root, $tree->get_root->get_children() );
203 return $graph;
204}
205
206sub _add_tree_children {
207 my( $graph, $parent, $tree_children ) = @_;
208 foreach my $c ( @$tree_children ) {
209 my $child = $c->get_name;
210 $graph->add_vertex( $child );
211 $graph->add_path( $parent, $child );
212 _add_tree_children( $graph, $child, $c->get_children() );
213 }
214}
edac47cc 215
216sub throw {
217 Text::Tradition::Error->throw(
218 'ident' => 'StemmaUtil error',
219 'message' => $_[0],
220 );
221}
222