move more procedural stuff out of Stemma.pm into StemmaUtil
[scpubgit/stemmatology.git] / lib / Text / Tradition / StemmaUtil.pm
CommitLineData
68454b71 1package Text::Tradition::StemmaUtil;
2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw/ @EXPORT_OK /;
b02332ca 7use Bio::Phylo::IO;
8use File::chdir;
9use File::Temp;
10use File::Which;
11use Graph;
12use Graph::Reader::Dot;
13use IPC::Run qw/ run binary /;
14@EXPORT_OK = qw/ make_character_matrix character_input phylip_pars parse_newick /;
68454b71 15
16sub make_character_matrix {
17 my( $table ) = @_;
18 # Push the names of the witnesses to initialize the rows of the matrix.
19 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
20 @{$table->{'alignment'}};
21 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
22 # First implementation: make dumb alignment table, caring about
23 # nothing except which reading is in which position.
24 my @pos_readings = map { $_->{'tokens'}->[$token_index] }
25 @{$table->{'alignment'}};
26 my @pos_text = map { $_ ? $_->{'t'} : $_ } @pos_readings;
27 my @chars = convert_characters( \@pos_text );
28 foreach my $idx ( 0 .. $#matrix ) {
29 push( @{$matrix[$idx]}, $chars[$idx] );
30 }
31 }
32 return \@matrix;
33}
34
35# Helper function to make the witness name something legal for pars
36
37sub _normalize_witname {
38 my( $witname ) = @_;
39 $witname =~ s/\s+/ /g;
40 $witname =~ s/[\[\]\(\)\:;,]//g;
41 $witname = substr( $witname, 0, 10 );
42 return sprintf( "%-10s", $witname );
43}
44
45sub convert_characters {
46 my $row = shift;
47 # This is a simple algorithm that treats every reading as different.
48 # Eventually we will want to be able to specify how relationships
49 # affect the character matrix.
50 my %unique = ( '__UNDEF__' => 'X',
51 '#LACUNA#' => '?',
52 );
53 my %count;
54 my $ctr = 0;
55 foreach my $word ( @$row ) {
56 if( $word && !exists $unique{$word} ) {
57 $unique{$word} = chr( 65 + $ctr );
58 $ctr++;
59 }
60 $count{$word}++ if $word;
61 }
62 # Try to keep variants under 8 by lacunizing any singletons.
63 if( scalar( keys %unique ) > 8 ) {
64 foreach my $word ( keys %count ) {
65 if( $count{$word} == 1 ) {
66 $unique{$word} = '?';
67 }
68 }
69 }
70 my %u = reverse %unique;
71 if( scalar( keys %u ) > 8 ) {
72 warn "Have more than 8 variants on this location; phylip will break";
73 }
74 my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row;
75 return @chars;
76}
77
b02332ca 78sub character_input {
68454b71 79 my $table = shift;
80 my $character_matrix = make_character_matrix( $table );
81 my $input = '';
82 my $rows = scalar @{$character_matrix};
83 my $columns = scalar @{$character_matrix->[0]} - 1;
84 $input .= "\t$rows\t$columns\n";
85 foreach my $row ( @{$character_matrix} ) {
86 $input .= join( '', @$row ) . "\n";
87 }
88 return $input;
89}
90
b02332ca 91sub phylip_pars {
92 my( $charmatrix ) = @_;
93 # Set up a temporary directory for all the default Phylip files.
94 my $phylip_dir = File::Temp->newdir();
95 # $phylip_dir->unlink_on_destroy(0);
96 # We need an infile, and we need a command input file.
97 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
98 print MATRIX $charmatrix;
99 close MATRIX;
100
101 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
102 ## TODO any configuration parameters we want to set here
103# U Search for best tree? Yes
104# S Search option? More thorough search
105# V Number of trees to save? 100
106# J Randomize input order of species? No. Use input order
107# O Outgroup root? No, use as outgroup species 1
108# T Use Threshold parsimony? No, use ordinary parsimony
109# W Sites weighted? No
110# M Analyze multiple data sets? No
111# I Input species interleaved? Yes
112# 0 Terminal type (IBM PC, ANSI, none)? ANSI
113# 1 Print out the data at start of run No
114# 2 Print indications of progress of run Yes
115# 3 Print out tree Yes
116# 4 Print out steps in each site No
117# 5 Print character at all nodes of tree No
118# 6 Write out trees onto tree file? Yes
119 print CMD "Y\n";
120 close CMD;
121
122 # And then we run the program.
123 my $program = File::Which::which( 'pars' );
124 unless( -x $program ) {
125 return( undef, "Phylip pars not found in path" );
126 }
127
128 {
129 # We need to run it in our temporary directory where we have created
130 # all the expected files.
131 local $CWD = $phylip_dir;
132 my @cmd = ( $program );
133 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
134 }
135 # Now our output should be in 'outfile' and our tree in 'outtree',
136 # both in the temp directory.
137
138 my @outtree;
139 if( -f "$phylip_dir/outtree" ) {
140 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
141 @outtree = <TREE>;
142 close TREE;
143 }
144 return( 1, join( '', @outtree ) ) if @outtree;
145
146 my @error;
147 if( -f "$phylip_dir/outfile" ) {
148 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
149 @error = <OUTPUT>;
150 close OUTPUT;
151 } else {
152 push( @error, "Neither outtree nor output file was produced!" );
153 }
154 return( undef, join( '', @error ) );
155}
156
157sub parse_newick {
158 my $newick = shift;
159 my @trees;
160 # Parse the result into a tree
161 my $forest = Bio::Phylo::IO->parse(
162 -format => 'newick',
163 -string => $newick,
164 );
165 # Turn the tree into a graph, starting with the root node
166 foreach my $tree ( @{$forest->get_entities} ) {
167 push( @trees, _graph_from_bio( $tree ) );
168 }
169 return \@trees;
170}
171
172sub _graph_from_bio {
173 my $tree = shift;
174 my $graph = Graph->new( 'undirected' => 1 );
175 # Give all the intermediate anonymous nodes a name.
176 my $i = 0;
177 foreach my $n ( @{$tree->get_entities} ) {
178 next if $n->get_name;
179 $n->set_name( $i++ );
180 }
181 my $root = $tree->get_root->get_name;
182 $graph->add_vertex( $root );
183 _add_tree_children( $graph, $root, $tree->get_root->get_children() );
184 return $graph;
185}
186
187sub _add_tree_children {
188 my( $graph, $parent, $tree_children ) = @_;
189 foreach my $c ( @$tree_children ) {
190 my $child = $c->get_name;
191 $graph->add_vertex( $child );
192 $graph->add_path( $parent, $child );
193 _add_tree_children( $graph, $child, $c->get_children() );
194 }
195}