1 package Text::Tradition::StemmaUtil;
6 use vars qw/ @EXPORT_OK /;
8 use Encode qw( decode_utf8 );
13 use Graph::Reader::Dot;
14 use IPC::Run qw/ run binary /;
15 @EXPORT_OK = qw/ make_character_matrix character_input phylip_pars
16 parse_newick newick_to_svg /;
18 sub make_character_matrix {
20 # Push the names of the witnesses to initialize the rows of the matrix.
21 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
22 @{$table->{'alignment'}};
23 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
24 # First implementation: make dumb alignment table, caring about
25 # nothing except which reading is in which position.
26 my @pos_readings = map { $_->{'tokens'}->[$token_index] }
27 @{$table->{'alignment'}};
28 my @pos_text = map { $_ ? $_->{'t'} : $_ } @pos_readings;
29 my @chars = convert_characters( \@pos_text );
30 foreach my $idx ( 0 .. $#matrix ) {
31 push( @{$matrix[$idx]}, $chars[$idx] );
37 # Helper function to make the witness name something legal for pars
39 sub _normalize_witname {
41 $witname =~ s/\s+/ /g;
42 $witname =~ s/[\[\]\(\)\:;,]//g;
43 $witname = substr( $witname, 0, 10 );
44 return sprintf( "%-10s", $witname );
47 sub convert_characters {
49 # This is a simple algorithm that treats every reading as different.
50 # Eventually we will want to be able to specify how relationships
51 # affect the character matrix.
52 my %unique = ( '__UNDEF__' => 'X',
57 foreach my $word ( @$row ) {
58 if( $word && !exists $unique{$word} ) {
59 $unique{$word} = chr( 65 + $ctr );
62 $count{$word}++ if $word;
64 # Try to keep variants under 8 by lacunizing any singletons.
65 if( scalar( keys %unique ) > 8 ) {
66 foreach my $word ( keys %count ) {
67 if( $count{$word} == 1 ) {
72 my %u = reverse %unique;
73 if( scalar( keys %u ) > 8 ) {
74 warn "Have more than 8 variants on this location; phylip will break";
76 my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row;
82 my $character_matrix = make_character_matrix( $table );
84 my $rows = scalar @{$character_matrix};
85 my $columns = scalar @{$character_matrix->[0]} - 1;
86 $input .= "\t$rows\t$columns\n";
87 foreach my $row ( @{$character_matrix} ) {
88 $input .= join( '', @$row ) . "\n";
94 my( $charmatrix ) = @_;
95 # Set up a temporary directory for all the default Phylip files.
96 my $phylip_dir = File::Temp->newdir();
97 # $phylip_dir->unlink_on_destroy(0);
98 # We need an infile, and we need a command input file.
99 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
100 print MATRIX $charmatrix;
103 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
104 ## TODO any configuration parameters we want to set here
105 # U Search for best tree? Yes
106 # S Search option? More thorough search
107 # V Number of trees to save? 100
108 # J Randomize input order of species? No. Use input order
109 # O Outgroup root? No, use as outgroup species 1
110 # T Use Threshold parsimony? No, use ordinary parsimony
111 # W Sites weighted? No
112 # M Analyze multiple data sets? No
113 # I Input species interleaved? Yes
114 # 0 Terminal type (IBM PC, ANSI, none)? ANSI
115 # 1 Print out the data at start of run No
116 # 2 Print indications of progress of run Yes
117 # 3 Print out tree Yes
118 # 4 Print out steps in each site No
119 # 5 Print character at all nodes of tree No
120 # 6 Write out trees onto tree file? Yes
124 # And then we run the program.
125 my $program = File::Which::which( 'pars' );
126 unless( -x $program ) {
127 return( undef, "Phylip pars not found in path" );
131 # We need to run it in our temporary directory where we have created
132 # all the expected files.
133 local $CWD = $phylip_dir;
134 my @cmd = ( $program );
135 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
137 # Now our output should be in 'outfile' and our tree in 'outtree',
138 # both in the temp directory.
141 if( -f "$phylip_dir/outtree" ) {
142 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
146 return( 1, join( '', @outtree ) ) if @outtree;
149 if( -f "$phylip_dir/outfile" ) {
150 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
154 push( @error, "Neither outtree nor output file was produced!" );
156 return( undef, join( '', @error ) );
162 # Parse the result into a tree
163 my $forest = Bio::Phylo::IO->parse(
167 # Turn the tree into a graph, starting with the root node
168 foreach my $tree ( @{$forest->get_entities} ) {
169 push( @trees, _graph_from_bio( $tree ) );
176 my $program = File::Which::which( 'figtree' );
177 unless( -x $program ) {
178 warn "FigTree commandline utility not found in path";
182 my $nfile = File::Temp->new();
183 print $nfile $newick;
185 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
186 run( \@cmd, ">", binary(), \$svg );
187 return decode_utf8( $svg );
190 sub _graph_from_bio {
192 my $graph = Graph->new( 'undirected' => 1 );
193 # Give all the intermediate anonymous nodes a name.
195 foreach my $n ( @{$tree->get_entities} ) {
196 next if $n->get_name;
197 $n->set_name( $i++ );
199 my $root = $tree->get_root->get_name;
200 $graph->add_vertex( $root );
201 _add_tree_children( $graph, $root, $tree->get_root->get_children() );
205 sub _add_tree_children {
206 my( $graph, $parent, $tree_children ) = @_;
207 foreach my $c ( @$tree_children ) {
208 my $child = $c->get_name;
209 $graph->add_vertex( $child );
210 $graph->add_path( $parent, $child );
211 _add_tree_children( $graph, $child, $c->get_children() );