1 package Text::Tradition::StemmaUtil;
6 use vars qw/ @EXPORT_OK /;
8 use Encode qw( decode_utf8 );
13 use IPC::Run qw/ run binary /;
14 use Text::Tradition::Error;
15 @EXPORT_OK = qw/ display_graph editable_graph
16 character_input phylip_pars parse_newick newick_to_svg /;
20 Text::Tradition::StemmaUtil - standalone utilities for stemma graph display and
21 distance tree calculations
25 This package contains a set of utilities for displaying arbitrary stemmata and
26 running phylogenetic analysis on text collations.
30 =head2 display_graph( $graph, $opts )
32 Returns a dot specification intended for display, according to the logical
33 attributes of the witnesses.
38 my( $graph, $opts ) = @_;
40 # Get default and specified options
43 'bgcolor' => 'transparent',
48 'fillcolor' => 'white',
50 'shape' => 'ellipse', # Shape for the extant nodes
53 'arrowhead' => 'none',
55 @graphopts{ keys %{$opts->{'graph'}} } = values %{$opts->{'graph'}}
57 @nodeopts{ keys %{$opts->{'node'}} } = values %{$opts->{'node'}}
59 @edgeopts{ keys %{$opts->{'edge'}} } = values %{$opts->{'edge'}}
62 my $gdecl = $graph->is_directed ? 'digraph' : 'graph';
63 my $gname = $opts->{'name'} ? '"' . $opts->{'name'} . '"'
66 push( @dotlines, "$gdecl $gname {" );
67 ## Print out the global attributes
68 push( @dotlines, _make_dotline( 'graph', %graphopts ) ) if keys %graphopts;
69 push( @dotlines, _make_dotline( 'edge', %edgeopts ) ) if keys %edgeopts;
70 push( @dotlines, _make_dotline( 'node', %nodeopts ) ) if keys %nodeopts;
72 # Add each of the nodes.
73 foreach my $n ( $graph->vertices ) {
74 my %vattr = ( 'id' => $n ); # Set the SVG element ID to the sigil itself
75 if( $graph->has_vertex_attribute( $n, 'label' ) ) {
76 $vattr{'label'} = $graph->get_vertex_attribute( $n, 'label' );
78 push( @dotlines, _make_dotline( $n, %vattr ) );
80 # Add each of our edges.
81 foreach my $e ( $graph->edges ) {
82 my( $from, $to ) = map { _dotquote( $_ ) } @$e;
83 my $connector = $graph->is_directed ? '->' : '--';
84 push( @dotlines, " $from $connector $to;" );
86 push( @dotlines, '}' );
88 return join( "\n", @dotlines );
92 =head2 editable_graph( $graph, $opts )
94 Returns a dot specification of a stemma graph with logical witness features,
95 intended for editing the stemma definition.
100 my( $graph, $opts ) = @_;
103 my $join = ( $opts && exists $opts->{'linesep'} ) ? $opts->{'linesep'} : "\n";
104 my $fq = $opts->{'forcequote'};
105 my $gdecl = $graph->is_undirected ? 'graph' : 'digraph';
106 my $gname = exists $opts->{'name'} ? '"' . $opts->{'name'} . '"'
109 push( @dotlines, "$gdecl $gname {" );
110 my @real; # A cheap sort
111 foreach my $n ( sort $graph->vertices ) {
112 my $c = $graph->get_vertex_attribute( $n, 'class' );
113 $c = 'extant' unless $c;
114 if( $c eq 'extant' ) {
117 push( @dotlines, _make_dotline( $n, 'class' => $c, 'forcequote' => $fq ) );
120 # Now do the real ones
121 foreach my $n ( @real ) {
122 push( @dotlines, _make_dotline( $n, 'class' => 'extant', 'forcequote' => $fq ) );
124 foreach my $e ( sort _by_vertex $graph->edges ) {
125 my( $from, $to ) = map { _dotquote( $_ ) } @$e;
126 my $conn = $graph->is_undirected ? '--' : '->';
127 push( @dotlines, " $from $conn $to;" );
129 push( @dotlines, '}' );
130 return join( $join, @dotlines );
134 my( $obj, %attr ) = @_;
136 my $fq = delete $attr{forcequote};
137 foreach my $k ( keys %attr ) {
138 my $v = _dotquote( $attr{$k}, $fq );
139 push( @pairs, "$k=$v" );
141 return sprintf( " %s [ %s ];", _dotquote( $obj ), join( ', ', @pairs ) );
145 my( $str, $force ) = @_;
146 return $str if $str =~ /^[A-Za-z0-9]+$/;
148 $str = '"' . $str . '"';
153 return $a->[0].$a->[1] cmp $b->[0].$b->[1];
156 =head2 character_input( $tradition, $opts )
158 Returns a character matrix string suitable for Phylip programs, which
159 corresponds to the given alignment table. See Text::Tradition::Collation
160 for a description of the alignment table format. Options include:
164 =item * exclude_layer - Exclude layered witnesses from the character input,
165 using only the 'main' text of the witnesses in the tradition.
167 =item * collapse - A reference to an array of relationship names that should
168 be treated as equivalent for the purposes of generating the character matrix.
174 sub character_input {
175 my ( $tradition, $opts ) = @_;
176 my $table = $tradition->collation->alignment_table;
177 if( $opts->{exclude_layer} ) {
178 # Filter out all alignment table rows that do not correspond
179 # to a named witness - these are the layered witnesses.
180 my $newtable = { alignment => [], length => $table->{length} };
181 foreach my $row ( @{$table->{alignment}} ) {
182 if( $tradition->has_witness( $row->{witness} ) ) {
183 push( @{$newtable->{alignment}}, $row );
188 my $character_matrix = _make_character_matrix( $table, $opts );
190 my $rows = scalar @{$character_matrix};
191 my $columns = scalar @{$character_matrix->[0]} - 1;
192 $input .= "\t$rows\t$columns\n";
193 foreach my $row ( @{$character_matrix} ) {
194 $input .= join( '', @$row ) . "\n";
199 sub _make_character_matrix {
200 my( $table, $opts ) = @_;
201 # Push the names of the witnesses to initialize the rows of the matrix.
202 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
203 @{$table->{'alignment'}};
204 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
205 my @pos_tokens = map { $_->{'tokens'}->[$token_index] }
206 @{$table->{'alignment'}};
207 my @pos_readings = map { $_ ? $_->{'t'} : $_ } @pos_tokens;
208 my @chars = _convert_characters( \@pos_readings, $opts );
209 foreach my $idx ( 0 .. $#matrix ) {
210 push( @{$matrix[$idx]}, $chars[$idx] );
216 # Helper function to make the witness name something legal for pars
218 sub _normalize_witname {
220 $witname =~ s/\s+/ /g;
221 $witname =~ s/[\[\]\(\)\:;,]//g;
222 $witname = substr( $witname, 0, 10 );
223 return sprintf( "%-10s", $witname );
226 sub _convert_characters {
227 my( $row, $opts ) = @_;
228 # This is a simple algorithm that treats every reading as different.
229 # Eventually we will want to be able to specify how relationships
230 # affect the character matrix.
231 my %unique = ( '__UNDEF__' => 'X',
237 foreach my $rdg ( @$row ) {
239 next if $rdg->is_lacuna;
240 next if exists $unique{$rdg->text};
241 if( ref( $opts->{'collapse'} ) eq 'ARRAY' ) {
242 my @exclude_types = @{$opts->{'collapse'}};
243 my @set = $rdg->related_readings( sub { my $rel = shift;
244 $rel->colocated && grep { $rel->type eq $_ } @exclude_types } );
246 my $char = chr( 65 + $ctr++ );
247 map { $unique{$_->text} = $char } @set;
248 $count{$rdg->text} += scalar @set;
250 $unique{$rdg->text} = chr( 65 + $ctr++ );
251 $count{$rdg->text}++;
254 # Try to keep variants under 8 by lacunizing any singletons.
255 if( scalar( keys %unique ) > 8 ) {
256 foreach my $word ( keys %count ) {
257 if( $count{$word} == 1 ) {
258 $unique{$word} = '?';
262 my %u = reverse %unique;
263 if( scalar( keys %u ) > 8 ) {
264 warn "Have more than 8 variants on this location; phylip will break";
266 my @chars = map { $_ ? $unique{$_->text} : $unique{'__UNDEF__' } } @$row;
270 =head2 phylip_pars( $character_matrix )
272 Runs Phylip Pars on the given character matrix. Returns results in Newick format.
277 my( $charmatrix ) = @_;
278 # Set up a temporary directory for all the default Phylip files.
279 my $phylip_dir = File::Temp->newdir();
280 # $phylip_dir->unlink_on_destroy(0);
281 # We need an infile, and we need a command input file.
282 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
283 print MATRIX $charmatrix;
286 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
287 ## TODO any configuration parameters we want to set here
288 # U Search for best tree? Yes
289 # S Search option? More thorough search
290 # V Number of trees to save? 100
291 # J Randomize input order of species? No. Use input order
292 # O Outgroup root? No, use as outgroup species 1
293 # T Use Threshold parsimony? No, use ordinary parsimony
294 # W Sites weighted? No
295 # M Analyze multiple data sets? No
296 # I Input species interleaved? Yes
297 # 0 Terminal type (IBM PC, ANSI, none)? ANSI
298 # 1 Print out the data at start of run No
299 # 2 Print indications of progress of run Yes
300 # 3 Print out tree Yes
301 # 4 Print out steps in each site No
302 # 5 Print character at all nodes of tree No
303 # 6 Write out trees onto tree file? Yes
307 # And then we run the program.
308 my $program = File::Which::which( 'pars' );
309 unless( $program && -x $program ) {
310 throw( "Phylip pars not found in path" );
314 # We need to run it in our temporary directory where we have created
315 # all the expected files.
316 local $CWD = $phylip_dir;
317 my @cmd = ( $program );
318 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
320 # Now our output should be in 'outfile' and our tree in 'outtree',
321 # both in the temp directory.
324 if( -f "$phylip_dir/outtree" ) {
325 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
329 return join( '', @outtree ) if @outtree;
331 # If we got this far, we are about to throw an error.
333 if( -f "$phylip_dir/outfile" ) {
334 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
338 push( @error, "Neither outtree nor output file was produced!" );
340 throw( join( '', @error ) );
343 =head2 parse_newick( $newick_string )
345 Parses the given Newick tree(s) into one or more Stemma objects with
352 # Parse the result into a set of trees and return them.
353 my $forest = Bio::Phylo::IO->parse(
357 return map { _graph_from_bio( $_ ) } @{$forest->get_entities};
360 sub _graph_from_bio {
362 my $graph = Graph->new( 'undirected' => 1 );
363 # Give all the intermediate anonymous nodes a name.
366 foreach my $n ( @{$tree->get_terminals} ) {
367 # The terminal nodes are our named witnesses.
368 $classes->{$n->get_name} = 'extant';
370 foreach my $n ( @{$tree->get_internals} ) {
371 unless( defined $n->get_name && $n->get_name ne '' ) {
372 # Get an integer, make sure it's a unique name
373 while( exists $classes->{$i} ) {
376 $n->set_name( $i++ );
378 $classes->{$n->get_name} = 'hypothetical';
380 _add_tree_children( $graph, $classes, undef, [ $tree->get_root ]);
384 sub _add_tree_children {
385 my( $graph, $classes, $parent, $tree_children ) = @_;
386 foreach my $c ( @$tree_children ) {
387 my $child = $c->get_name;
388 $graph->add_vertex( $child );
389 $graph->set_vertex_attribute( $child, 'class', $classes->{$child} );
390 $graph->add_path( $parent, $child ) if defined $parent;
391 _add_tree_children( $graph, $classes, $child, $c->get_children() );
395 =head2 newick_to_svg( $newick_string )
397 Uses the FigTree utility (if installed) to transform the given Newick tree(s)
398 into a graph visualization.
404 my $program = File::Which::which( 'figtree' );
405 unless( -x $program ) {
406 throw( "FigTree commandline utility not found in path" );
409 my $nfile = File::Temp->new();
410 print $nfile $newick;
412 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
413 run( \@cmd, ">", binary(), \$svg );
414 return decode_utf8( $svg );
418 Text::Tradition::Error->throw(
419 'ident' => 'StemmaUtil error',
428 This package is free software and is provided "as is" without express
429 or implied warranty. You can redistribute it and/or modify it under
430 the same terms as Perl itself.
434 Tara L Andrews E<lt>aurum@cpan.orgE<gt>