Collect all hacks for Graph::Reader::Dot into a single utility. Fixes #15
[scpubgit/stemmatology.git] / analysis / lib / Text / Tradition / StemmaUtil.pm
CommitLineData
68454b71 1package Text::Tradition::StemmaUtil;
2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw/ @EXPORT_OK /;
b02332ca 7use Bio::Phylo::IO;
27e2a8fe 8use Encode qw( decode_utf8 );
b02332ca 9use File::chdir;
10use File::Temp;
11use File::Which;
12use Graph;
11954259 13use Graph::Reader::Dot;
b02332ca 14use IPC::Run qw/ run binary /;
edac47cc 15use Text::Tradition::Error;
11954259 16@EXPORT_OK = qw/ read_graph display_graph editable_graph
5873cf38 17 character_input phylip_pars parse_newick newick_to_svg /;
68454b71 18
027d819c 19=head1 NAME
20
5873cf38 21Text::Tradition::StemmaUtil - standalone utilities for stemma graph display and
22distance tree calculations
027d819c 23
24=head1 DESCRIPTION
25
5873cf38 26This package contains a set of utilities for displaying arbitrary stemmata and
27running phylogenetic analysis on text collations.
027d819c 28
29=head1 SUBROUTINES
30
11954259 31=head2 read_graph( $dotstr) {
32
33Parses the graph specification on the filehandle in $dotstr and returns a Graph
34object. This subroutine works around some deficiencies in Graph::Reader::Dot.
35
36=cut
37
38sub read_graph {
39 my $dotstr = shift;
40 # Graph::Reader::Dot does not handle bare non-ASCII Unicode word characters.
41 # We get around this by wrapping all words in double quotes, as long as they
42 # aren't already wrapped, and as long as they aren't the initial '(di)?graph'.
43 # Also need to deal correctly with the graph title.
44 if( $dotstr =~ /^\s*((di)?graph)\s+(.*?)\s*\{(.*)$/s ) {
45 my( $decl, $ident, $rest ) = ( $1, $3, $4 );
46 unless( substr( $ident, 0, 1 ) eq '"' ) {
47 $ident = '"'.$ident.'"';
48 }
49 $rest =~ s/(?<!")\b(\w+)\b(?!")/"$1"/g;
50 $dotstr = "$decl $ident { $rest";
51 }
52
53 # Now open a filehandle onto the string and pass it to Graph::Reader::Dot.
54 my $dotfh;
55 open $dotfh, '<', \$dotstr;
56 binmode $dotfh, ':utf8';
57 my $reader = Graph::Reader::Dot->new();
58 # Redirect STDOUT in order to trap any error messages - syntax errors
59 # are evidently not fatal.
60 my $graph;
61 my $reader_out;
62 my $reader_err;
63 {
64 local(*STDOUT);
65 open( STDOUT, ">", \$reader_out );
66 local(*STDERR);
67 open( STDERR, ">", \$reader_err );
68 $graph = $reader->read_graph( $dotfh );
69 close STDOUT;
70 close STDERR;
71 }
72 if( $reader_out && $reader_out =~ /error/s ) {
73 throw( "Error trying to parse dot: $reader_out" );
74 } elsif( !$graph ) {
75 throw( "Failed to create graph from dot" );
76 }
77 # Wrench the graph identifier out of the graph
78 ## HORRIBLE HACK but there is no API access to the graph identifier!
79 $graph->set_graph_attribute( 'name', $graph->[4]->{'name'} );
80
81 # Correct for implicit graph -> digraph quirk of reader
82 if( $reader_err && $reader_err =~ /graph will be treated as digraph/ ) {
83 my $udgraph = $graph->undirected_copy;
84 $udgraph->set_graph_attributes( $graph->get_graph_attributes );
85 foreach my $v ( $graph->vertices ) {
86 $udgraph->set_vertex_attributes( $v, $graph->get_vertex_attributes( $v ) );
87 }
88 $graph = $udgraph;
89 }
90
91 return $graph;
92}
93
5873cf38 94=head2 display_graph( $graph, $opts )
95
96Returns a dot specification intended for display, according to the logical
97attributes of the witnesses.
98
99=cut
100
101sub display_graph {
102 my( $graph, $opts ) = @_;
103
104 # Get default and specified options
105 my %graphopts = (
106 # 'ratio' => 1,
107 'bgcolor' => 'transparent',
108 );
109 my %nodeopts = (
110 'fontsize' => 11,
111 'style' => 'filled',
112 'fillcolor' => 'white',
113 'color' => 'white',
114 'shape' => 'ellipse', # Shape for the extant nodes
115 );
116 my %edgeopts = (
117 'arrowhead' => 'none',
118 );
119 @graphopts{ keys %{$opts->{'graph'}} } = values %{$opts->{'graph'}}
120 if $opts->{'graph'};
121 @nodeopts{ keys %{$opts->{'node'}} } = values %{$opts->{'node'}}
122 if $opts->{'node'};
123 @edgeopts{ keys %{$opts->{'edge'}} } = values %{$opts->{'edge'}}
124 if $opts->{'edge'};
125
126 my $gdecl = $graph->is_directed ? 'digraph' : 'graph';
127 my $gname = $opts->{'name'} ? '"' . $opts->{'name'} . '"'
128 : 'stemma';
129 my @dotlines;
130 push( @dotlines, "$gdecl $gname {" );
131 ## Print out the global attributes
132 push( @dotlines, _make_dotline( 'graph', %graphopts ) ) if keys %graphopts;
133 push( @dotlines, _make_dotline( 'edge', %edgeopts ) ) if keys %edgeopts;
134 push( @dotlines, _make_dotline( 'node', %nodeopts ) ) if keys %nodeopts;
135
136 # Add each of the nodes.
137 foreach my $n ( $graph->vertices ) {
138 my %vattr = ( 'id' => $n ); # Set the SVG element ID to the sigil itself
139 if( $graph->has_vertex_attribute( $n, 'label' ) ) {
140 $vattr{'label'} = $graph->get_vertex_attribute( $n, 'label' );
141 }
142 push( @dotlines, _make_dotline( $n, %vattr ) );
143 }
144 # Add each of our edges.
145 foreach my $e ( $graph->edges ) {
146 my( $from, $to ) = map { _dotquote( $_ ) } @$e;
147 my $connector = $graph->is_directed ? '->' : '--';
148 push( @dotlines, " $from $connector $to;" );
149 }
150 push( @dotlines, '}' );
151
152 return join( "\n", @dotlines );
153}
154
155
156=head2 editable_graph( $graph, $opts )
157
158Returns a dot specification of a stemma graph with logical witness features,
159intended for editing the stemma definition.
160
161=cut
162
163sub editable_graph {
164 my( $graph, $opts ) = @_;
165
166 # Create the graph
167 my $join = ( $opts && exists $opts->{'linesep'} ) ? $opts->{'linesep'} : "\n";
168 my $fq = $opts->{'forcequote'};
169 my $gdecl = $graph->is_undirected ? 'graph' : 'digraph';
170 my $gname = exists $opts->{'name'} ? '"' . $opts->{'name'} . '"'
171 : 'stemma';
172 my @dotlines;
173 push( @dotlines, "$gdecl $gname {" );
174 my @real; # A cheap sort
175 foreach my $n ( sort $graph->vertices ) {
176 my $c = $graph->get_vertex_attribute( $n, 'class' );
177 $c = 'extant' unless $c;
178 if( $c eq 'extant' ) {
179 push( @real, $n );
180 } else {
181 push( @dotlines, _make_dotline( $n, 'class' => $c, 'forcequote' => $fq ) );
182 }
183 }
184 # Now do the real ones
185 foreach my $n ( @real ) {
186 push( @dotlines, _make_dotline( $n, 'class' => 'extant', 'forcequote' => $fq ) );
187 }
188 foreach my $e ( sort _by_vertex $graph->edges ) {
189 my( $from, $to ) = map { _dotquote( $_ ) } @$e;
190 my $conn = $graph->is_undirected ? '--' : '->';
191 push( @dotlines, " $from $conn $to;" );
192 }
193 push( @dotlines, '}' );
194 return join( $join, @dotlines );
195}
196
197sub _make_dotline {
198 my( $obj, %attr ) = @_;
199 my @pairs;
200 my $fq = delete $attr{forcequote};
201 foreach my $k ( keys %attr ) {
202 my $v = _dotquote( $attr{$k}, $fq );
203 push( @pairs, "$k=$v" );
204 }
205 return sprintf( " %s [ %s ];", _dotquote( $obj ), join( ', ', @pairs ) );
206}
207
208sub _dotquote {
209 my( $str, $force ) = @_;
210 return $str if $str =~ /^[A-Za-z0-9]+$/;
211 $str =~ s/\"/\\\"/g;
212 $str = '"' . $str . '"';
213 return $str;
214}
215
216sub _by_vertex {
217 return $a->[0].$a->[1] cmp $b->[0].$b->[1];
218}
219
ea45d2a6 220=head2 character_input( $tradition, $opts )
9457207b 221
222Returns a character matrix string suitable for Phylip programs, which
223corresponds to the given alignment table. See Text::Tradition::Collation
ea45d2a6 224for a description of the alignment table format. Options include:
225
226=over
227
228=item * exclude_layer - Exclude layered witnesses from the character input,
229using only the 'main' text of the witnesses in the tradition.
230
231=item * collapse - A reference to an array of relationship names that should
232be treated as equivalent for the purposes of generating the character matrix.
233
234=back
9457207b 235
027d819c 236=cut
237
9457207b 238sub character_input {
ea45d2a6 239 my ( $tradition, $opts ) = @_;
240 my $table = $tradition->collation->alignment_table;
241 if( $opts->{exclude_layer} ) {
242 # Filter out all alignment table rows that do not correspond
243 # to a named witness - these are the layered witnesses.
b39e7cb5 244 my $newtable = { alignment => [], length => $table->{length} };
245 foreach my $row ( @{$table->{alignment}} ) {
ea45d2a6 246 if( $tradition->has_witness( $row->{witness} ) ) {
247 push( @{$newtable->{alignment}}, $row );
248 }
249 }
250 $table = $newtable;
251 }
252 my $character_matrix = _make_character_matrix( $table, $opts );
9457207b 253 my $input = '';
254 my $rows = scalar @{$character_matrix};
255 my $columns = scalar @{$character_matrix->[0]} - 1;
256 $input .= "\t$rows\t$columns\n";
257 foreach my $row ( @{$character_matrix} ) {
258 $input .= join( '', @$row ) . "\n";
259 }
260 return $input;
261}
262
027d819c 263sub _make_character_matrix {
ea45d2a6 264 my( $table, $opts ) = @_;
68454b71 265 # Push the names of the witnesses to initialize the rows of the matrix.
266 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
267 @{$table->{'alignment'}};
268 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
ea45d2a6 269 my @pos_tokens = map { $_->{'tokens'}->[$token_index] }
68454b71 270 @{$table->{'alignment'}};
ea45d2a6 271 my @pos_readings = map { $_ ? $_->{'t'} : $_ } @pos_tokens;
272 my @chars = _convert_characters( \@pos_readings, $opts );
68454b71 273 foreach my $idx ( 0 .. $#matrix ) {
274 push( @{$matrix[$idx]}, $chars[$idx] );
275 }
276 }
277 return \@matrix;
278}
279
280# Helper function to make the witness name something legal for pars
281
282sub _normalize_witname {
283 my( $witname ) = @_;
284 $witname =~ s/\s+/ /g;
285 $witname =~ s/[\[\]\(\)\:;,]//g;
286 $witname = substr( $witname, 0, 10 );
287 return sprintf( "%-10s", $witname );
288}
289
027d819c 290sub _convert_characters {
ea45d2a6 291 my( $row, $opts ) = @_;
68454b71 292 # This is a simple algorithm that treats every reading as different.
293 # Eventually we will want to be able to specify how relationships
294 # affect the character matrix.
295 my %unique = ( '__UNDEF__' => 'X',
296 '#LACUNA#' => '?',
297 );
ea45d2a6 298 my %equivalent;
68454b71 299 my %count;
300 my $ctr = 0;
ea45d2a6 301 foreach my $rdg ( @$row ) {
302 next unless $rdg;
303 next if $rdg->is_lacuna;
304 next if exists $unique{$rdg->text};
305 if( ref( $opts->{'collapse'} ) eq 'ARRAY' ) {
306 my @exclude_types = @{$opts->{'collapse'}};
307 my @set = $rdg->related_readings( sub { my $rel = shift;
308 $rel->colocated && grep { $rel->type eq $_ } @exclude_types } );
309 push( @set, $rdg );
310 my $char = chr( 65 + $ctr++ );
311 map { $unique{$_->text} = $char } @set;
312 $count{$rdg->text} += scalar @set;
313 } else {
314 $unique{$rdg->text} = chr( 65 + $ctr++ );
315 $count{$rdg->text}++;
316 }
68454b71 317 }
318 # Try to keep variants under 8 by lacunizing any singletons.
319 if( scalar( keys %unique ) > 8 ) {
320 foreach my $word ( keys %count ) {
321 if( $count{$word} == 1 ) {
322 $unique{$word} = '?';
323 }
324 }
325 }
326 my %u = reverse %unique;
327 if( scalar( keys %u ) > 8 ) {
328 warn "Have more than 8 variants on this location; phylip will break";
329 }
ea45d2a6 330 my @chars = map { $_ ? $unique{$_->text} : $unique{'__UNDEF__' } } @$row;
68454b71 331 return @chars;
332}
333
027d819c 334=head2 phylip_pars( $character_matrix )
335
336Runs Phylip Pars on the given character matrix. Returns results in Newick format.
337
338=cut
339
b02332ca 340sub phylip_pars {
341 my( $charmatrix ) = @_;
342 # Set up a temporary directory for all the default Phylip files.
343 my $phylip_dir = File::Temp->newdir();
344 # $phylip_dir->unlink_on_destroy(0);
345 # We need an infile, and we need a command input file.
346 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
347 print MATRIX $charmatrix;
348 close MATRIX;
349
350 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
351 ## TODO any configuration parameters we want to set here
352# U Search for best tree? Yes
353# S Search option? More thorough search
354# V Number of trees to save? 100
355# J Randomize input order of species? No. Use input order
356# O Outgroup root? No, use as outgroup species 1
357# T Use Threshold parsimony? No, use ordinary parsimony
358# W Sites weighted? No
359# M Analyze multiple data sets? No
360# I Input species interleaved? Yes
361# 0 Terminal type (IBM PC, ANSI, none)? ANSI
362# 1 Print out the data at start of run No
363# 2 Print indications of progress of run Yes
364# 3 Print out tree Yes
365# 4 Print out steps in each site No
366# 5 Print character at all nodes of tree No
367# 6 Write out trees onto tree file? Yes
368 print CMD "Y\n";
369 close CMD;
370
371 # And then we run the program.
372 my $program = File::Which::which( 'pars' );
b39e7cb5 373 unless( $program && -x $program ) {
edac47cc 374 throw( "Phylip pars not found in path" );
b02332ca 375 }
376
377 {
378 # We need to run it in our temporary directory where we have created
379 # all the expected files.
380 local $CWD = $phylip_dir;
381 my @cmd = ( $program );
382 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
383 }
384 # Now our output should be in 'outfile' and our tree in 'outtree',
385 # both in the temp directory.
386
387 my @outtree;
388 if( -f "$phylip_dir/outtree" ) {
389 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
390 @outtree = <TREE>;
391 close TREE;
392 }
edac47cc 393 return join( '', @outtree ) if @outtree;
b02332ca 394
69403daa 395 # If we got this far, we are about to throw an error.
b02332ca 396 my @error;
397 if( -f "$phylip_dir/outfile" ) {
398 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
399 @error = <OUTPUT>;
400 close OUTPUT;
401 } else {
402 push( @error, "Neither outtree nor output file was produced!" );
403 }
edac47cc 404 throw( join( '', @error ) );
b02332ca 405}
406
027d819c 407=head2 parse_newick( $newick_string )
408
ea45d2a6 409Parses the given Newick tree(s) into one or more Stemma objects with
410undirected graphs.
027d819c 411
412=cut
413
b02332ca 414sub parse_newick {
415 my $newick = shift;
5873cf38 416 # Parse the result into a set of trees and return them.
b02332ca 417 my $forest = Bio::Phylo::IO->parse(
418 -format => 'newick',
419 -string => $newick,
420 );
5873cf38 421 return map { _graph_from_bio( $_ ) } @{$forest->get_entities};
ea45d2a6 422}
423
424sub _graph_from_bio {
425 my $tree = shift;
426 my $graph = Graph->new( 'undirected' => 1 );
427 # Give all the intermediate anonymous nodes a name.
428 my $i = 0;
429 my $classes = {};
430 foreach my $n ( @{$tree->get_terminals} ) {
431 # The terminal nodes are our named witnesses.
432 $classes->{$n->get_name} = 'extant';
433 }
434 foreach my $n ( @{$tree->get_internals} ) {
435 unless( defined $n->get_name && $n->get_name ne '' ) {
436 # Get an integer, make sure it's a unique name
437 while( exists $classes->{$i} ) {
438 $i++;
439 }
440 $n->set_name( $i++ );
441 }
442 $classes->{$n->get_name} = 'hypothetical';
443 }
444 _add_tree_children( $graph, $classes, undef, [ $tree->get_root ]);
445 return $graph;
446}
447
448sub _add_tree_children {
449 my( $graph, $classes, $parent, $tree_children ) = @_;
450 foreach my $c ( @$tree_children ) {
451 my $child = $c->get_name;
452 $graph->add_vertex( $child );
453 $graph->set_vertex_attribute( $child, 'class', $classes->{$child} );
454 $graph->add_path( $parent, $child ) if defined $parent;
455 _add_tree_children( $graph, $classes, $child, $c->get_children() );
b02332ca 456 }
b02332ca 457}
458
027d819c 459=head2 newick_to_svg( $newick_string )
460
461Uses the FigTree utility (if installed) to transform the given Newick tree(s)
462into a graph visualization.
463
464=cut
465
27e2a8fe 466sub newick_to_svg {
467 my $newick = shift;
468 my $program = File::Which::which( 'figtree' );
469 unless( -x $program ) {
edac47cc 470 throw( "FigTree commandline utility not found in path" );
27e2a8fe 471 }
472 my $svg;
473 my $nfile = File::Temp->new();
474 print $nfile $newick;
475 close $nfile;
476 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
477 run( \@cmd, ">", binary(), \$svg );
478 return decode_utf8( $svg );
479}
480
edac47cc 481sub throw {
482 Text::Tradition::Error->throw(
483 'ident' => 'StemmaUtil error',
484 'message' => $_[0],
485 );
486}
487
027d819c 4881;
489
490=head1 LICENSE
491
492This package is free software and is provided "as is" without express
493or implied warranty. You can redistribute it and/or modify it under
494the same terms as Perl itself.
495
496=head1 AUTHOR
497
498Tara L Andrews E<lt>aurum@cpan.orgE<gt>