Rework Stemma / StemmaUtil so that utility functions are all in the latter. Fixes #14
[scpubgit/stemmatology.git] / analysis / lib / Text / Tradition / StemmaUtil.pm
CommitLineData
68454b71 1package Text::Tradition::StemmaUtil;
2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw/ @EXPORT_OK /;
b02332ca 7use Bio::Phylo::IO;
27e2a8fe 8use Encode qw( decode_utf8 );
b02332ca 9use File::chdir;
10use File::Temp;
11use File::Which;
12use Graph;
b02332ca 13use IPC::Run qw/ run binary /;
edac47cc 14use Text::Tradition::Error;
5873cf38 15@EXPORT_OK = qw/ display_graph editable_graph
16 character_input phylip_pars parse_newick newick_to_svg /;
68454b71 17
027d819c 18=head1 NAME
19
5873cf38 20Text::Tradition::StemmaUtil - standalone utilities for stemma graph display and
21distance tree calculations
027d819c 22
23=head1 DESCRIPTION
24
5873cf38 25This package contains a set of utilities for displaying arbitrary stemmata and
26running phylogenetic analysis on text collations.
027d819c 27
28=head1 SUBROUTINES
29
5873cf38 30=head2 display_graph( $graph, $opts )
31
32Returns a dot specification intended for display, according to the logical
33attributes of the witnesses.
34
35=cut
36
37sub display_graph {
38 my( $graph, $opts ) = @_;
39
40 # Get default and specified options
41 my %graphopts = (
42 # 'ratio' => 1,
43 'bgcolor' => 'transparent',
44 );
45 my %nodeopts = (
46 'fontsize' => 11,
47 'style' => 'filled',
48 'fillcolor' => 'white',
49 'color' => 'white',
50 'shape' => 'ellipse', # Shape for the extant nodes
51 );
52 my %edgeopts = (
53 'arrowhead' => 'none',
54 );
55 @graphopts{ keys %{$opts->{'graph'}} } = values %{$opts->{'graph'}}
56 if $opts->{'graph'};
57 @nodeopts{ keys %{$opts->{'node'}} } = values %{$opts->{'node'}}
58 if $opts->{'node'};
59 @edgeopts{ keys %{$opts->{'edge'}} } = values %{$opts->{'edge'}}
60 if $opts->{'edge'};
61
62 my $gdecl = $graph->is_directed ? 'digraph' : 'graph';
63 my $gname = $opts->{'name'} ? '"' . $opts->{'name'} . '"'
64 : 'stemma';
65 my @dotlines;
66 push( @dotlines, "$gdecl $gname {" );
67 ## Print out the global attributes
68 push( @dotlines, _make_dotline( 'graph', %graphopts ) ) if keys %graphopts;
69 push( @dotlines, _make_dotline( 'edge', %edgeopts ) ) if keys %edgeopts;
70 push( @dotlines, _make_dotline( 'node', %nodeopts ) ) if keys %nodeopts;
71
72 # Add each of the nodes.
73 foreach my $n ( $graph->vertices ) {
74 my %vattr = ( 'id' => $n ); # Set the SVG element ID to the sigil itself
75 if( $graph->has_vertex_attribute( $n, 'label' ) ) {
76 $vattr{'label'} = $graph->get_vertex_attribute( $n, 'label' );
77 }
78 push( @dotlines, _make_dotline( $n, %vattr ) );
79 }
80 # Add each of our edges.
81 foreach my $e ( $graph->edges ) {
82 my( $from, $to ) = map { _dotquote( $_ ) } @$e;
83 my $connector = $graph->is_directed ? '->' : '--';
84 push( @dotlines, " $from $connector $to;" );
85 }
86 push( @dotlines, '}' );
87
88 return join( "\n", @dotlines );
89}
90
91
92=head2 editable_graph( $graph, $opts )
93
94Returns a dot specification of a stemma graph with logical witness features,
95intended for editing the stemma definition.
96
97=cut
98
99sub editable_graph {
100 my( $graph, $opts ) = @_;
101
102 # Create the graph
103 my $join = ( $opts && exists $opts->{'linesep'} ) ? $opts->{'linesep'} : "\n";
104 my $fq = $opts->{'forcequote'};
105 my $gdecl = $graph->is_undirected ? 'graph' : 'digraph';
106 my $gname = exists $opts->{'name'} ? '"' . $opts->{'name'} . '"'
107 : 'stemma';
108 my @dotlines;
109 push( @dotlines, "$gdecl $gname {" );
110 my @real; # A cheap sort
111 foreach my $n ( sort $graph->vertices ) {
112 my $c = $graph->get_vertex_attribute( $n, 'class' );
113 $c = 'extant' unless $c;
114 if( $c eq 'extant' ) {
115 push( @real, $n );
116 } else {
117 push( @dotlines, _make_dotline( $n, 'class' => $c, 'forcequote' => $fq ) );
118 }
119 }
120 # Now do the real ones
121 foreach my $n ( @real ) {
122 push( @dotlines, _make_dotline( $n, 'class' => 'extant', 'forcequote' => $fq ) );
123 }
124 foreach my $e ( sort _by_vertex $graph->edges ) {
125 my( $from, $to ) = map { _dotquote( $_ ) } @$e;
126 my $conn = $graph->is_undirected ? '--' : '->';
127 push( @dotlines, " $from $conn $to;" );
128 }
129 push( @dotlines, '}' );
130 return join( $join, @dotlines );
131}
132
133sub _make_dotline {
134 my( $obj, %attr ) = @_;
135 my @pairs;
136 my $fq = delete $attr{forcequote};
137 foreach my $k ( keys %attr ) {
138 my $v = _dotquote( $attr{$k}, $fq );
139 push( @pairs, "$k=$v" );
140 }
141 return sprintf( " %s [ %s ];", _dotquote( $obj ), join( ', ', @pairs ) );
142}
143
144sub _dotquote {
145 my( $str, $force ) = @_;
146 return $str if $str =~ /^[A-Za-z0-9]+$/;
147 $str =~ s/\"/\\\"/g;
148 $str = '"' . $str . '"';
149 return $str;
150}
151
152sub _by_vertex {
153 return $a->[0].$a->[1] cmp $b->[0].$b->[1];
154}
155
ea45d2a6 156=head2 character_input( $tradition, $opts )
9457207b 157
158Returns a character matrix string suitable for Phylip programs, which
159corresponds to the given alignment table. See Text::Tradition::Collation
ea45d2a6 160for a description of the alignment table format. Options include:
161
162=over
163
164=item * exclude_layer - Exclude layered witnesses from the character input,
165using only the 'main' text of the witnesses in the tradition.
166
167=item * collapse - A reference to an array of relationship names that should
168be treated as equivalent for the purposes of generating the character matrix.
169
170=back
9457207b 171
027d819c 172=cut
173
9457207b 174sub character_input {
ea45d2a6 175 my ( $tradition, $opts ) = @_;
176 my $table = $tradition->collation->alignment_table;
177 if( $opts->{exclude_layer} ) {
178 # Filter out all alignment table rows that do not correspond
179 # to a named witness - these are the layered witnesses.
b39e7cb5 180 my $newtable = { alignment => [], length => $table->{length} };
181 foreach my $row ( @{$table->{alignment}} ) {
ea45d2a6 182 if( $tradition->has_witness( $row->{witness} ) ) {
183 push( @{$newtable->{alignment}}, $row );
184 }
185 }
186 $table = $newtable;
187 }
188 my $character_matrix = _make_character_matrix( $table, $opts );
9457207b 189 my $input = '';
190 my $rows = scalar @{$character_matrix};
191 my $columns = scalar @{$character_matrix->[0]} - 1;
192 $input .= "\t$rows\t$columns\n";
193 foreach my $row ( @{$character_matrix} ) {
194 $input .= join( '', @$row ) . "\n";
195 }
196 return $input;
197}
198
027d819c 199sub _make_character_matrix {
ea45d2a6 200 my( $table, $opts ) = @_;
68454b71 201 # Push the names of the witnesses to initialize the rows of the matrix.
202 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
203 @{$table->{'alignment'}};
204 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
ea45d2a6 205 my @pos_tokens = map { $_->{'tokens'}->[$token_index] }
68454b71 206 @{$table->{'alignment'}};
ea45d2a6 207 my @pos_readings = map { $_ ? $_->{'t'} : $_ } @pos_tokens;
208 my @chars = _convert_characters( \@pos_readings, $opts );
68454b71 209 foreach my $idx ( 0 .. $#matrix ) {
210 push( @{$matrix[$idx]}, $chars[$idx] );
211 }
212 }
213 return \@matrix;
214}
215
216# Helper function to make the witness name something legal for pars
217
218sub _normalize_witname {
219 my( $witname ) = @_;
220 $witname =~ s/\s+/ /g;
221 $witname =~ s/[\[\]\(\)\:;,]//g;
222 $witname = substr( $witname, 0, 10 );
223 return sprintf( "%-10s", $witname );
224}
225
027d819c 226sub _convert_characters {
ea45d2a6 227 my( $row, $opts ) = @_;
68454b71 228 # This is a simple algorithm that treats every reading as different.
229 # Eventually we will want to be able to specify how relationships
230 # affect the character matrix.
231 my %unique = ( '__UNDEF__' => 'X',
232 '#LACUNA#' => '?',
233 );
ea45d2a6 234 my %equivalent;
68454b71 235 my %count;
236 my $ctr = 0;
ea45d2a6 237 foreach my $rdg ( @$row ) {
238 next unless $rdg;
239 next if $rdg->is_lacuna;
240 next if exists $unique{$rdg->text};
241 if( ref( $opts->{'collapse'} ) eq 'ARRAY' ) {
242 my @exclude_types = @{$opts->{'collapse'}};
243 my @set = $rdg->related_readings( sub { my $rel = shift;
244 $rel->colocated && grep { $rel->type eq $_ } @exclude_types } );
245 push( @set, $rdg );
246 my $char = chr( 65 + $ctr++ );
247 map { $unique{$_->text} = $char } @set;
248 $count{$rdg->text} += scalar @set;
249 } else {
250 $unique{$rdg->text} = chr( 65 + $ctr++ );
251 $count{$rdg->text}++;
252 }
68454b71 253 }
254 # Try to keep variants under 8 by lacunizing any singletons.
255 if( scalar( keys %unique ) > 8 ) {
256 foreach my $word ( keys %count ) {
257 if( $count{$word} == 1 ) {
258 $unique{$word} = '?';
259 }
260 }
261 }
262 my %u = reverse %unique;
263 if( scalar( keys %u ) > 8 ) {
264 warn "Have more than 8 variants on this location; phylip will break";
265 }
ea45d2a6 266 my @chars = map { $_ ? $unique{$_->text} : $unique{'__UNDEF__' } } @$row;
68454b71 267 return @chars;
268}
269
027d819c 270=head2 phylip_pars( $character_matrix )
271
272Runs Phylip Pars on the given character matrix. Returns results in Newick format.
273
274=cut
275
b02332ca 276sub phylip_pars {
277 my( $charmatrix ) = @_;
278 # Set up a temporary directory for all the default Phylip files.
279 my $phylip_dir = File::Temp->newdir();
280 # $phylip_dir->unlink_on_destroy(0);
281 # We need an infile, and we need a command input file.
282 open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
283 print MATRIX $charmatrix;
284 close MATRIX;
285
286 open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
287 ## TODO any configuration parameters we want to set here
288# U Search for best tree? Yes
289# S Search option? More thorough search
290# V Number of trees to save? 100
291# J Randomize input order of species? No. Use input order
292# O Outgroup root? No, use as outgroup species 1
293# T Use Threshold parsimony? No, use ordinary parsimony
294# W Sites weighted? No
295# M Analyze multiple data sets? No
296# I Input species interleaved? Yes
297# 0 Terminal type (IBM PC, ANSI, none)? ANSI
298# 1 Print out the data at start of run No
299# 2 Print indications of progress of run Yes
300# 3 Print out tree Yes
301# 4 Print out steps in each site No
302# 5 Print character at all nodes of tree No
303# 6 Write out trees onto tree file? Yes
304 print CMD "Y\n";
305 close CMD;
306
307 # And then we run the program.
308 my $program = File::Which::which( 'pars' );
b39e7cb5 309 unless( $program && -x $program ) {
edac47cc 310 throw( "Phylip pars not found in path" );
b02332ca 311 }
312
313 {
314 # We need to run it in our temporary directory where we have created
315 # all the expected files.
316 local $CWD = $phylip_dir;
317 my @cmd = ( $program );
318 run \@cmd, '<', 'cmdfile', '>', '/dev/null';
319 }
320 # Now our output should be in 'outfile' and our tree in 'outtree',
321 # both in the temp directory.
322
323 my @outtree;
324 if( -f "$phylip_dir/outtree" ) {
325 open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
326 @outtree = <TREE>;
327 close TREE;
328 }
edac47cc 329 return join( '', @outtree ) if @outtree;
b02332ca 330
69403daa 331 # If we got this far, we are about to throw an error.
b02332ca 332 my @error;
333 if( -f "$phylip_dir/outfile" ) {
334 open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
335 @error = <OUTPUT>;
336 close OUTPUT;
337 } else {
338 push( @error, "Neither outtree nor output file was produced!" );
339 }
edac47cc 340 throw( join( '', @error ) );
b02332ca 341}
342
027d819c 343=head2 parse_newick( $newick_string )
344
ea45d2a6 345Parses the given Newick tree(s) into one or more Stemma objects with
346undirected graphs.
027d819c 347
348=cut
349
b02332ca 350sub parse_newick {
351 my $newick = shift;
5873cf38 352 # Parse the result into a set of trees and return them.
b02332ca 353 my $forest = Bio::Phylo::IO->parse(
354 -format => 'newick',
355 -string => $newick,
356 );
5873cf38 357 return map { _graph_from_bio( $_ ) } @{$forest->get_entities};
ea45d2a6 358}
359
360sub _graph_from_bio {
361 my $tree = shift;
362 my $graph = Graph->new( 'undirected' => 1 );
363 # Give all the intermediate anonymous nodes a name.
364 my $i = 0;
365 my $classes = {};
366 foreach my $n ( @{$tree->get_terminals} ) {
367 # The terminal nodes are our named witnesses.
368 $classes->{$n->get_name} = 'extant';
369 }
370 foreach my $n ( @{$tree->get_internals} ) {
371 unless( defined $n->get_name && $n->get_name ne '' ) {
372 # Get an integer, make sure it's a unique name
373 while( exists $classes->{$i} ) {
374 $i++;
375 }
376 $n->set_name( $i++ );
377 }
378 $classes->{$n->get_name} = 'hypothetical';
379 }
380 _add_tree_children( $graph, $classes, undef, [ $tree->get_root ]);
381 return $graph;
382}
383
384sub _add_tree_children {
385 my( $graph, $classes, $parent, $tree_children ) = @_;
386 foreach my $c ( @$tree_children ) {
387 my $child = $c->get_name;
388 $graph->add_vertex( $child );
389 $graph->set_vertex_attribute( $child, 'class', $classes->{$child} );
390 $graph->add_path( $parent, $child ) if defined $parent;
391 _add_tree_children( $graph, $classes, $child, $c->get_children() );
b02332ca 392 }
b02332ca 393}
394
027d819c 395=head2 newick_to_svg( $newick_string )
396
397Uses the FigTree utility (if installed) to transform the given Newick tree(s)
398into a graph visualization.
399
400=cut
401
27e2a8fe 402sub newick_to_svg {
403 my $newick = shift;
404 my $program = File::Which::which( 'figtree' );
405 unless( -x $program ) {
edac47cc 406 throw( "FigTree commandline utility not found in path" );
27e2a8fe 407 }
408 my $svg;
409 my $nfile = File::Temp->new();
410 print $nfile $newick;
411 close $nfile;
412 my @cmd = ( $program, '-graphic', 'SVG', $nfile );
413 run( \@cmd, ">", binary(), \$svg );
414 return decode_utf8( $svg );
415}
416
edac47cc 417sub throw {
418 Text::Tradition::Error->throw(
419 'ident' => 'StemmaUtil error',
420 'message' => $_[0],
421 );
422}
423
027d819c 4241;
425
426=head1 LICENSE
427
428This package is free software and is provided "as is" without express
429or implied warranty. You can redistribute it and/or modify it under
430the same terms as Perl itself.
431
432=head1 AUTHOR
433
434Tara L Andrews E<lt>aurum@cpan.orgE<gt>