fix last bugs, make stexaminer work under new regime
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
88a6bac5 8use JSON qw/ encode_json decode_json /;
9use LWP::UserAgent;
d71100ed 10use Text::Tradition;
11use Text::Tradition::Stemma;
12
d1348d38 13use vars qw/ @EXPORT_OK /;
a2cf85dd 14@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 15
7f52eac8 16=head1 NAME
17
18Text::Tradition::Analysis - functions for stemma analysis of a tradition
19
20=head1 SYNOPSIS
21
22 use Text::Tradition;
23 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
24 my $t = Text::Tradition->new(
25 'name' => 'this is a text',
26 'input' => 'TEI',
27 'file' => '/path/to/tei_parallel_seg_file.xml' );
28 $t->add_stemma( 'dotfile' => $stemmafile );
29
30 my $variant_data = run_analysis( $tradition );
31 # Recalculate rank $n treating all orthographic variants as equivalent
32 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
33
34=head1 DESCRIPTION
35
36Text::Tradition is a library for representation and analysis of collated
37texts, particularly medieval ones. The Collation is the central feature of
38a Tradition, where the text, its sequence of readings, and its relationships
39between readings are actually kept.
40
41=head1 SUBROUTINES
42
88a6bac5 43=head2 run_analysis( $tradition, %opts )
7f52eac8 44
88a6bac5 45Runs the analysis described in analyze_variant_location on every location in the
46collation of the given tradition, with the given options. These include:
7f52eac8 47
88a6bac5 48=over 4
49
50=item * stemma_id - Specify which of the tradition's stemmata to use. Default
51is 0 (i.e. the first).
52
53=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
54
55=item * merge_types - Specify a list of relationship types, where related readings
56should be treated as identical for the purposes of analysis.
57
58=back
7f52eac8 59
60=begin testing
61
62use Text::Tradition;
63use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
64
65my $datafile = 't/data/florilegium_tei_ps.xml';
66my $tradition = Text::Tradition->new( 'input' => 'TEI',
67 'name' => 'test0',
68 'file' => $datafile );
69my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
70is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
71
f00cefe8 72my %expected_genealogical = (
a44aaf2a 73 1 => 0,
f00cefe8 74 2 => 1,
a44aaf2a 75 3 => 0,
76 5 => 0,
77 7 => 0,
78 8 => 0,
79 10 => 0,
f00cefe8 80 13 => 1,
a44aaf2a 81 33 => 0,
82 34 => 0,
83 37 => 0,
84 60 => 0,
f00cefe8 85 81 => 1,
a44aaf2a 86 84 => 0,
87 87 => 0,
88 101 => 0,
89 102 => 0,
f00cefe8 90 122 => 1,
a44aaf2a 91 157 => 0,
f00cefe8 92 166 => 1,
93 169 => 1,
a44aaf2a 94 200 => 0,
f00cefe8 95 216 => 1,
96 217 => 1,
97 219 => 1,
98 241 => 1,
99 242 => 1,
100 243 => 1,
101);
102
7f52eac8 103my $data = run_analysis( $tradition );
f00cefe8 104foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 105 # Account for rows that used to be "not useful"
106 unless( exists $expected_genealogical{$row->{'id'}} ) {
107 $expected_genealogical{$row->{'id'}} = 1;
108 }
f00cefe8 109 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
110 "Got correct genealogical flag for row " . $row->{'id'} );
111}
a44aaf2a 112is( $data->{'conflict_count'}, 34, "Got right conflict count" );
113is( $data->{'variant_count'}, 58, "Got right total variant number" );
7f52eac8 114
115=end testing
116
117=cut
118
d71100ed 119sub run_analysis {
88a6bac5 120 my( $tradition, %opts ) = @_;
f00cefe8 121 my $c = $tradition->collation;
88a6bac5 122
123 my $stemma_id = $opts{'stemma_id'} || 0;
124 my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
125 my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
126
127 # Get the stemma
128 my $stemma = $tradition->stemma( $stemma_id );
129 # Figure out which witnesses we are working with
130 my @lacunose = $stemma->hypotheticals;
fae07016 131 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
132 map { push( @tradition_wits, $_->sigil."_ac" ) if $_->is_layered }
133 $tradition->witnesses;
134 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 135
136 # Find and mark 'common' ranks for exclusion, unless they were
137 # explicitly specified.
138 unless( @ranks ) {
139 my %common_rank;
a44aaf2a 140 foreach my $rdg ( $c->common_readings ) {
88a6bac5 141 $common_rank{$rdg->rank} = 1;
142 }
143 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 144 }
7f52eac8 145
88a6bac5 146 # Group the variants to send to the solver
147 my @groups;
a44aaf2a 148 my %lacunae;
88a6bac5 149 foreach my $rank ( @ranks ) {
a44aaf2a 150 my $missing = [ @lacunose ];
151 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
152 $lacunae{$rank} = $missing;
d71100ed 153 }
154
88a6bac5 155 # Parse the answer
156 my $answer = solve_variants( $stemma->editable( ' ' ), @groups );
fae07016 157
88a6bac5 158 # Do further analysis on the answer
a44aaf2a 159 my $conflict_count = 0;
88a6bac5 160 foreach my $idx ( 0 .. $#ranks ) {
161 my $location = $answer->{'variants'}->[$idx];
162 # Add the rank back in
163 $location->{'id'} = $ranks[$idx];
a44aaf2a 164 # Add the lacunae back in
165 $location->{'missing'} = $lacunae{$ranks[$idx]};
88a6bac5 166 # Run the extra analysis we need.
88a6bac5 167 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 168 # Add the reading text back in
169 foreach my $rdghash ( @{$location->{'readings'}} ) {
170 $conflict_count++
171 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
172 my $rdg = $c->reading( $rdghash->{'readingid'} );
173 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
174 }
88a6bac5 175 }
a44aaf2a 176 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 177
88a6bac5 178 return $answer;
d71100ed 179}
180
7f52eac8 181=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
182
183Groups the variants at the given $rank of the collation, treating any
184relationships in @merge_relationship_types as equivalent. $lacunose should
185be a reference to an array, to which the sigla of lacunose witnesses at this
186rank will be appended.
187
188Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
189by the witnesses listed in $groups->[$n].
190
191=cut
192
193# Return group_readings, groups, lacunose
d1348d38 194sub group_variants {
7f52eac8 195 my( $tradition, $rank, $lacunose, $collapse ) = @_;
196 my $c = $tradition->collation;
fae07016 197 my $aclabel = $c->ac_label;
7f52eac8 198 # Get the alignment table readings
199 my %readings_at_rank;
200 my @gap_wits;
201 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
202 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 203 my $wit = $tablewit->{'witness'};
204 $wit =~ s/^(.*)\Q$aclabel\E$/${1}_ac/;
7f52eac8 205 if( $rdg && $rdg->{'t'}->is_lacuna ) {
fae07016 206 _add_to_witlist( $wit, $lacunose, '_ac' );
7f52eac8 207 } elsif( $rdg ) {
208 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
209 } else {
fae07016 210 _add_to_witlist( $wit, \@gap_wits, '_ac' );
7f52eac8 211 }
212 }
d1348d38 213
7f52eac8 214 # Group the readings, collapsing groups by relationship if needed
215 my %grouped_readings;
216 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
217 # Skip readings that have been collapsed into others.
f00cefe8 218 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 219 my @wits = $rdg->witnesses;
fae07016 220 map { s/\Q$aclabel\E$/_ac/ } @wits;
7f52eac8 221 if( $collapse ) {
222 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
223 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 224 my @otherwits = $other->witnesses;
225 map { s/\Q$aclabel\E$/_ac/ } @otherwits;
226 push( @wits, @otherwits );
f00cefe8 227 $grouped_readings{$other->id} = 0;
d1348d38 228 }
229 }
f00cefe8 230 $grouped_readings{$rdg->id} = \@wits;
7f52eac8 231 }
232 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
233 # Get rid of our collapsed readings
234 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
235 keys %grouped_readings
236 if $collapse;
237
5be0cdeb 238 return \%grouped_readings;
d1348d38 239}
240
88a6bac5 241=head2 solve_variants( $graph, @groups )
242
243Sends the set of groups to the external graph solver service and returns
244a cleaned-up answer, adding the rank IDs back where they belong.
245
246The JSON has the form
247 { "graph": [ stemmagraph DOT string without newlines ],
248 "groupings": [ array of arrays of groups, one per rank ] }
249
250The answer has the form
251 { "variants" => [ array of variant location structures ],
252 "variant_count" => total,
253 "conflict_count" => number of conflicts detected,
254 "genealogical_count" => number of solutions found }
255
256=cut
257
258sub solve_variants {
259 my( $graph, @groups ) = @_;
260
261 # Make the json with stemma + groups
262 my $jsonstruct = { 'graph' => $graph, 'groupings' => [] };
263 foreach my $ghash ( @groups ) {
264 my @grouping;
265 foreach my $k ( sort keys %$ghash ) {
266 push( @grouping, $ghash->{$k} );
267 }
268 push( @{$jsonstruct->{'groupings'}}, \@grouping );
269 }
270 my $json = encode_json( $jsonstruct );
271
272 # Send it off and get the result
273 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
274 my $ua = LWP::UserAgent->new();
275 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
276 'Content' => $json );
277
278 my $answer;
279 if( $resp->is_success ) {
280 $answer = decode_json( $resp->content );
281 } else {
fae07016 282 # Fall back to the old method.
283 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
284 . "; falling back to perl method";
285 $answer = perl_solver( $graph, @groups );
88a6bac5 286 }
287
288 # Fold the result back into what we know about the groups.
289 my $variants = [];
290 my $genealogical = 0;
291 foreach my $idx ( 0 .. $#groups ) {
292 my( $calc_groups, $result ) = @{$answer->[$idx]};
293 $genealogical++ if $result;
294 my $input_group = $groups[$idx];
295 foreach my $k ( sort keys %$input_group ) {
296 my $cg = shift @$calc_groups;
297 $input_group->{$k} = $cg;
298 }
299 my $vstruct = {
300 'genealogical' => $result,
301 'readings' => [],
fae07016 302 };
88a6bac5 303 foreach my $k ( keys %$input_group ) {
304 push( @{$vstruct->{'readings'}},
fae07016 305 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 306 }
307 push( @$variants, $vstruct );
308 }
309
310 return { 'variants' => $variants,
311 'variant_count' => scalar @$variants,
312 'genealogical_count' => $genealogical };
313}
314
fae07016 315=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 316
fae07016 317Given the tradition, its stemma graph, and the solution from the graph solver,
318work out the rest of the information we want. For each reading we need missing,
319conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 320
321=cut
732152b1 322
fae07016 323sub analyze_location {
324 my ( $tradition, $graph, $variant_row ) = @_;
325
326 # Make a hash of all known node memberships, and make the subgraphs.
327 my $contig = {};
328 my $reading_roots = {};
329 my $subgraph = {};
330 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
331 my $rid = $rdghash->{'readingid'};
332 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 333
fae07016 334 # Make the subgraph.
335 my $part = $graph->copy;
336 my %these_vertices;
337 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
338 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
339 $subgraph->{$rid} = $part;
340 # Get the reading roots.
341 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 342 }
343
fae07016 344 # Now that we have all the node group memberships, calculate followed/
bebec0e9 345 # non-followed/unknown values for each reading. Also figure out the
346 # reading's evident parent(s).
347 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 348 # Group string key - TODO do we need this?
349 my $gst = wit_stringify( $rdghash->{'group'} );
350 my $rid = $rdghash->{'readingid'};
351 # Get the subgraph
352 my $part = $subgraph->{$rid};
353
354 # Start figuring things out.
bebec0e9 355 my @roots = $part->predecessorless_vertices;
356 $rdghash->{'independent_occurrence'} = scalar @roots;
357 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
358 # Find the parent readings, if any, of this reading.
f00cefe8 359 my %rdgparents;
bebec0e9 360 foreach my $wit ( @roots ) {
f00cefe8 361 # Look in the main stemma to find this witness's extant or known-reading
362 # immediate ancestor(s), and look up the reading that each ancestor olds.
363 my @check = $graph->predecessors( $wit );
364 while( @check ) {
365 my @next;
366 foreach my $wparent( @check ) {
fae07016 367 my $preading = $contig->{$wparent};
368 if( $preading ) {
369 $rdgparents{$preading} = 1;
f00cefe8 370 } else {
371 push( @next, $graph->predecessors( $wparent ) );
372 }
373 }
374 @check = @next;
375 }
bebec0e9 376 }
f00cefe8 377 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 378
379 # Find the number of times this reading was altered, and the number of
380 # times we're not sure.
381 my( %nofollow, %unknownfollow );
382 foreach my $wit ( $part->vertices ) {
383 foreach my $wchild ( $graph->successors( $wit ) ) {
384 next if $part->has_vertex( $wchild );
fae07016 385 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 386 # It definitely changed here.
387 $nofollow{$wchild} = 1;
388 } elsif( !($contig->{$wchild}) ) {
389 # The child is a hypothetical node not definitely in
390 # any group. Answer is unknown.
391 $unknownfollow{$wchild} = 1;
392 } # else it's a non-root node in a known group, and therefore
393 # is presumed to have its reading from its group, not this link.
394 }
395 }
396 $rdghash->{'not_followed'} = keys %nofollow;
397 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 398
399 # Now say whether this reading represents a conflict.
400 unless( $variant_row->{'genealogical'} ) {
401 $rdghash->{'conflict'} = @roots != 1;
402 }
c4a4fb1b 403 }
d71100ed 404}
405
fae07016 406
407=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
408
409** NOTE ** This method should hopefully not be called - it is not guaranteed
410to be correct. Serves as a backup for the real solver.
411
412Runs an analysis of the given tradition, at the location given in $rank,
413against the graph of the stemma specified in $stemma_id. The argument
414@merge_relationship_types is an optional list of relationship types for
415which readings so related should be treated as equivalent.
416
417Returns a nested array data structure as follows:
418
419 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
420
421where the group list is the array of arrays passed in for each element of @groups,
422possibly with the addition of hypothetical readings.
423
424
425=cut
426
427sub perl_solver {
428 my( $graph, @groups ) = @_;
429
430 warn "Not implemented yet";
431 return [];
432}
433
434 # Now do the work.
435# my $contig = {};
436# my $subgraph = {};
437# my $is_conflicted;
438# my $conflict = {};
439# my %reading_roots;
440# my $variant_row = { 'id' => $rank, 'readings' => [] };
441# # Mark each ms as in its own group, first.
442# foreach my $g ( @$groups ) {
443# my $gst = wit_stringify( $g );
444# map { $contig->{$_} = $gst } @$g;
445# }
446# # Now for each unmarked node in the graph, initialize an array
447# # for possible group memberships. We will use this later to
448# # resolve potential conflicts.
449# map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
450# foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
451# my $gst = wit_stringify( $g ); # This is the group name
452# # Copy the graph, and delete all non-members from the new graph.
453# my $part = $graph->copy;
454# my @group_roots;
455# $part->delete_vertices(
456# grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
457#
458# # Now look to see if our group is connected.
459# if( $undirected ) { # For use with distance trees etc.
460# # Find all vertices reachable from the first (arbitrary) group
461# # member. If we are genealogical this should include them all.
462# my $reachable = {};
463# map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
464# # TODO This is a terrible way to do distance trees, since all
465# # non-leaf nodes are included in every graph part now. We may
466# # have to go back to SPDP.
467# } else {
468# if( @$g > 1 ) {
469# # We have to take directionality into account.
470# # How many root nodes do we have?
471# my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
472# $part->predecessorless_vertices;
473# # Assuming that @$g > 1, find the first root node that has at
474# # least one successor belonging to our group. If this reading
475# # is genealogical, there should be only one, but we will check
476# # that implicitly later.
477# foreach my $root ( @roots ) {
478# # Prune the tree to get rid of extraneous hypotheticals.
479# $root = _prune_subtree( $part, $root, $contig );
480# next unless $root;
481# # Save this root for our group.
482# push( @group_roots, $root );
483# # Get all the successor nodes of our root.
484# }
485# } else {
486# # Dispense with the trivial case of one reading.
487# my $wit = pop @$g;
488# @group_roots = ( $wit );
489# foreach my $v ( $part->vertices ) {
490# $part->delete_vertex( $v ) unless $v eq $wit;
491# }
492# }
493# }
494#
495# map { $reading_roots{$_} = 1 } @group_roots;
496# if( @group_roots > 1 ) {
497# $conflict->{$group_readings->{$gst}} = 1;
498# $is_conflicted = 1;
499# }
500# # Paint the 'hypotheticals' with our group.
501# foreach my $wit ( $part->vertices ) {
502# if( ref( $contig->{$wit} ) ) {
503# push( @{$contig->{$wit}}, $gst );
504# } elsif( $contig->{$wit} ne $gst ) {
505# warn "How did we get here?";
506# }
507# }
508#
509#
510# # Start to write the reading, and save the group subgraph.
511# my $reading = { 'readingid' => $group_readings->{$gst},
512# 'missing' => wit_stringify( \@lacunose ),
513# 'group' => $gst }; # This will change if we find no conflict
514# # Save the relevant subgraph.
515# $subgraph->{$gst} = $part;
516# push( @{$variant_row->{'readings'}}, $reading );
517# }
518#
519# # For each of our hypothetical readings, flatten its 'contig' array if
520# # the array contains zero or one group. If we have any unflattened arrays,
521# # we may need to run the resolution process. If the reading is already known
522# # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
523# # it.
524# my @resolve;
525# foreach my $wit ( keys %$contig ) {
526# next unless ref( $contig->{$wit} );
527# if( @{$contig->{$wit}} > 1 ) {
528# if( $is_conflicted ) {
529# $contig->{$wit} = ''; # We aren't going to decide.
530# } else {
531# push( @resolve, $wit );
532# }
533# } else {
534# my $gst = pop @{$contig->{$wit}};
535# $contig->{$wit} = $gst || '';
536# }
537# }
538#
539# if( @resolve ) {
540# my $still_contig = {};
541# foreach my $h ( @resolve ) {
542# # For each of the hypothetical readings with more than one possibility,
543# # try deleting it from each of its member subgraphs in turn, and see
544# # if that breaks the contiguous grouping.
545# # TODO This can still break in a corner case where group A can use
546# # either vertex 1 or 2, and group B can use either vertex 2 or 1.
547# # Revisit this if necessary; it could get brute-force nasty.
548# foreach my $gst ( @{$contig->{$h}} ) {
549# my $gpart = $subgraph->{$gst}->copy();
550# # If we have come this far, there is only one root and everything
551# # is reachable from it.
552# my( $root ) = $gpart->predecessorless_vertices;
553# my $reachable = {};
554# map { $reachable->{$_} = 1 } $gpart->vertices;
555#
556# # Try deleting the hypothetical node.
557# $gpart->delete_vertex( $h );
558# if( $h eq $root ) {
559# # See if we still have a single root.
560# my @roots = $gpart->predecessorless_vertices;
561# warn "This shouldn't have happened" unless @roots;
562# if( @roots > 1 ) {
563# # $h is needed by this group.
564# if( exists( $still_contig->{$h} ) ) {
565# # Conflict!
566# $conflict->{$group_readings->{$gst}} = 1;
567# $still_contig->{$h} = '';
568# } else {
569# $still_contig->{$h} = $gst;
570# }
571# }
572# } else {
573# # $h is somewhere in the middle. See if everything
574# # else can still be reached from the root.
575# my %still_reachable = ( $root => 1 );
576# map { $still_reachable{$_} = 1 }
577# $gpart->all_successors( $root );
578# foreach my $v ( keys %$reachable ) {
579# next if $v eq $h;
580# if( !$still_reachable{$v}
581# && ( $contig->{$v} eq $gst
582# || ( exists $still_contig->{$v}
583# && $still_contig->{$v} eq $gst ) ) ) {
584# # We need $h.
585# if( exists $still_contig->{$h} ) {
586# # Conflict!
587# $conflict->{$group_readings->{$gst}} = 1;
588# $still_contig->{$h} = '';
589# } else {
590# $still_contig->{$h} = $gst;
591# }
592# last;
593# } # else we don't need $h in this group.
594# } # end foreach $v
595# } # endif $h eq $root
596# } # end foreach $gst
597# } # end foreach $h
598#
599# # Now we have some hypothetical vertices in $still_contig that are the
600# # "real" group memberships. Replace these in $contig.
601# foreach my $v ( keys %$contig ) {
602# next unless ref $contig->{$v};
603# $contig->{$v} = $still_contig->{$v};
604# }
605# } # end if @resolve
606#
607#
608# $variant_row->{'genealogical'} = !( keys %$conflict );
609# return $variant_row;
610# }
611
7f52eac8 612sub _prune_subtree {
231d71fc 613 my( $tree, $root, $contighash ) = @_;
614 # First, delete hypothetical leaves / orphans until there are none left.
615 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
616 $tree->successorless_vertices;
617 while( @orphan_hypotheticals ) {
618 $tree->delete_vertices( @orphan_hypotheticals );
619 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
620 $tree->successorless_vertices;
621 }
622 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 623 # root to the first child that has no other predecessors.
231d71fc 624 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
625 my @nextroot = $tree->successors( $root );
626 $tree->delete_vertex( $root );
bebec0e9 627 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 628 }
629 # The tree has been modified in place, but we need to know the new root.
bebec0e9 630 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 631 return $root;
632}
d71100ed 633# Add the variant, subject to a.c. representation logic.
634# This assumes that we will see the 'main' version before the a.c. version.
635sub add_variant_wit {
636 my( $arr, $wit, $acstr ) = @_;
637 my $skip;
638 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
639 my $real = $1;
640 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
641 }
642 push( @$arr, $wit ) unless $skip;
643}
644
5be0cdeb 645sub _useful_variant {
646 my( $group_readings, $graph, $acstr ) = @_;
647
648 # TODO Decide what to do with AC witnesses
649
650 # Sort by group size and return
651 my $is_useful = 0;
652 my( @readings, @groups ); # The sorted groups for our answer.
653 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
654 keys %$group_readings ) {
655 push( @readings, $rdg );
656 push( @groups, $group_readings->{$rdg} );
657 if( @{$group_readings->{$rdg}} > 1 ) {
658 $is_useful++;
659 } else {
660 my( $wit ) = @{$group_readings->{$rdg}};
661 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
662 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
663 }
664 }
665 if( $is_useful > 1 ) {
666 return( \@readings, \@groups );
667 } else {
668 return( [], [] );
669 }
670}
671
7f52eac8 672=head2 wit_stringify( $groups )
673
674Takes an array of witness groupings and produces a string like
675['A','B'] / ['C','D','E'] / ['F']
d71100ed 676
7f52eac8 677=cut
d71100ed 678
679sub wit_stringify {
680 my $groups = shift;
681 my @gst;
682 # If we were passed an array of witnesses instead of an array of
683 # groupings, then "group" the witnesses first.
684 unless( ref( $groups->[0] ) ) {
685 my $mkgrp = [ $groups ];
686 $groups = $mkgrp;
687 }
688 foreach my $g ( @$groups ) {
689 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
690 }
691 return join( ' / ', @gst );
692}
7f52eac8 693
5be0cdeb 694# Helper function to ensure that X and X a.c. never appear in the same list.
695sub _add_to_witlist {
696 my( $wit, $list, $acstr ) = @_;
697 my %inlist;
698 my $idx = 0;
699 map { $inlist{$_} = $idx++ } @$list;
700 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
701 my $acwit = $1;
702 unless( exists $inlist{$acwit} ) {
703 push( @$list, $acwit.$acstr );
704 }
705 } else {
706 if( exists( $inlist{$wit.$acstr} ) ) {
707 # Replace the a.c. version with the main witness
708 my $i = $inlist{$wit.$acstr};
709 $list->[$i] = $wit;
710 } else {
711 push( @$list, $wit );
712 }
713 }
714}
715
bebec0e9 716sub _symmdiff {
717 my( $lista, $listb ) = @_;
7f52eac8 718 my %union;
719 my %scalars;
720 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
721 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 722 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 723 return map { $scalars{$_} } @set;
724}
725
7261;
727
728=head1 LICENSE
729
730This package is free software and is provided "as is" without express
731or implied warranty. You can redistribute it and/or modify it under
732the same terms as Perl itself.
733
734=head1 AUTHOR
735
736Tara L Andrews E<lt>aurum@cpan.orgE<gt>