perl solver works again too
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
88a6bac5 8use JSON qw/ encode_json decode_json /;
9use LWP::UserAgent;
d71100ed 10use Text::Tradition;
11use Text::Tradition::Stemma;
12
d1348d38 13use vars qw/ @EXPORT_OK /;
a2cf85dd 14@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 15
7f52eac8 16=head1 NAME
17
18Text::Tradition::Analysis - functions for stemma analysis of a tradition
19
20=head1 SYNOPSIS
21
22 use Text::Tradition;
23 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
24 my $t = Text::Tradition->new(
25 'name' => 'this is a text',
26 'input' => 'TEI',
27 'file' => '/path/to/tei_parallel_seg_file.xml' );
28 $t->add_stemma( 'dotfile' => $stemmafile );
29
30 my $variant_data = run_analysis( $tradition );
31 # Recalculate rank $n treating all orthographic variants as equivalent
32 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
33
34=head1 DESCRIPTION
35
36Text::Tradition is a library for representation and analysis of collated
37texts, particularly medieval ones. The Collation is the central feature of
38a Tradition, where the text, its sequence of readings, and its relationships
39between readings are actually kept.
40
41=head1 SUBROUTINES
42
88a6bac5 43=head2 run_analysis( $tradition, %opts )
7f52eac8 44
88a6bac5 45Runs the analysis described in analyze_variant_location on every location in the
46collation of the given tradition, with the given options. These include:
7f52eac8 47
88a6bac5 48=over 4
49
50=item * stemma_id - Specify which of the tradition's stemmata to use. Default
51is 0 (i.e. the first).
52
53=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
54
55=item * merge_types - Specify a list of relationship types, where related readings
56should be treated as identical for the purposes of analysis.
57
58=back
7f52eac8 59
60=begin testing
61
62use Text::Tradition;
63use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
64
65my $datafile = 't/data/florilegium_tei_ps.xml';
66my $tradition = Text::Tradition->new( 'input' => 'TEI',
67 'name' => 'test0',
68 'file' => $datafile );
69my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
70is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
71
f00cefe8 72my %expected_genealogical = (
a44aaf2a 73 1 => 0,
f00cefe8 74 2 => 1,
a44aaf2a 75 3 => 0,
76 5 => 0,
77 7 => 0,
78 8 => 0,
79 10 => 0,
f00cefe8 80 13 => 1,
a44aaf2a 81 33 => 0,
82 34 => 0,
83 37 => 0,
84 60 => 0,
f00cefe8 85 81 => 1,
a44aaf2a 86 84 => 0,
87 87 => 0,
88 101 => 0,
89 102 => 0,
f00cefe8 90 122 => 1,
a44aaf2a 91 157 => 0,
f00cefe8 92 166 => 1,
93 169 => 1,
a44aaf2a 94 200 => 0,
f00cefe8 95 216 => 1,
96 217 => 1,
97 219 => 1,
98 241 => 1,
99 242 => 1,
100 243 => 1,
101);
102
7f52eac8 103my $data = run_analysis( $tradition );
f00cefe8 104foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 105 # Account for rows that used to be "not useful"
106 unless( exists $expected_genealogical{$row->{'id'}} ) {
107 $expected_genealogical{$row->{'id'}} = 1;
108 }
f00cefe8 109 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
110 "Got correct genealogical flag for row " . $row->{'id'} );
111}
a44aaf2a 112is( $data->{'conflict_count'}, 34, "Got right conflict count" );
113is( $data->{'variant_count'}, 58, "Got right total variant number" );
7f52eac8 114
115=end testing
116
117=cut
118
d71100ed 119sub run_analysis {
88a6bac5 120 my( $tradition, %opts ) = @_;
f00cefe8 121 my $c = $tradition->collation;
88a6bac5 122
123 my $stemma_id = $opts{'stemma_id'} || 0;
124 my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
125 my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
126
127 # Get the stemma
128 my $stemma = $tradition->stemma( $stemma_id );
129 # Figure out which witnesses we are working with
130 my @lacunose = $stemma->hypotheticals;
fae07016 131 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
132 map { push( @tradition_wits, $_->sigil."_ac" ) if $_->is_layered }
133 $tradition->witnesses;
134 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 135
136 # Find and mark 'common' ranks for exclusion, unless they were
137 # explicitly specified.
138 unless( @ranks ) {
139 my %common_rank;
a44aaf2a 140 foreach my $rdg ( $c->common_readings ) {
88a6bac5 141 $common_rank{$rdg->rank} = 1;
142 }
143 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 144 }
7f52eac8 145
88a6bac5 146 # Group the variants to send to the solver
147 my @groups;
a44aaf2a 148 my %lacunae;
88a6bac5 149 foreach my $rank ( @ranks ) {
a44aaf2a 150 my $missing = [ @lacunose ];
151 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
152 $lacunae{$rank} = $missing;
d71100ed 153 }
154
88a6bac5 155 # Parse the answer
e59b8faa 156 my $answer = solve_variants( $stemma, @groups );
fae07016 157
88a6bac5 158 # Do further analysis on the answer
a44aaf2a 159 my $conflict_count = 0;
88a6bac5 160 foreach my $idx ( 0 .. $#ranks ) {
161 my $location = $answer->{'variants'}->[$idx];
162 # Add the rank back in
163 $location->{'id'} = $ranks[$idx];
a44aaf2a 164 # Add the lacunae back in
165 $location->{'missing'} = $lacunae{$ranks[$idx]};
88a6bac5 166 # Run the extra analysis we need.
88a6bac5 167 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 168 # Add the reading text back in
169 foreach my $rdghash ( @{$location->{'readings'}} ) {
170 $conflict_count++
171 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
172 my $rdg = $c->reading( $rdghash->{'readingid'} );
173 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
174 }
88a6bac5 175 }
a44aaf2a 176 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 177
88a6bac5 178 return $answer;
d71100ed 179}
180
7f52eac8 181=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
182
183Groups the variants at the given $rank of the collation, treating any
184relationships in @merge_relationship_types as equivalent. $lacunose should
185be a reference to an array, to which the sigla of lacunose witnesses at this
186rank will be appended.
187
188Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
189by the witnesses listed in $groups->[$n].
190
191=cut
192
193# Return group_readings, groups, lacunose
d1348d38 194sub group_variants {
7f52eac8 195 my( $tradition, $rank, $lacunose, $collapse ) = @_;
196 my $c = $tradition->collation;
e59b8faa 197 # All the regexps here are to get rid of space characters in witness names.
fae07016 198 my $aclabel = $c->ac_label;
e59b8faa 199 $aclabel =~ s/\s/_/g;
7f52eac8 200 # Get the alignment table readings
201 my %readings_at_rank;
202 my @gap_wits;
203 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
204 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 205 my $wit = $tablewit->{'witness'};
e59b8faa 206 $wit =~ s/\s/_/g;
7f52eac8 207 if( $rdg && $rdg->{'t'}->is_lacuna ) {
e59b8faa 208 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 209 } elsif( $rdg ) {
210 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
211 } else {
e59b8faa 212 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 213 }
214 }
d1348d38 215
7f52eac8 216 # Group the readings, collapsing groups by relationship if needed
217 my %grouped_readings;
218 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
219 # Skip readings that have been collapsed into others.
f00cefe8 220 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 221 my @wits = $rdg->witnesses;
e59b8faa 222 map { s/\s/_/g } @wits;
7f52eac8 223 if( $collapse ) {
224 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
225 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 226 my @otherwits = $other->witnesses;
e59b8faa 227 map { s/\s/_/g } @otherwits;
fae07016 228 push( @wits, @otherwits );
f00cefe8 229 $grouped_readings{$other->id} = 0;
d1348d38 230 }
231 }
f00cefe8 232 $grouped_readings{$rdg->id} = \@wits;
7f52eac8 233 }
234 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
235 # Get rid of our collapsed readings
236 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
237 keys %grouped_readings
238 if $collapse;
239
5be0cdeb 240 return \%grouped_readings;
d1348d38 241}
242
88a6bac5 243=head2 solve_variants( $graph, @groups )
244
245Sends the set of groups to the external graph solver service and returns
246a cleaned-up answer, adding the rank IDs back where they belong.
247
248The JSON has the form
249 { "graph": [ stemmagraph DOT string without newlines ],
250 "groupings": [ array of arrays of groups, one per rank ] }
251
252The answer has the form
253 { "variants" => [ array of variant location structures ],
254 "variant_count" => total,
255 "conflict_count" => number of conflicts detected,
256 "genealogical_count" => number of solutions found }
257
258=cut
259
260sub solve_variants {
e59b8faa 261 my( $stemma, @groups ) = @_;
88a6bac5 262
263 # Make the json with stemma + groups
e59b8faa 264 my $jsonstruct = { 'graph' => $stemma->editable( ' ' ), 'groupings' => [] };
88a6bac5 265 foreach my $ghash ( @groups ) {
266 my @grouping;
267 foreach my $k ( sort keys %$ghash ) {
268 push( @grouping, $ghash->{$k} );
269 }
270 push( @{$jsonstruct->{'groupings'}}, \@grouping );
271 }
272 my $json = encode_json( $jsonstruct );
273
274 # Send it off and get the result
275 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
276 my $ua = LWP::UserAgent->new();
277 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
278 'Content' => $json );
279
280 my $answer;
281 if( $resp->is_success ) {
282 $answer = decode_json( $resp->content );
283 } else {
fae07016 284 # Fall back to the old method.
285 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
286 . "; falling back to perl method";
e59b8faa 287 $answer = perl_solver( $stemma, @{$jsonstruct->{'groupings'}} );
88a6bac5 288 }
289
290 # Fold the result back into what we know about the groups.
291 my $variants = [];
292 my $genealogical = 0;
293 foreach my $idx ( 0 .. $#groups ) {
294 my( $calc_groups, $result ) = @{$answer->[$idx]};
295 $genealogical++ if $result;
296 my $input_group = $groups[$idx];
297 foreach my $k ( sort keys %$input_group ) {
298 my $cg = shift @$calc_groups;
299 $input_group->{$k} = $cg;
300 }
301 my $vstruct = {
302 'genealogical' => $result,
303 'readings' => [],
fae07016 304 };
88a6bac5 305 foreach my $k ( keys %$input_group ) {
306 push( @{$vstruct->{'readings'}},
fae07016 307 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 308 }
309 push( @$variants, $vstruct );
310 }
311
312 return { 'variants' => $variants,
313 'variant_count' => scalar @$variants,
314 'genealogical_count' => $genealogical };
315}
316
fae07016 317=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 318
fae07016 319Given the tradition, its stemma graph, and the solution from the graph solver,
320work out the rest of the information we want. For each reading we need missing,
321conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 322
323=cut
732152b1 324
fae07016 325sub analyze_location {
326 my ( $tradition, $graph, $variant_row ) = @_;
327
328 # Make a hash of all known node memberships, and make the subgraphs.
329 my $contig = {};
330 my $reading_roots = {};
331 my $subgraph = {};
332 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
333 my $rid = $rdghash->{'readingid'};
334 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 335
fae07016 336 # Make the subgraph.
337 my $part = $graph->copy;
338 my %these_vertices;
339 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
340 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
341 $subgraph->{$rid} = $part;
342 # Get the reading roots.
343 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 344 }
345
fae07016 346 # Now that we have all the node group memberships, calculate followed/
bebec0e9 347 # non-followed/unknown values for each reading. Also figure out the
348 # reading's evident parent(s).
349 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 350 # Group string key - TODO do we need this?
351 my $gst = wit_stringify( $rdghash->{'group'} );
352 my $rid = $rdghash->{'readingid'};
353 # Get the subgraph
354 my $part = $subgraph->{$rid};
355
356 # Start figuring things out.
bebec0e9 357 my @roots = $part->predecessorless_vertices;
358 $rdghash->{'independent_occurrence'} = scalar @roots;
359 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
360 # Find the parent readings, if any, of this reading.
f00cefe8 361 my %rdgparents;
bebec0e9 362 foreach my $wit ( @roots ) {
f00cefe8 363 # Look in the main stemma to find this witness's extant or known-reading
364 # immediate ancestor(s), and look up the reading that each ancestor olds.
365 my @check = $graph->predecessors( $wit );
366 while( @check ) {
367 my @next;
368 foreach my $wparent( @check ) {
fae07016 369 my $preading = $contig->{$wparent};
370 if( $preading ) {
371 $rdgparents{$preading} = 1;
f00cefe8 372 } else {
373 push( @next, $graph->predecessors( $wparent ) );
374 }
375 }
376 @check = @next;
377 }
bebec0e9 378 }
f00cefe8 379 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 380
381 # Find the number of times this reading was altered, and the number of
382 # times we're not sure.
383 my( %nofollow, %unknownfollow );
384 foreach my $wit ( $part->vertices ) {
385 foreach my $wchild ( $graph->successors( $wit ) ) {
386 next if $part->has_vertex( $wchild );
fae07016 387 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 388 # It definitely changed here.
389 $nofollow{$wchild} = 1;
390 } elsif( !($contig->{$wchild}) ) {
391 # The child is a hypothetical node not definitely in
392 # any group. Answer is unknown.
393 $unknownfollow{$wchild} = 1;
394 } # else it's a non-root node in a known group, and therefore
395 # is presumed to have its reading from its group, not this link.
396 }
397 }
398 $rdghash->{'not_followed'} = keys %nofollow;
399 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 400
401 # Now say whether this reading represents a conflict.
402 unless( $variant_row->{'genealogical'} ) {
403 $rdghash->{'conflict'} = @roots != 1;
404 }
c4a4fb1b 405 }
d71100ed 406}
407
fae07016 408
409=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
410
411** NOTE ** This method should hopefully not be called - it is not guaranteed
412to be correct. Serves as a backup for the real solver.
413
414Runs an analysis of the given tradition, at the location given in $rank,
415against the graph of the stemma specified in $stemma_id. The argument
416@merge_relationship_types is an optional list of relationship types for
417which readings so related should be treated as equivalent.
418
419Returns a nested array data structure as follows:
420
421 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
422
423where the group list is the array of arrays passed in for each element of @groups,
424possibly with the addition of hypothetical readings.
425
426
427=cut
428
429sub perl_solver {
e59b8faa 430 my( $stemma, @groups ) = @_;
431 my $graph = $stemma->graph;
432 my @answer;
433 foreach my $g ( @groups ) {
434 push( @answer, _solve_variant_location( $graph, $g ) );
435 }
436 return \@answer;
fae07016 437}
438
e59b8faa 439sub _solve_variant_location {
440 my( $graph, $groups ) = @_;
fae07016 441 # Now do the work.
e59b8faa 442 my $contig = {};
443 my $subgraph = {};
444 my $is_conflicted;
445 my $conflict = {};
446
447 # Mark each ms as in its own group, first.
448 foreach my $g ( @$groups ) {
449 my $gst = wit_stringify( $g );
450 map { $contig->{$_} = $gst } @$g;
451 }
452
453 # Now for each unmarked node in the graph, initialize an array
454 # for possible group memberships. We will use this later to
455 # resolve potential conflicts.
456 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
457 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
458 my $gst = wit_stringify( $g ); # This is the group name
459 # Copy the graph, and delete all non-members from the new graph.
460 my $part = $graph->copy;
461 my @group_roots;
462 $part->delete_vertices(
463 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
464
465 # Now look to see if our group is connected.
466 if( @$g > 1 ) {
467 # We have to take directionality into account.
468 # How many root nodes do we have?
469 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
470 $part->predecessorless_vertices;
471 # Assuming that @$g > 1, find the first root node that has at
472 # least one successor belonging to our group. If this reading
473 # is genealogical, there should be only one, but we will check
474 # that implicitly later.
475 foreach my $root ( @roots ) {
476 # Prune the tree to get rid of extraneous hypotheticals.
477 $root = _prune_subtree( $part, $root, $contig );
478 next unless $root;
479 # Save this root for our group.
480 push( @group_roots, $root );
481 # Get all the successor nodes of our root.
482 }
483 } else {
484 # Dispense with the trivial case of one reading.
485 my $wit = $g->[0];
486 @group_roots = ( $wit );
487 foreach my $v ( $part->vertices ) {
488 $part->delete_vertex( $v ) unless $v eq $wit;
489 }
490 }
491
492 if( @group_roots > 1 ) {
493 $conflict->{$gst} = 1;
494 $is_conflicted = 1;
495 }
496 # Paint the 'hypotheticals' with our group.
497 foreach my $wit ( $part->vertices ) {
498 if( ref( $contig->{$wit} ) ) {
499 push( @{$contig->{$wit}}, $gst );
500 } elsif( $contig->{$wit} ne $gst ) {
501 warn "How did we get here?";
502 }
503 }
504
505
506 # Save the relevant subgraph.
507 $subgraph->{$gst} = $part;
508 }
509
510 # For each of our hypothetical readings, flatten its 'contig' array if
511 # the array contains zero or one group. If we have any unflattened arrays,
512 # we may need to run the resolution process. If the reading is already known
513 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
514 # it.
515 my @resolve;
516 foreach my $wit ( keys %$contig ) {
517 next unless ref( $contig->{$wit} );
518 if( @{$contig->{$wit}} > 1 ) {
519 if( $is_conflicted ) {
520 $contig->{$wit} = ''; # We aren't going to decide.
521 } else {
522 push( @resolve, $wit );
523 }
524 } else {
525 my $gst = pop @{$contig->{$wit}};
526 $contig->{$wit} = $gst || '';
527 }
528 }
529
530 if( @resolve ) {
531 my $still_contig = {};
532 foreach my $h ( @resolve ) {
533 # For each of the hypothetical readings with more than one possibility,
534 # try deleting it from each of its member subgraphs in turn, and see
535 # if that breaks the contiguous grouping.
536 # TODO This can still break in a corner case where group A can use
537 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
538 # Revisit this if necessary; it could get brute-force nasty.
539 foreach my $gst ( @{$contig->{$h}} ) {
540 my $gpart = $subgraph->{$gst}->copy();
541 # If we have come this far, there is only one root and everything
542 # is reachable from it.
543 my( $root ) = $gpart->predecessorless_vertices;
544 my $reachable = {};
545 map { $reachable->{$_} = 1 } $gpart->vertices;
546
547 # Try deleting the hypothetical node.
548 $gpart->delete_vertex( $h );
549 if( $h eq $root ) {
550 # See if we still have a single root.
551 my @roots = $gpart->predecessorless_vertices;
552 warn "This shouldn't have happened" unless @roots;
553 if( @roots > 1 ) {
554 # $h is needed by this group.
555 if( exists( $still_contig->{$h} ) ) {
556 # Conflict!
557 $conflict->{$gst} = 1;
558 $still_contig->{$h} = '';
559 } else {
560 $still_contig->{$h} = $gst;
561 }
562 }
563 } else {
564 # $h is somewhere in the middle. See if everything
565 # else can still be reached from the root.
566 my %still_reachable = ( $root => 1 );
567 map { $still_reachable{$_} = 1 }
568 $gpart->all_successors( $root );
569 foreach my $v ( keys %$reachable ) {
570 next if $v eq $h;
571 if( !$still_reachable{$v}
572 && ( $contig->{$v} eq $gst
573 || ( exists $still_contig->{$v}
574 && $still_contig->{$v} eq $gst ) ) ) {
575 # We need $h.
576 if( exists $still_contig->{$h} ) {
577 # Conflict!
578 $conflict->{$gst} = 1;
579 $still_contig->{$h} = '';
580 } else {
581 $still_contig->{$h} = $gst;
582 }
583 last;
584 } # else we don't need $h in this group.
585 } # end foreach $v
586 } # endif $h eq $root
587 } # end foreach $gst
588 } # end foreach $h
589
590 # Now we have some hypothetical vertices in $still_contig that are the
591 # "real" group memberships. Replace these in $contig.
592 foreach my $v ( keys %$contig ) {
593 next unless ref $contig->{$v};
594 $contig->{$v} = $still_contig->{$v};
595 }
596 } # end if @resolve
597
598 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
599 my $variant_row = [ [], $is_genealogical ];
600 # Fill in the groupings from $contig.
601 foreach my $g ( @$groups ) {
602 my $gst = wit_stringify( $g );
603 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
604 push( @{$variant_row->[0]}, \@realgroup );
605 }
606 return $variant_row;
607}
fae07016 608
7f52eac8 609sub _prune_subtree {
231d71fc 610 my( $tree, $root, $contighash ) = @_;
611 # First, delete hypothetical leaves / orphans until there are none left.
612 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
613 $tree->successorless_vertices;
614 while( @orphan_hypotheticals ) {
615 $tree->delete_vertices( @orphan_hypotheticals );
616 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
617 $tree->successorless_vertices;
618 }
619 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 620 # root to the first child that has no other predecessors.
231d71fc 621 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
622 my @nextroot = $tree->successors( $root );
623 $tree->delete_vertex( $root );
bebec0e9 624 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 625 }
626 # The tree has been modified in place, but we need to know the new root.
bebec0e9 627 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 628 return $root;
629}
d71100ed 630# Add the variant, subject to a.c. representation logic.
631# This assumes that we will see the 'main' version before the a.c. version.
632sub add_variant_wit {
633 my( $arr, $wit, $acstr ) = @_;
634 my $skip;
635 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
636 my $real = $1;
637 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
638 }
639 push( @$arr, $wit ) unless $skip;
640}
641
5be0cdeb 642sub _useful_variant {
643 my( $group_readings, $graph, $acstr ) = @_;
644
645 # TODO Decide what to do with AC witnesses
646
647 # Sort by group size and return
648 my $is_useful = 0;
649 my( @readings, @groups ); # The sorted groups for our answer.
650 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
651 keys %$group_readings ) {
652 push( @readings, $rdg );
653 push( @groups, $group_readings->{$rdg} );
654 if( @{$group_readings->{$rdg}} > 1 ) {
655 $is_useful++;
656 } else {
657 my( $wit ) = @{$group_readings->{$rdg}};
658 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
659 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
660 }
661 }
662 if( $is_useful > 1 ) {
663 return( \@readings, \@groups );
664 } else {
665 return( [], [] );
666 }
667}
668
7f52eac8 669=head2 wit_stringify( $groups )
670
671Takes an array of witness groupings and produces a string like
672['A','B'] / ['C','D','E'] / ['F']
d71100ed 673
7f52eac8 674=cut
d71100ed 675
676sub wit_stringify {
677 my $groups = shift;
678 my @gst;
679 # If we were passed an array of witnesses instead of an array of
680 # groupings, then "group" the witnesses first.
681 unless( ref( $groups->[0] ) ) {
682 my $mkgrp = [ $groups ];
683 $groups = $mkgrp;
684 }
685 foreach my $g ( @$groups ) {
686 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
687 }
688 return join( ' / ', @gst );
689}
7f52eac8 690
5be0cdeb 691# Helper function to ensure that X and X a.c. never appear in the same list.
692sub _add_to_witlist {
693 my( $wit, $list, $acstr ) = @_;
694 my %inlist;
695 my $idx = 0;
696 map { $inlist{$_} = $idx++ } @$list;
697 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
698 my $acwit = $1;
699 unless( exists $inlist{$acwit} ) {
700 push( @$list, $acwit.$acstr );
701 }
702 } else {
703 if( exists( $inlist{$wit.$acstr} ) ) {
704 # Replace the a.c. version with the main witness
705 my $i = $inlist{$wit.$acstr};
706 $list->[$i] = $wit;
707 } else {
708 push( @$list, $wit );
709 }
710 }
711}
712
bebec0e9 713sub _symmdiff {
714 my( $lista, $listb ) = @_;
7f52eac8 715 my %union;
716 my %scalars;
717 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
718 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 719 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 720 return map { $scalars{$_} } @set;
721}
722
7231;
724
725=head1 LICENSE
726
727This package is free software and is provided "as is" without express
728or implied warranty. You can redistribute it and/or modify it under
729the same terms as Perl itself.
730
731=head1 AUTHOR
732
733Tara L Andrews E<lt>aurum@cpan.orgE<gt>