refactor GraphML write/parse to use Moose introspection
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
88a6bac5 8use JSON qw/ encode_json decode_json /;
9use LWP::UserAgent;
d71100ed 10use Text::Tradition;
11use Text::Tradition::Stemma;
12
d1348d38 13use vars qw/ @EXPORT_OK /;
a2cf85dd 14@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 15
7f52eac8 16=head1 NAME
17
18Text::Tradition::Analysis - functions for stemma analysis of a tradition
19
20=head1 SYNOPSIS
21
22 use Text::Tradition;
23 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
24 my $t = Text::Tradition->new(
25 'name' => 'this is a text',
26 'input' => 'TEI',
27 'file' => '/path/to/tei_parallel_seg_file.xml' );
28 $t->add_stemma( 'dotfile' => $stemmafile );
29
30 my $variant_data = run_analysis( $tradition );
31 # Recalculate rank $n treating all orthographic variants as equivalent
32 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
33
34=head1 DESCRIPTION
35
36Text::Tradition is a library for representation and analysis of collated
37texts, particularly medieval ones. The Collation is the central feature of
38a Tradition, where the text, its sequence of readings, and its relationships
39between readings are actually kept.
40
41=head1 SUBROUTINES
42
88a6bac5 43=head2 run_analysis( $tradition, %opts )
7f52eac8 44
88a6bac5 45Runs the analysis described in analyze_variant_location on every location in the
46collation of the given tradition, with the given options. These include:
7f52eac8 47
88a6bac5 48=over 4
49
50=item * stemma_id - Specify which of the tradition's stemmata to use. Default
51is 0 (i.e. the first).
52
53=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
54
55=item * merge_types - Specify a list of relationship types, where related readings
56should be treated as identical for the purposes of analysis.
57
58=back
7f52eac8 59
60=begin testing
61
62use Text::Tradition;
63use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
64
65my $datafile = 't/data/florilegium_tei_ps.xml';
66my $tradition = Text::Tradition->new( 'input' => 'TEI',
67 'name' => 'test0',
68 'file' => $datafile );
69my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
70is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
71
f00cefe8 72my %expected_genealogical = (
a44aaf2a 73 1 => 0,
f00cefe8 74 2 => 1,
a44aaf2a 75 3 => 0,
76 5 => 0,
77 7 => 0,
78 8 => 0,
79 10 => 0,
f00cefe8 80 13 => 1,
a44aaf2a 81 33 => 0,
82 34 => 0,
83 37 => 0,
84 60 => 0,
f00cefe8 85 81 => 1,
a44aaf2a 86 84 => 0,
87 87 => 0,
88 101 => 0,
89 102 => 0,
f00cefe8 90 122 => 1,
a44aaf2a 91 157 => 0,
f00cefe8 92 166 => 1,
93 169 => 1,
a44aaf2a 94 200 => 0,
f00cefe8 95 216 => 1,
96 217 => 1,
97 219 => 1,
98 241 => 1,
99 242 => 1,
100 243 => 1,
101);
102
7f52eac8 103my $data = run_analysis( $tradition );
f00cefe8 104foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 105 # Account for rows that used to be "not useful"
106 unless( exists $expected_genealogical{$row->{'id'}} ) {
107 $expected_genealogical{$row->{'id'}} = 1;
108 }
f00cefe8 109 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
110 "Got correct genealogical flag for row " . $row->{'id'} );
111}
a44aaf2a 112is( $data->{'conflict_count'}, 34, "Got right conflict count" );
113is( $data->{'variant_count'}, 58, "Got right total variant number" );
7f52eac8 114
115=end testing
116
117=cut
118
d71100ed 119sub run_analysis {
88a6bac5 120 my( $tradition, %opts ) = @_;
f00cefe8 121 my $c = $tradition->collation;
88a6bac5 122
123 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 124 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
125 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 126
127 # Get the stemma
128 my $stemma = $tradition->stemma( $stemma_id );
129 # Figure out which witnesses we are working with
130 my @lacunose = $stemma->hypotheticals;
fae07016 131 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
132 map { push( @tradition_wits, $_->sigil."_ac" ) if $_->is_layered }
133 $tradition->witnesses;
134 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 135
136 # Find and mark 'common' ranks for exclusion, unless they were
137 # explicitly specified.
138 unless( @ranks ) {
139 my %common_rank;
a44aaf2a 140 foreach my $rdg ( $c->common_readings ) {
88a6bac5 141 $common_rank{$rdg->rank} = 1;
142 }
143 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 144 }
7f52eac8 145
88a6bac5 146 # Group the variants to send to the solver
147 my @groups;
a44aaf2a 148 my %lacunae;
88a6bac5 149 foreach my $rank ( @ranks ) {
a44aaf2a 150 my $missing = [ @lacunose ];
151 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
152 $lacunae{$rank} = $missing;
d71100ed 153 }
154
88a6bac5 155 # Parse the answer
e59b8faa 156 my $answer = solve_variants( $stemma, @groups );
fae07016 157
88a6bac5 158 # Do further analysis on the answer
a44aaf2a 159 my $conflict_count = 0;
88a6bac5 160 foreach my $idx ( 0 .. $#ranks ) {
161 my $location = $answer->{'variants'}->[$idx];
162 # Add the rank back in
163 $location->{'id'} = $ranks[$idx];
a44aaf2a 164 # Add the lacunae back in
165 $location->{'missing'} = $lacunae{$ranks[$idx]};
88a6bac5 166 # Run the extra analysis we need.
88a6bac5 167 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 168 # Add the reading text back in
169 foreach my $rdghash ( @{$location->{'readings'}} ) {
170 $conflict_count++
171 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
172 my $rdg = $c->reading( $rdghash->{'readingid'} );
173 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
174 }
88a6bac5 175 }
a44aaf2a 176 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 177
88a6bac5 178 return $answer;
d71100ed 179}
180
7f52eac8 181=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
182
183Groups the variants at the given $rank of the collation, treating any
184relationships in @merge_relationship_types as equivalent. $lacunose should
185be a reference to an array, to which the sigla of lacunose witnesses at this
186rank will be appended.
187
188Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
189by the witnesses listed in $groups->[$n].
190
191=cut
192
193# Return group_readings, groups, lacunose
d1348d38 194sub group_variants {
7f52eac8 195 my( $tradition, $rank, $lacunose, $collapse ) = @_;
196 my $c = $tradition->collation;
fae07016 197 my $aclabel = $c->ac_label;
7f52eac8 198 # Get the alignment table readings
199 my %readings_at_rank;
200 my @gap_wits;
1d73ecad 201 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 202 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 203 my $wit = $tablewit->{'witness'};
1d73ecad 204 $wit =~ s/^(.*)\Q$aclabel\E$/${1}_ac/;
7f52eac8 205 if( $rdg && $rdg->{'t'}->is_lacuna ) {
1d73ecad 206 _add_to_witlist( $wit, $lacunose, '_ac' );
7f52eac8 207 } elsif( $rdg ) {
208 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
209 } else {
1d73ecad 210 _add_to_witlist( $wit, \@gap_wits, '_ac' );
7f52eac8 211 }
212 }
d1348d38 213
7f52eac8 214 # Group the readings, collapsing groups by relationship if needed
215 my %grouped_readings;
216 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
217 # Skip readings that have been collapsed into others.
f00cefe8 218 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 219 my @wits = $rdg->witnesses;
1d73ecad 220 map { s/\Q$aclabel\E$/_ac/ } @wits;
7f52eac8 221 if( $collapse ) {
222 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
223 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 224 my @otherwits = $other->witnesses;
1d73ecad 225 map { s/\Q$aclabel\E$/_ac/ } @otherwits;
fae07016 226 push( @wits, @otherwits );
f00cefe8 227 $grouped_readings{$other->id} = 0;
d1348d38 228 }
229 }
f00cefe8 230 $grouped_readings{$rdg->id} = \@wits;
7f52eac8 231 }
232 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
233 # Get rid of our collapsed readings
234 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
235 keys %grouped_readings
236 if $collapse;
237
5be0cdeb 238 return \%grouped_readings;
d1348d38 239}
240
88a6bac5 241=head2 solve_variants( $graph, @groups )
242
243Sends the set of groups to the external graph solver service and returns
244a cleaned-up answer, adding the rank IDs back where they belong.
245
246The JSON has the form
247 { "graph": [ stemmagraph DOT string without newlines ],
248 "groupings": [ array of arrays of groups, one per rank ] }
249
250The answer has the form
251 { "variants" => [ array of variant location structures ],
252 "variant_count" => total,
253 "conflict_count" => number of conflicts detected,
254 "genealogical_count" => number of solutions found }
255
256=cut
257
258sub solve_variants {
e59b8faa 259 my( $stemma, @groups ) = @_;
88a6bac5 260
261 # Make the json with stemma + groups
e59b8faa 262 my $jsonstruct = { 'graph' => $stemma->editable( ' ' ), 'groupings' => [] };
88a6bac5 263 foreach my $ghash ( @groups ) {
264 my @grouping;
265 foreach my $k ( sort keys %$ghash ) {
266 push( @grouping, $ghash->{$k} );
267 }
268 push( @{$jsonstruct->{'groupings'}}, \@grouping );
269 }
270 my $json = encode_json( $jsonstruct );
271
272 # Send it off and get the result
273 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
274 my $ua = LWP::UserAgent->new();
275 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
276 'Content' => $json );
277
278 my $answer;
279 if( $resp->is_success ) {
280 $answer = decode_json( $resp->content );
281 } else {
fae07016 282 # Fall back to the old method.
283 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
284 . "; falling back to perl method";
e59b8faa 285 $answer = perl_solver( $stemma, @{$jsonstruct->{'groupings'}} );
88a6bac5 286 }
287
288 # Fold the result back into what we know about the groups.
289 my $variants = [];
290 my $genealogical = 0;
291 foreach my $idx ( 0 .. $#groups ) {
292 my( $calc_groups, $result ) = @{$answer->[$idx]};
293 $genealogical++ if $result;
294 my $input_group = $groups[$idx];
295 foreach my $k ( sort keys %$input_group ) {
296 my $cg = shift @$calc_groups;
297 $input_group->{$k} = $cg;
298 }
299 my $vstruct = {
300 'genealogical' => $result,
301 'readings' => [],
fae07016 302 };
88a6bac5 303 foreach my $k ( keys %$input_group ) {
304 push( @{$vstruct->{'readings'}},
fae07016 305 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 306 }
307 push( @$variants, $vstruct );
308 }
309
310 return { 'variants' => $variants,
311 'variant_count' => scalar @$variants,
312 'genealogical_count' => $genealogical };
313}
314
fae07016 315=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 316
fae07016 317Given the tradition, its stemma graph, and the solution from the graph solver,
318work out the rest of the information we want. For each reading we need missing,
319conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 320
321=cut
732152b1 322
fae07016 323sub analyze_location {
324 my ( $tradition, $graph, $variant_row ) = @_;
325
326 # Make a hash of all known node memberships, and make the subgraphs.
327 my $contig = {};
328 my $reading_roots = {};
329 my $subgraph = {};
330 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
331 my $rid = $rdghash->{'readingid'};
332 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 333
fae07016 334 # Make the subgraph.
335 my $part = $graph->copy;
336 my %these_vertices;
337 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
338 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
339 $subgraph->{$rid} = $part;
340 # Get the reading roots.
341 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 342 }
343
fae07016 344 # Now that we have all the node group memberships, calculate followed/
bebec0e9 345 # non-followed/unknown values for each reading. Also figure out the
346 # reading's evident parent(s).
347 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 348 # Group string key - TODO do we need this?
349 my $gst = wit_stringify( $rdghash->{'group'} );
350 my $rid = $rdghash->{'readingid'};
351 # Get the subgraph
352 my $part = $subgraph->{$rid};
353
354 # Start figuring things out.
bebec0e9 355 my @roots = $part->predecessorless_vertices;
356 $rdghash->{'independent_occurrence'} = scalar @roots;
357 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
358 # Find the parent readings, if any, of this reading.
f00cefe8 359 my %rdgparents;
bebec0e9 360 foreach my $wit ( @roots ) {
f00cefe8 361 # Look in the main stemma to find this witness's extant or known-reading
362 # immediate ancestor(s), and look up the reading that each ancestor olds.
363 my @check = $graph->predecessors( $wit );
364 while( @check ) {
365 my @next;
366 foreach my $wparent( @check ) {
fae07016 367 my $preading = $contig->{$wparent};
368 if( $preading ) {
369 $rdgparents{$preading} = 1;
f00cefe8 370 } else {
371 push( @next, $graph->predecessors( $wparent ) );
372 }
373 }
374 @check = @next;
375 }
bebec0e9 376 }
f00cefe8 377 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 378
379 # Find the number of times this reading was altered, and the number of
380 # times we're not sure.
381 my( %nofollow, %unknownfollow );
382 foreach my $wit ( $part->vertices ) {
383 foreach my $wchild ( $graph->successors( $wit ) ) {
384 next if $part->has_vertex( $wchild );
fae07016 385 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 386 # It definitely changed here.
387 $nofollow{$wchild} = 1;
388 } elsif( !($contig->{$wchild}) ) {
389 # The child is a hypothetical node not definitely in
390 # any group. Answer is unknown.
391 $unknownfollow{$wchild} = 1;
392 } # else it's a non-root node in a known group, and therefore
393 # is presumed to have its reading from its group, not this link.
394 }
395 }
396 $rdghash->{'not_followed'} = keys %nofollow;
397 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 398
399 # Now say whether this reading represents a conflict.
400 unless( $variant_row->{'genealogical'} ) {
401 $rdghash->{'conflict'} = @roots != 1;
402 }
c4a4fb1b 403 }
d71100ed 404}
405
fae07016 406
407=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
408
409** NOTE ** This method should hopefully not be called - it is not guaranteed
410to be correct. Serves as a backup for the real solver.
411
412Runs an analysis of the given tradition, at the location given in $rank,
413against the graph of the stemma specified in $stemma_id. The argument
414@merge_relationship_types is an optional list of relationship types for
415which readings so related should be treated as equivalent.
416
417Returns a nested array data structure as follows:
418
419 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
420
421where the group list is the array of arrays passed in for each element of @groups,
422possibly with the addition of hypothetical readings.
423
424
425=cut
426
427sub perl_solver {
e59b8faa 428 my( $stemma, @groups ) = @_;
429 my $graph = $stemma->graph;
430 my @answer;
431 foreach my $g ( @groups ) {
432 push( @answer, _solve_variant_location( $graph, $g ) );
433 }
434 return \@answer;
fae07016 435}
436
e59b8faa 437sub _solve_variant_location {
438 my( $graph, $groups ) = @_;
fae07016 439 # Now do the work.
e59b8faa 440 my $contig = {};
441 my $subgraph = {};
442 my $is_conflicted;
443 my $conflict = {};
444
445 # Mark each ms as in its own group, first.
446 foreach my $g ( @$groups ) {
447 my $gst = wit_stringify( $g );
448 map { $contig->{$_} = $gst } @$g;
449 }
450
451 # Now for each unmarked node in the graph, initialize an array
452 # for possible group memberships. We will use this later to
453 # resolve potential conflicts.
454 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
455 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
456 my $gst = wit_stringify( $g ); # This is the group name
457 # Copy the graph, and delete all non-members from the new graph.
458 my $part = $graph->copy;
459 my @group_roots;
460 $part->delete_vertices(
461 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
462
463 # Now look to see if our group is connected.
464 if( @$g > 1 ) {
465 # We have to take directionality into account.
466 # How many root nodes do we have?
467 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
468 $part->predecessorless_vertices;
469 # Assuming that @$g > 1, find the first root node that has at
470 # least one successor belonging to our group. If this reading
471 # is genealogical, there should be only one, but we will check
472 # that implicitly later.
473 foreach my $root ( @roots ) {
474 # Prune the tree to get rid of extraneous hypotheticals.
475 $root = _prune_subtree( $part, $root, $contig );
476 next unless $root;
477 # Save this root for our group.
478 push( @group_roots, $root );
479 # Get all the successor nodes of our root.
480 }
481 } else {
482 # Dispense with the trivial case of one reading.
483 my $wit = $g->[0];
484 @group_roots = ( $wit );
485 foreach my $v ( $part->vertices ) {
486 $part->delete_vertex( $v ) unless $v eq $wit;
487 }
488 }
489
490 if( @group_roots > 1 ) {
491 $conflict->{$gst} = 1;
492 $is_conflicted = 1;
493 }
494 # Paint the 'hypotheticals' with our group.
495 foreach my $wit ( $part->vertices ) {
496 if( ref( $contig->{$wit} ) ) {
497 push( @{$contig->{$wit}}, $gst );
498 } elsif( $contig->{$wit} ne $gst ) {
499 warn "How did we get here?";
500 }
501 }
502
503
504 # Save the relevant subgraph.
505 $subgraph->{$gst} = $part;
506 }
507
508 # For each of our hypothetical readings, flatten its 'contig' array if
509 # the array contains zero or one group. If we have any unflattened arrays,
510 # we may need to run the resolution process. If the reading is already known
511 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
512 # it.
513 my @resolve;
514 foreach my $wit ( keys %$contig ) {
515 next unless ref( $contig->{$wit} );
516 if( @{$contig->{$wit}} > 1 ) {
517 if( $is_conflicted ) {
518 $contig->{$wit} = ''; # We aren't going to decide.
519 } else {
520 push( @resolve, $wit );
521 }
522 } else {
523 my $gst = pop @{$contig->{$wit}};
524 $contig->{$wit} = $gst || '';
525 }
526 }
527
528 if( @resolve ) {
529 my $still_contig = {};
530 foreach my $h ( @resolve ) {
531 # For each of the hypothetical readings with more than one possibility,
532 # try deleting it from each of its member subgraphs in turn, and see
533 # if that breaks the contiguous grouping.
534 # TODO This can still break in a corner case where group A can use
535 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
536 # Revisit this if necessary; it could get brute-force nasty.
537 foreach my $gst ( @{$contig->{$h}} ) {
538 my $gpart = $subgraph->{$gst}->copy();
539 # If we have come this far, there is only one root and everything
540 # is reachable from it.
541 my( $root ) = $gpart->predecessorless_vertices;
542 my $reachable = {};
543 map { $reachable->{$_} = 1 } $gpart->vertices;
544
545 # Try deleting the hypothetical node.
546 $gpart->delete_vertex( $h );
547 if( $h eq $root ) {
548 # See if we still have a single root.
549 my @roots = $gpart->predecessorless_vertices;
550 warn "This shouldn't have happened" unless @roots;
551 if( @roots > 1 ) {
552 # $h is needed by this group.
553 if( exists( $still_contig->{$h} ) ) {
554 # Conflict!
555 $conflict->{$gst} = 1;
556 $still_contig->{$h} = '';
557 } else {
558 $still_contig->{$h} = $gst;
559 }
560 }
561 } else {
562 # $h is somewhere in the middle. See if everything
563 # else can still be reached from the root.
564 my %still_reachable = ( $root => 1 );
565 map { $still_reachable{$_} = 1 }
566 $gpart->all_successors( $root );
567 foreach my $v ( keys %$reachable ) {
568 next if $v eq $h;
569 if( !$still_reachable{$v}
570 && ( $contig->{$v} eq $gst
571 || ( exists $still_contig->{$v}
572 && $still_contig->{$v} eq $gst ) ) ) {
573 # We need $h.
574 if( exists $still_contig->{$h} ) {
575 # Conflict!
576 $conflict->{$gst} = 1;
577 $still_contig->{$h} = '';
578 } else {
579 $still_contig->{$h} = $gst;
580 }
581 last;
582 } # else we don't need $h in this group.
583 } # end foreach $v
584 } # endif $h eq $root
585 } # end foreach $gst
586 } # end foreach $h
587
588 # Now we have some hypothetical vertices in $still_contig that are the
589 # "real" group memberships. Replace these in $contig.
590 foreach my $v ( keys %$contig ) {
591 next unless ref $contig->{$v};
592 $contig->{$v} = $still_contig->{$v};
593 }
594 } # end if @resolve
595
596 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
597 my $variant_row = [ [], $is_genealogical ];
598 # Fill in the groupings from $contig.
599 foreach my $g ( @$groups ) {
600 my $gst = wit_stringify( $g );
601 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
602 push( @{$variant_row->[0]}, \@realgroup );
603 }
604 return $variant_row;
605}
fae07016 606
7f52eac8 607sub _prune_subtree {
231d71fc 608 my( $tree, $root, $contighash ) = @_;
609 # First, delete hypothetical leaves / orphans until there are none left.
610 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
611 $tree->successorless_vertices;
612 while( @orphan_hypotheticals ) {
613 $tree->delete_vertices( @orphan_hypotheticals );
614 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
615 $tree->successorless_vertices;
616 }
617 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 618 # root to the first child that has no other predecessors.
231d71fc 619 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
620 my @nextroot = $tree->successors( $root );
621 $tree->delete_vertex( $root );
bebec0e9 622 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 623 }
624 # The tree has been modified in place, but we need to know the new root.
bebec0e9 625 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 626 return $root;
627}
d71100ed 628# Add the variant, subject to a.c. representation logic.
629# This assumes that we will see the 'main' version before the a.c. version.
630sub add_variant_wit {
631 my( $arr, $wit, $acstr ) = @_;
632 my $skip;
633 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
634 my $real = $1;
635 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
636 }
637 push( @$arr, $wit ) unless $skip;
638}
639
5be0cdeb 640sub _useful_variant {
641 my( $group_readings, $graph, $acstr ) = @_;
642
643 # TODO Decide what to do with AC witnesses
644
645 # Sort by group size and return
646 my $is_useful = 0;
647 my( @readings, @groups ); # The sorted groups for our answer.
648 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
649 keys %$group_readings ) {
650 push( @readings, $rdg );
651 push( @groups, $group_readings->{$rdg} );
652 if( @{$group_readings->{$rdg}} > 1 ) {
653 $is_useful++;
654 } else {
655 my( $wit ) = @{$group_readings->{$rdg}};
656 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
657 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
658 }
659 }
660 if( $is_useful > 1 ) {
661 return( \@readings, \@groups );
662 } else {
663 return( [], [] );
664 }
665}
666
7f52eac8 667=head2 wit_stringify( $groups )
668
669Takes an array of witness groupings and produces a string like
670['A','B'] / ['C','D','E'] / ['F']
d71100ed 671
7f52eac8 672=cut
d71100ed 673
674sub wit_stringify {
675 my $groups = shift;
676 my @gst;
677 # If we were passed an array of witnesses instead of an array of
678 # groupings, then "group" the witnesses first.
679 unless( ref( $groups->[0] ) ) {
680 my $mkgrp = [ $groups ];
681 $groups = $mkgrp;
682 }
683 foreach my $g ( @$groups ) {
684 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
685 }
686 return join( ' / ', @gst );
687}
7f52eac8 688
5be0cdeb 689# Helper function to ensure that X and X a.c. never appear in the same list.
690sub _add_to_witlist {
691 my( $wit, $list, $acstr ) = @_;
692 my %inlist;
693 my $idx = 0;
694 map { $inlist{$_} = $idx++ } @$list;
695 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
696 my $acwit = $1;
697 unless( exists $inlist{$acwit} ) {
698 push( @$list, $acwit.$acstr );
699 }
700 } else {
701 if( exists( $inlist{$wit.$acstr} ) ) {
702 # Replace the a.c. version with the main witness
703 my $i = $inlist{$wit.$acstr};
704 $list->[$i] = $wit;
705 } else {
706 push( @$list, $wit );
707 }
708 }
709}
710
bebec0e9 711sub _symmdiff {
712 my( $lista, $listb ) = @_;
7f52eac8 713 my %union;
714 my %scalars;
715 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
716 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 717 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 718 return map { $scalars{$_} } @set;
719}
720
7211;
722
723=head1 LICENSE
724
725This package is free software and is provided "as is" without express
726or implied warranty. You can redistribute it and/or modify it under
727the same terms as Perl itself.
728
729=head1 AUTHOR
730
731Tara L Andrews E<lt>aurum@cpan.orgE<gt>