use the straighten hack whenever explicitly specified, subgraph or no
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
59=back
7f52eac8 60
61=begin testing
62
63use Text::Tradition;
64use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
65
66my $datafile = 't/data/florilegium_tei_ps.xml';
67my $tradition = Text::Tradition->new( 'input' => 'TEI',
68 'name' => 'test0',
69 'file' => $datafile );
70my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
71is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
72
f00cefe8 73my %expected_genealogical = (
a44aaf2a 74 1 => 0,
f00cefe8 75 2 => 1,
a44aaf2a 76 3 => 0,
77 5 => 0,
78 7 => 0,
79 8 => 0,
80 10 => 0,
f00cefe8 81 13 => 1,
a44aaf2a 82 33 => 0,
83 34 => 0,
84 37 => 0,
85 60 => 0,
f00cefe8 86 81 => 1,
a44aaf2a 87 84 => 0,
88 87 => 0,
89 101 => 0,
90 102 => 0,
f00cefe8 91 122 => 1,
a44aaf2a 92 157 => 0,
f00cefe8 93 166 => 1,
94 169 => 1,
a44aaf2a 95 200 => 0,
f00cefe8 96 216 => 1,
97 217 => 1,
98 219 => 1,
99 241 => 1,
100 242 => 1,
101 243 => 1,
102);
103
7f52eac8 104my $data = run_analysis( $tradition );
f00cefe8 105foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 106 # Account for rows that used to be "not useful"
107 unless( exists $expected_genealogical{$row->{'id'}} ) {
108 $expected_genealogical{$row->{'id'}} = 1;
109 }
adc08836 110 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
111 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 112 "Got correct genealogical flag for row " . $row->{'id'} );
113}
a44aaf2a 114is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 115# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 116
117=end testing
118
119=cut
120
d71100ed 121sub run_analysis {
88a6bac5 122 my( $tradition, %opts ) = @_;
f00cefe8 123 my $c = $tradition->collation;
88a6bac5 124
125 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 126 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
127 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 128
129 # Get the stemma
130 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 131
88a6bac5 132 # Figure out which witnesses we are working with
133 my @lacunose = $stemma->hypotheticals;
fae07016 134 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
b4cb2d60 135 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
fae07016 136 $tradition->witnesses;
137 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 138
139 # Find and mark 'common' ranks for exclusion, unless they were
140 # explicitly specified.
141 unless( @ranks ) {
142 my %common_rank;
a44aaf2a 143 foreach my $rdg ( $c->common_readings ) {
88a6bac5 144 $common_rank{$rdg->rank} = 1;
145 }
146 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 147 }
7f52eac8 148
88a6bac5 149 # Group the variants to send to the solver
150 my @groups;
a44aaf2a 151 my %lacunae;
88a6bac5 152 foreach my $rank ( @ranks ) {
a44aaf2a 153 my $missing = [ @lacunose ];
154 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
155 $lacunae{$rank} = $missing;
d71100ed 156 }
b4cb2d60 157 $DB::single = 1;
88a6bac5 158 # Parse the answer
e59b8faa 159 my $answer = solve_variants( $stemma, @groups );
fae07016 160
88a6bac5 161 # Do further analysis on the answer
a44aaf2a 162 my $conflict_count = 0;
88a6bac5 163 foreach my $idx ( 0 .. $#ranks ) {
164 my $location = $answer->{'variants'}->[$idx];
165 # Add the rank back in
166 $location->{'id'} = $ranks[$idx];
a44aaf2a 167 # Add the lacunae back in
168 $location->{'missing'} = $lacunae{$ranks[$idx]};
88a6bac5 169 # Run the extra analysis we need.
88a6bac5 170 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 171 # Add the reading text back in
172 foreach my $rdghash ( @{$location->{'readings'}} ) {
173 $conflict_count++
174 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
175 my $rdg = $c->reading( $rdghash->{'readingid'} );
176 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
177 }
88a6bac5 178 }
a44aaf2a 179 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 180
88a6bac5 181 return $answer;
d71100ed 182}
183
7f52eac8 184=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
185
186Groups the variants at the given $rank of the collation, treating any
187relationships in @merge_relationship_types as equivalent. $lacunose should
188be a reference to an array, to which the sigla of lacunose witnesses at this
189rank will be appended.
190
191Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
192by the witnesses listed in $groups->[$n].
193
194=cut
195
196# Return group_readings, groups, lacunose
d1348d38 197sub group_variants {
7f52eac8 198 my( $tradition, $rank, $lacunose, $collapse ) = @_;
199 my $c = $tradition->collation;
b4cb2d60 200 my $aclabel = $c->ac_label;
7f52eac8 201 # Get the alignment table readings
202 my %readings_at_rank;
203 my @gap_wits;
1d73ecad 204 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 205 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 206 my $wit = $tablewit->{'witness'};
7f52eac8 207 if( $rdg && $rdg->{'t'}->is_lacuna ) {
b4cb2d60 208 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 209 } elsif( $rdg ) {
210 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
211 } else {
b4cb2d60 212 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 213 }
214 }
d1348d38 215
7f52eac8 216 # Group the readings, collapsing groups by relationship if needed
217 my %grouped_readings;
b4cb2d60 218 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
219 values %readings_at_rank ) {
7f52eac8 220 # Skip readings that have been collapsed into others.
f00cefe8 221 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 222 my @wits = $rdg->witnesses;
223 if( $collapse ) {
224 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
225 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 226 my @otherwits = $other->witnesses;
fae07016 227 push( @wits, @otherwits );
f00cefe8 228 $grouped_readings{$other->id} = 0;
d1348d38 229 }
230 }
f00cefe8 231 $grouped_readings{$rdg->id} = \@wits;
7f52eac8 232 }
233 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
234 # Get rid of our collapsed readings
235 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
236 keys %grouped_readings
237 if $collapse;
238
5be0cdeb 239 return \%grouped_readings;
d1348d38 240}
241
88a6bac5 242=head2 solve_variants( $graph, @groups )
243
244Sends the set of groups to the external graph solver service and returns
245a cleaned-up answer, adding the rank IDs back where they belong.
246
247The JSON has the form
248 { "graph": [ stemmagraph DOT string without newlines ],
249 "groupings": [ array of arrays of groups, one per rank ] }
250
251The answer has the form
252 { "variants" => [ array of variant location structures ],
253 "variant_count" => total,
254 "conflict_count" => number of conflicts detected,
255 "genealogical_count" => number of solutions found }
256
257=cut
258
259sub solve_variants {
e59b8faa 260 my( $stemma, @groups ) = @_;
88a6bac5 261
262 # Make the json with stemma + groups
b4cb2d60 263 my $groupings = [];
88a6bac5 264 foreach my $ghash ( @groups ) {
265 my @grouping;
266 foreach my $k ( sort keys %$ghash ) {
267 push( @grouping, $ghash->{$k} );
268 }
b4cb2d60 269 push( @$groupings, \@grouping );
88a6bac5 270 }
b4cb2d60 271 ## Witness map is a HACK to get around limitations in node names from IDP
272 my $witness_map = {};
273 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
88a6bac5 274
275 # Send it off and get the result
276 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
277 my $ua = LWP::UserAgent->new();
278 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
279 'Content' => $json );
280
281 my $answer;
282 if( $resp->is_success ) {
b4cb2d60 283 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
88a6bac5 284 } else {
fae07016 285 # Fall back to the old method.
286 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
287 . "; falling back to perl method";
b4cb2d60 288 $answer = perl_solver( $stemma, @$groupings );
88a6bac5 289 }
290
291 # Fold the result back into what we know about the groups.
292 my $variants = [];
293 my $genealogical = 0;
294 foreach my $idx ( 0 .. $#groups ) {
295 my( $calc_groups, $result ) = @{$answer->[$idx]};
296 $genealogical++ if $result;
297 my $input_group = $groups[$idx];
298 foreach my $k ( sort keys %$input_group ) {
299 my $cg = shift @$calc_groups;
300 $input_group->{$k} = $cg;
301 }
302 my $vstruct = {
303 'genealogical' => $result,
304 'readings' => [],
fae07016 305 };
88a6bac5 306 foreach my $k ( keys %$input_group ) {
307 push( @{$vstruct->{'readings'}},
fae07016 308 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 309 }
310 push( @$variants, $vstruct );
311 }
312
313 return { 'variants' => $variants,
314 'variant_count' => scalar @$variants,
315 'genealogical_count' => $genealogical };
316}
317
b4cb2d60 318#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
319
320sub _safe_wit_strings {
321 my( $stemma, $groupings, $witness_map ) = @_;
322 my $safegraph = Graph->new();
323 # Convert the graph to a safe representation and store the conversion.
324 foreach my $n ( $stemma->graph->vertices ) {
325 my $sn = _safe_witstr( $n );
326 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
327 if exists $witness_map->{$sn};
328 $witness_map->{$sn} = $n;
329 $safegraph->add_vertex( $sn );
330 $safegraph->set_vertex_attributes( $sn,
331 $stemma->graph->get_vertex_attributes( $n ) );
332 }
333 foreach my $e ( $stemma->graph->edges ) {
334 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
335 $safegraph->add_edge( @safe_e );
336 }
337 my $safe_stemma = Text::Tradition::Stemma->new(
338 'collation' => $stemma->collation, 'graph' => $safegraph );
339
340 # Now convert the witness groupings to a safe representation.
341 my $safe_groupings = [];
342 foreach my $grouping ( @$groupings ) {
343 my $safe_grouping = [];
344 foreach my $group ( @$grouping ) {
345 my $safe_group = [];
346 foreach my $n ( @$group ) {
347 my $sn = _safe_witstr( $n );
348 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
349 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
350 $witness_map->{$sn} = $n;
351 push( @$safe_group, $sn );
352 }
353 push( @$safe_grouping, $safe_group );
354 }
355 push( @$safe_groupings, $safe_grouping );
356 }
357
358 # Return it all in the struct we expect. We have stored the reductions
359 # in the $witness_map that we were passed.
360 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
361}
362
363sub _safe_witstr {
364 my $witstr = shift;
365 $witstr =~ s/\s+/_/g;
366 $witstr =~ s/[^\w\d-]//g;
367 return $witstr;
368}
369
370sub _desanitize_names {
371 my( $jsonstruct, $witness_map ) = @_;
372 my $result = [];
373 foreach my $grouping ( @$jsonstruct ) {
374 my $real_grouping = [];
375 foreach my $element ( @$grouping ) {
376 if( ref( $element ) eq 'ARRAY' ) {
377 # it's the groupset.
378 my $real_groupset = [];
379 foreach my $group ( @$element ) {
380 my $real_group = [];
381 foreach my $n ( @$group ) {
382 my $rn = $witness_map->{$n};
383 push( @$real_group, $rn );
384 }
385 push( @$real_groupset, $real_group );
386 }
387 push( @$real_grouping, $real_groupset );
388 } else {
389 # It is the boolean, not actually a group.
390 push( @$real_grouping, $element );
391 }
392 }
393 push( @$result, $real_grouping );
394 }
395 return $result;
396}
397
398### END HACKERY ###
399
fae07016 400=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 401
fae07016 402Given the tradition, its stemma graph, and the solution from the graph solver,
403work out the rest of the information we want. For each reading we need missing,
404conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 405
406=cut
732152b1 407
fae07016 408sub analyze_location {
409 my ( $tradition, $graph, $variant_row ) = @_;
410
411 # Make a hash of all known node memberships, and make the subgraphs.
412 my $contig = {};
413 my $reading_roots = {};
414 my $subgraph = {};
415 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
416 my $rid = $rdghash->{'readingid'};
417 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 418
fae07016 419 # Make the subgraph.
420 my $part = $graph->copy;
421 my %these_vertices;
422 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
423 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
424 $subgraph->{$rid} = $part;
425 # Get the reading roots.
426 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 427 }
428
fae07016 429 # Now that we have all the node group memberships, calculate followed/
bebec0e9 430 # non-followed/unknown values for each reading. Also figure out the
431 # reading's evident parent(s).
432 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 433 # Group string key - TODO do we need this?
434 my $gst = wit_stringify( $rdghash->{'group'} );
435 my $rid = $rdghash->{'readingid'};
436 # Get the subgraph
437 my $part = $subgraph->{$rid};
438
439 # Start figuring things out.
bebec0e9 440 my @roots = $part->predecessorless_vertices;
441 $rdghash->{'independent_occurrence'} = scalar @roots;
442 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
443 # Find the parent readings, if any, of this reading.
f00cefe8 444 my %rdgparents;
bebec0e9 445 foreach my $wit ( @roots ) {
f00cefe8 446 # Look in the main stemma to find this witness's extant or known-reading
447 # immediate ancestor(s), and look up the reading that each ancestor olds.
448 my @check = $graph->predecessors( $wit );
449 while( @check ) {
450 my @next;
451 foreach my $wparent( @check ) {
fae07016 452 my $preading = $contig->{$wparent};
453 if( $preading ) {
454 $rdgparents{$preading} = 1;
f00cefe8 455 } else {
456 push( @next, $graph->predecessors( $wparent ) );
457 }
458 }
459 @check = @next;
460 }
bebec0e9 461 }
f00cefe8 462 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 463
464 # Find the number of times this reading was altered, and the number of
465 # times we're not sure.
466 my( %nofollow, %unknownfollow );
467 foreach my $wit ( $part->vertices ) {
468 foreach my $wchild ( $graph->successors( $wit ) ) {
469 next if $part->has_vertex( $wchild );
fae07016 470 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 471 # It definitely changed here.
472 $nofollow{$wchild} = 1;
473 } elsif( !($contig->{$wchild}) ) {
474 # The child is a hypothetical node not definitely in
475 # any group. Answer is unknown.
476 $unknownfollow{$wchild} = 1;
477 } # else it's a non-root node in a known group, and therefore
478 # is presumed to have its reading from its group, not this link.
479 }
480 }
481 $rdghash->{'not_followed'} = keys %nofollow;
482 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 483
484 # Now say whether this reading represents a conflict.
485 unless( $variant_row->{'genealogical'} ) {
486 $rdghash->{'conflict'} = @roots != 1;
487 }
c4a4fb1b 488 }
d71100ed 489}
490
fae07016 491
492=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
493
494** NOTE ** This method should hopefully not be called - it is not guaranteed
495to be correct. Serves as a backup for the real solver.
496
497Runs an analysis of the given tradition, at the location given in $rank,
498against the graph of the stemma specified in $stemma_id. The argument
499@merge_relationship_types is an optional list of relationship types for
500which readings so related should be treated as equivalent.
501
502Returns a nested array data structure as follows:
503
504 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
505
506where the group list is the array of arrays passed in for each element of @groups,
507possibly with the addition of hypothetical readings.
508
509
510=cut
511
512sub perl_solver {
e59b8faa 513 my( $stemma, @groups ) = @_;
514 my $graph = $stemma->graph;
515 my @answer;
516 foreach my $g ( @groups ) {
517 push( @answer, _solve_variant_location( $graph, $g ) );
518 }
519 return \@answer;
fae07016 520}
521
e59b8faa 522sub _solve_variant_location {
523 my( $graph, $groups ) = @_;
fae07016 524 # Now do the work.
e59b8faa 525 my $contig = {};
526 my $subgraph = {};
527 my $is_conflicted;
528 my $conflict = {};
529
530 # Mark each ms as in its own group, first.
531 foreach my $g ( @$groups ) {
532 my $gst = wit_stringify( $g );
533 map { $contig->{$_} = $gst } @$g;
534 }
535
536 # Now for each unmarked node in the graph, initialize an array
537 # for possible group memberships. We will use this later to
538 # resolve potential conflicts.
539 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
540 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
541 my $gst = wit_stringify( $g ); # This is the group name
542 # Copy the graph, and delete all non-members from the new graph.
543 my $part = $graph->copy;
544 my @group_roots;
545 $part->delete_vertices(
546 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
547
548 # Now look to see if our group is connected.
549 if( @$g > 1 ) {
550 # We have to take directionality into account.
551 # How many root nodes do we have?
552 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
553 $part->predecessorless_vertices;
554 # Assuming that @$g > 1, find the first root node that has at
555 # least one successor belonging to our group. If this reading
556 # is genealogical, there should be only one, but we will check
557 # that implicitly later.
558 foreach my $root ( @roots ) {
559 # Prune the tree to get rid of extraneous hypotheticals.
560 $root = _prune_subtree( $part, $root, $contig );
561 next unless $root;
562 # Save this root for our group.
563 push( @group_roots, $root );
564 # Get all the successor nodes of our root.
565 }
566 } else {
567 # Dispense with the trivial case of one reading.
568 my $wit = $g->[0];
569 @group_roots = ( $wit );
570 foreach my $v ( $part->vertices ) {
571 $part->delete_vertex( $v ) unless $v eq $wit;
572 }
573 }
574
575 if( @group_roots > 1 ) {
576 $conflict->{$gst} = 1;
577 $is_conflicted = 1;
578 }
579 # Paint the 'hypotheticals' with our group.
580 foreach my $wit ( $part->vertices ) {
581 if( ref( $contig->{$wit} ) ) {
582 push( @{$contig->{$wit}}, $gst );
583 } elsif( $contig->{$wit} ne $gst ) {
584 warn "How did we get here?";
585 }
586 }
587
588
589 # Save the relevant subgraph.
590 $subgraph->{$gst} = $part;
591 }
592
593 # For each of our hypothetical readings, flatten its 'contig' array if
594 # the array contains zero or one group. If we have any unflattened arrays,
595 # we may need to run the resolution process. If the reading is already known
596 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
597 # it.
598 my @resolve;
599 foreach my $wit ( keys %$contig ) {
600 next unless ref( $contig->{$wit} );
601 if( @{$contig->{$wit}} > 1 ) {
602 if( $is_conflicted ) {
603 $contig->{$wit} = ''; # We aren't going to decide.
604 } else {
605 push( @resolve, $wit );
606 }
607 } else {
608 my $gst = pop @{$contig->{$wit}};
609 $contig->{$wit} = $gst || '';
610 }
611 }
612
613 if( @resolve ) {
614 my $still_contig = {};
615 foreach my $h ( @resolve ) {
616 # For each of the hypothetical readings with more than one possibility,
617 # try deleting it from each of its member subgraphs in turn, and see
618 # if that breaks the contiguous grouping.
619 # TODO This can still break in a corner case where group A can use
620 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
621 # Revisit this if necessary; it could get brute-force nasty.
622 foreach my $gst ( @{$contig->{$h}} ) {
623 my $gpart = $subgraph->{$gst}->copy();
624 # If we have come this far, there is only one root and everything
625 # is reachable from it.
626 my( $root ) = $gpart->predecessorless_vertices;
627 my $reachable = {};
628 map { $reachable->{$_} = 1 } $gpart->vertices;
629
630 # Try deleting the hypothetical node.
631 $gpart->delete_vertex( $h );
632 if( $h eq $root ) {
633 # See if we still have a single root.
634 my @roots = $gpart->predecessorless_vertices;
635 warn "This shouldn't have happened" unless @roots;
636 if( @roots > 1 ) {
637 # $h is needed by this group.
638 if( exists( $still_contig->{$h} ) ) {
639 # Conflict!
640 $conflict->{$gst} = 1;
641 $still_contig->{$h} = '';
642 } else {
643 $still_contig->{$h} = $gst;
644 }
645 }
646 } else {
647 # $h is somewhere in the middle. See if everything
648 # else can still be reached from the root.
649 my %still_reachable = ( $root => 1 );
650 map { $still_reachable{$_} = 1 }
651 $gpart->all_successors( $root );
652 foreach my $v ( keys %$reachable ) {
653 next if $v eq $h;
654 if( !$still_reachable{$v}
655 && ( $contig->{$v} eq $gst
656 || ( exists $still_contig->{$v}
657 && $still_contig->{$v} eq $gst ) ) ) {
658 # We need $h.
659 if( exists $still_contig->{$h} ) {
660 # Conflict!
661 $conflict->{$gst} = 1;
662 $still_contig->{$h} = '';
663 } else {
664 $still_contig->{$h} = $gst;
665 }
666 last;
667 } # else we don't need $h in this group.
668 } # end foreach $v
669 } # endif $h eq $root
670 } # end foreach $gst
671 } # end foreach $h
672
673 # Now we have some hypothetical vertices in $still_contig that are the
674 # "real" group memberships. Replace these in $contig.
675 foreach my $v ( keys %$contig ) {
676 next unless ref $contig->{$v};
677 $contig->{$v} = $still_contig->{$v};
678 }
679 } # end if @resolve
680
681 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
682 my $variant_row = [ [], $is_genealogical ];
683 # Fill in the groupings from $contig.
684 foreach my $g ( @$groups ) {
685 my $gst = wit_stringify( $g );
686 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
687 push( @{$variant_row->[0]}, \@realgroup );
688 }
689 return $variant_row;
690}
fae07016 691
7f52eac8 692sub _prune_subtree {
231d71fc 693 my( $tree, $root, $contighash ) = @_;
694 # First, delete hypothetical leaves / orphans until there are none left.
695 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
696 $tree->successorless_vertices;
697 while( @orphan_hypotheticals ) {
698 $tree->delete_vertices( @orphan_hypotheticals );
699 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
700 $tree->successorless_vertices;
701 }
702 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 703 # root to the first child that has no other predecessors.
231d71fc 704 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
705 my @nextroot = $tree->successors( $root );
706 $tree->delete_vertex( $root );
bebec0e9 707 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 708 }
709 # The tree has been modified in place, but we need to know the new root.
bebec0e9 710 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 711 return $root;
712}
d71100ed 713# Add the variant, subject to a.c. representation logic.
714# This assumes that we will see the 'main' version before the a.c. version.
715sub add_variant_wit {
716 my( $arr, $wit, $acstr ) = @_;
717 my $skip;
718 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
719 my $real = $1;
720 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
721 }
722 push( @$arr, $wit ) unless $skip;
723}
724
5be0cdeb 725sub _useful_variant {
726 my( $group_readings, $graph, $acstr ) = @_;
727
728 # TODO Decide what to do with AC witnesses
729
730 # Sort by group size and return
731 my $is_useful = 0;
732 my( @readings, @groups ); # The sorted groups for our answer.
733 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
734 keys %$group_readings ) {
735 push( @readings, $rdg );
736 push( @groups, $group_readings->{$rdg} );
737 if( @{$group_readings->{$rdg}} > 1 ) {
738 $is_useful++;
739 } else {
740 my( $wit ) = @{$group_readings->{$rdg}};
741 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
742 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
743 }
744 }
745 if( $is_useful > 1 ) {
746 return( \@readings, \@groups );
747 } else {
748 return( [], [] );
749 }
750}
751
7f52eac8 752=head2 wit_stringify( $groups )
753
754Takes an array of witness groupings and produces a string like
755['A','B'] / ['C','D','E'] / ['F']
d71100ed 756
7f52eac8 757=cut
d71100ed 758
759sub wit_stringify {
760 my $groups = shift;
761 my @gst;
762 # If we were passed an array of witnesses instead of an array of
763 # groupings, then "group" the witnesses first.
764 unless( ref( $groups->[0] ) ) {
765 my $mkgrp = [ $groups ];
766 $groups = $mkgrp;
767 }
768 foreach my $g ( @$groups ) {
769 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
770 }
771 return join( ' / ', @gst );
772}
7f52eac8 773
5be0cdeb 774# Helper function to ensure that X and X a.c. never appear in the same list.
775sub _add_to_witlist {
776 my( $wit, $list, $acstr ) = @_;
777 my %inlist;
778 my $idx = 0;
779 map { $inlist{$_} = $idx++ } @$list;
780 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
781 my $acwit = $1;
782 unless( exists $inlist{$acwit} ) {
783 push( @$list, $acwit.$acstr );
784 }
785 } else {
786 if( exists( $inlist{$wit.$acstr} ) ) {
787 # Replace the a.c. version with the main witness
788 my $i = $inlist{$wit.$acstr};
789 $list->[$i] = $wit;
790 } else {
791 push( @$list, $wit );
792 }
793 }
794}
795
bebec0e9 796sub _symmdiff {
797 my( $lista, $listb ) = @_;
7f52eac8 798 my %union;
799 my %scalars;
800 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
801 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 802 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 803 return map { $scalars{$_} } @set;
804}
805
8061;
807
808=head1 LICENSE
809
810This package is free software and is provided "as is" without express
811or implied warranty. You can redistribute it and/or modify it under
812the same terms as Perl itself.
813
814=head1 AUTHOR
815
816Tara L Andrews E<lt>aurum@cpan.orgE<gt>