more robust hack for IDP node name problem
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
59=back
7f52eac8 60
61=begin testing
62
63use Text::Tradition;
64use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
65
66my $datafile = 't/data/florilegium_tei_ps.xml';
67my $tradition = Text::Tradition->new( 'input' => 'TEI',
68 'name' => 'test0',
69 'file' => $datafile );
70my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
71is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
72
f00cefe8 73my %expected_genealogical = (
a44aaf2a 74 1 => 0,
f00cefe8 75 2 => 1,
a44aaf2a 76 3 => 0,
77 5 => 0,
78 7 => 0,
79 8 => 0,
80 10 => 0,
f00cefe8 81 13 => 1,
a44aaf2a 82 33 => 0,
83 34 => 0,
84 37 => 0,
85 60 => 0,
f00cefe8 86 81 => 1,
a44aaf2a 87 84 => 0,
88 87 => 0,
89 101 => 0,
90 102 => 0,
f00cefe8 91 122 => 1,
a44aaf2a 92 157 => 0,
f00cefe8 93 166 => 1,
94 169 => 1,
a44aaf2a 95 200 => 0,
f00cefe8 96 216 => 1,
97 217 => 1,
98 219 => 1,
99 241 => 1,
100 242 => 1,
101 243 => 1,
102);
103
7f52eac8 104my $data = run_analysis( $tradition );
f00cefe8 105foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 106 # Account for rows that used to be "not useful"
107 unless( exists $expected_genealogical{$row->{'id'}} ) {
108 $expected_genealogical{$row->{'id'}} = 1;
109 }
f00cefe8 110 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
111 "Got correct genealogical flag for row " . $row->{'id'} );
112}
a44aaf2a 113is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 114# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 115
116=end testing
117
118=cut
119
d71100ed 120sub run_analysis {
88a6bac5 121 my( $tradition, %opts ) = @_;
f00cefe8 122 my $c = $tradition->collation;
88a6bac5 123
124 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 125 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
126 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 127
128 # Get the stemma
129 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 130
88a6bac5 131 # Figure out which witnesses we are working with
132 my @lacunose = $stemma->hypotheticals;
fae07016 133 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
b4cb2d60 134 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
fae07016 135 $tradition->witnesses;
136 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 137
138 # Find and mark 'common' ranks for exclusion, unless they were
139 # explicitly specified.
140 unless( @ranks ) {
141 my %common_rank;
a44aaf2a 142 foreach my $rdg ( $c->common_readings ) {
88a6bac5 143 $common_rank{$rdg->rank} = 1;
144 }
145 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 146 }
7f52eac8 147
88a6bac5 148 # Group the variants to send to the solver
149 my @groups;
a44aaf2a 150 my %lacunae;
88a6bac5 151 foreach my $rank ( @ranks ) {
a44aaf2a 152 my $missing = [ @lacunose ];
153 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
154 $lacunae{$rank} = $missing;
d71100ed 155 }
b4cb2d60 156 $DB::single = 1;
88a6bac5 157 # Parse the answer
e59b8faa 158 my $answer = solve_variants( $stemma, @groups );
fae07016 159
88a6bac5 160 # Do further analysis on the answer
a44aaf2a 161 my $conflict_count = 0;
88a6bac5 162 foreach my $idx ( 0 .. $#ranks ) {
163 my $location = $answer->{'variants'}->[$idx];
164 # Add the rank back in
165 $location->{'id'} = $ranks[$idx];
a44aaf2a 166 # Add the lacunae back in
167 $location->{'missing'} = $lacunae{$ranks[$idx]};
88a6bac5 168 # Run the extra analysis we need.
88a6bac5 169 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 170 # Add the reading text back in
171 foreach my $rdghash ( @{$location->{'readings'}} ) {
172 $conflict_count++
173 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
174 my $rdg = $c->reading( $rdghash->{'readingid'} );
175 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
176 }
88a6bac5 177 }
a44aaf2a 178 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 179
88a6bac5 180 return $answer;
d71100ed 181}
182
7f52eac8 183=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
184
185Groups the variants at the given $rank of the collation, treating any
186relationships in @merge_relationship_types as equivalent. $lacunose should
187be a reference to an array, to which the sigla of lacunose witnesses at this
188rank will be appended.
189
190Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
191by the witnesses listed in $groups->[$n].
192
193=cut
194
195# Return group_readings, groups, lacunose
d1348d38 196sub group_variants {
7f52eac8 197 my( $tradition, $rank, $lacunose, $collapse ) = @_;
198 my $c = $tradition->collation;
b4cb2d60 199 my $aclabel = $c->ac_label;
7f52eac8 200 # Get the alignment table readings
201 my %readings_at_rank;
202 my @gap_wits;
1d73ecad 203 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 204 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 205 my $wit = $tablewit->{'witness'};
7f52eac8 206 if( $rdg && $rdg->{'t'}->is_lacuna ) {
b4cb2d60 207 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 208 } elsif( $rdg ) {
209 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
210 } else {
b4cb2d60 211 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 212 }
213 }
d1348d38 214
7f52eac8 215 # Group the readings, collapsing groups by relationship if needed
216 my %grouped_readings;
b4cb2d60 217 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
218 values %readings_at_rank ) {
7f52eac8 219 # Skip readings that have been collapsed into others.
f00cefe8 220 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 221 my @wits = $rdg->witnesses;
222 if( $collapse ) {
223 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
224 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 225 my @otherwits = $other->witnesses;
fae07016 226 push( @wits, @otherwits );
f00cefe8 227 $grouped_readings{$other->id} = 0;
d1348d38 228 }
229 }
f00cefe8 230 $grouped_readings{$rdg->id} = \@wits;
7f52eac8 231 }
232 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
233 # Get rid of our collapsed readings
234 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
235 keys %grouped_readings
236 if $collapse;
237
5be0cdeb 238 return \%grouped_readings;
d1348d38 239}
240
88a6bac5 241=head2 solve_variants( $graph, @groups )
242
243Sends the set of groups to the external graph solver service and returns
244a cleaned-up answer, adding the rank IDs back where they belong.
245
246The JSON has the form
247 { "graph": [ stemmagraph DOT string without newlines ],
248 "groupings": [ array of arrays of groups, one per rank ] }
249
250The answer has the form
251 { "variants" => [ array of variant location structures ],
252 "variant_count" => total,
253 "conflict_count" => number of conflicts detected,
254 "genealogical_count" => number of solutions found }
255
256=cut
257
258sub solve_variants {
e59b8faa 259 my( $stemma, @groups ) = @_;
88a6bac5 260
261 # Make the json with stemma + groups
b4cb2d60 262 my $groupings = [];
88a6bac5 263 foreach my $ghash ( @groups ) {
264 my @grouping;
265 foreach my $k ( sort keys %$ghash ) {
266 push( @grouping, $ghash->{$k} );
267 }
b4cb2d60 268 push( @$groupings, \@grouping );
88a6bac5 269 }
b4cb2d60 270 ## Witness map is a HACK to get around limitations in node names from IDP
271 my $witness_map = {};
272 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
88a6bac5 273
274 # Send it off and get the result
275 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
276 my $ua = LWP::UserAgent->new();
277 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
278 'Content' => $json );
279
280 my $answer;
281 if( $resp->is_success ) {
b4cb2d60 282 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
88a6bac5 283 } else {
fae07016 284 # Fall back to the old method.
285 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
286 . "; falling back to perl method";
b4cb2d60 287 $answer = perl_solver( $stemma, @$groupings );
88a6bac5 288 }
289
290 # Fold the result back into what we know about the groups.
291 my $variants = [];
292 my $genealogical = 0;
293 foreach my $idx ( 0 .. $#groups ) {
294 my( $calc_groups, $result ) = @{$answer->[$idx]};
295 $genealogical++ if $result;
296 my $input_group = $groups[$idx];
297 foreach my $k ( sort keys %$input_group ) {
298 my $cg = shift @$calc_groups;
299 $input_group->{$k} = $cg;
300 }
301 my $vstruct = {
302 'genealogical' => $result,
303 'readings' => [],
fae07016 304 };
88a6bac5 305 foreach my $k ( keys %$input_group ) {
306 push( @{$vstruct->{'readings'}},
fae07016 307 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 308 }
309 push( @$variants, $vstruct );
310 }
311
312 return { 'variants' => $variants,
313 'variant_count' => scalar @$variants,
314 'genealogical_count' => $genealogical };
315}
316
b4cb2d60 317#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
318
319sub _safe_wit_strings {
320 my( $stemma, $groupings, $witness_map ) = @_;
321 my $safegraph = Graph->new();
322 # Convert the graph to a safe representation and store the conversion.
323 foreach my $n ( $stemma->graph->vertices ) {
324 my $sn = _safe_witstr( $n );
325 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
326 if exists $witness_map->{$sn};
327 $witness_map->{$sn} = $n;
328 $safegraph->add_vertex( $sn );
329 $safegraph->set_vertex_attributes( $sn,
330 $stemma->graph->get_vertex_attributes( $n ) );
331 }
332 foreach my $e ( $stemma->graph->edges ) {
333 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
334 $safegraph->add_edge( @safe_e );
335 }
336 my $safe_stemma = Text::Tradition::Stemma->new(
337 'collation' => $stemma->collation, 'graph' => $safegraph );
338
339 # Now convert the witness groupings to a safe representation.
340 my $safe_groupings = [];
341 foreach my $grouping ( @$groupings ) {
342 my $safe_grouping = [];
343 foreach my $group ( @$grouping ) {
344 my $safe_group = [];
345 foreach my $n ( @$group ) {
346 my $sn = _safe_witstr( $n );
347 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
348 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
349 $witness_map->{$sn} = $n;
350 push( @$safe_group, $sn );
351 }
352 push( @$safe_grouping, $safe_group );
353 }
354 push( @$safe_groupings, $safe_grouping );
355 }
356
357 # Return it all in the struct we expect. We have stored the reductions
358 # in the $witness_map that we were passed.
359 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
360}
361
362sub _safe_witstr {
363 my $witstr = shift;
364 $witstr =~ s/\s+/_/g;
365 $witstr =~ s/[^\w\d-]//g;
366 return $witstr;
367}
368
369sub _desanitize_names {
370 my( $jsonstruct, $witness_map ) = @_;
371 my $result = [];
372 foreach my $grouping ( @$jsonstruct ) {
373 my $real_grouping = [];
374 foreach my $element ( @$grouping ) {
375 if( ref( $element ) eq 'ARRAY' ) {
376 # it's the groupset.
377 my $real_groupset = [];
378 foreach my $group ( @$element ) {
379 my $real_group = [];
380 foreach my $n ( @$group ) {
381 my $rn = $witness_map->{$n};
382 push( @$real_group, $rn );
383 }
384 push( @$real_groupset, $real_group );
385 }
386 push( @$real_grouping, $real_groupset );
387 } else {
388 # It is the boolean, not actually a group.
389 push( @$real_grouping, $element );
390 }
391 }
392 push( @$result, $real_grouping );
393 }
394 return $result;
395}
396
397### END HACKERY ###
398
fae07016 399=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 400
fae07016 401Given the tradition, its stemma graph, and the solution from the graph solver,
402work out the rest of the information we want. For each reading we need missing,
403conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 404
405=cut
732152b1 406
fae07016 407sub analyze_location {
408 my ( $tradition, $graph, $variant_row ) = @_;
409
410 # Make a hash of all known node memberships, and make the subgraphs.
411 my $contig = {};
412 my $reading_roots = {};
413 my $subgraph = {};
414 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
415 my $rid = $rdghash->{'readingid'};
416 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 417
fae07016 418 # Make the subgraph.
419 my $part = $graph->copy;
420 my %these_vertices;
421 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
422 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
423 $subgraph->{$rid} = $part;
424 # Get the reading roots.
425 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 426 }
427
fae07016 428 # Now that we have all the node group memberships, calculate followed/
bebec0e9 429 # non-followed/unknown values for each reading. Also figure out the
430 # reading's evident parent(s).
431 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 432 # Group string key - TODO do we need this?
433 my $gst = wit_stringify( $rdghash->{'group'} );
434 my $rid = $rdghash->{'readingid'};
435 # Get the subgraph
436 my $part = $subgraph->{$rid};
437
438 # Start figuring things out.
bebec0e9 439 my @roots = $part->predecessorless_vertices;
440 $rdghash->{'independent_occurrence'} = scalar @roots;
441 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
442 # Find the parent readings, if any, of this reading.
f00cefe8 443 my %rdgparents;
bebec0e9 444 foreach my $wit ( @roots ) {
f00cefe8 445 # Look in the main stemma to find this witness's extant or known-reading
446 # immediate ancestor(s), and look up the reading that each ancestor olds.
447 my @check = $graph->predecessors( $wit );
448 while( @check ) {
449 my @next;
450 foreach my $wparent( @check ) {
fae07016 451 my $preading = $contig->{$wparent};
452 if( $preading ) {
453 $rdgparents{$preading} = 1;
f00cefe8 454 } else {
455 push( @next, $graph->predecessors( $wparent ) );
456 }
457 }
458 @check = @next;
459 }
bebec0e9 460 }
f00cefe8 461 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 462
463 # Find the number of times this reading was altered, and the number of
464 # times we're not sure.
465 my( %nofollow, %unknownfollow );
466 foreach my $wit ( $part->vertices ) {
467 foreach my $wchild ( $graph->successors( $wit ) ) {
468 next if $part->has_vertex( $wchild );
fae07016 469 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 470 # It definitely changed here.
471 $nofollow{$wchild} = 1;
472 } elsif( !($contig->{$wchild}) ) {
473 # The child is a hypothetical node not definitely in
474 # any group. Answer is unknown.
475 $unknownfollow{$wchild} = 1;
476 } # else it's a non-root node in a known group, and therefore
477 # is presumed to have its reading from its group, not this link.
478 }
479 }
480 $rdghash->{'not_followed'} = keys %nofollow;
481 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 482
483 # Now say whether this reading represents a conflict.
484 unless( $variant_row->{'genealogical'} ) {
485 $rdghash->{'conflict'} = @roots != 1;
486 }
c4a4fb1b 487 }
d71100ed 488}
489
fae07016 490
491=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
492
493** NOTE ** This method should hopefully not be called - it is not guaranteed
494to be correct. Serves as a backup for the real solver.
495
496Runs an analysis of the given tradition, at the location given in $rank,
497against the graph of the stemma specified in $stemma_id. The argument
498@merge_relationship_types is an optional list of relationship types for
499which readings so related should be treated as equivalent.
500
501Returns a nested array data structure as follows:
502
503 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
504
505where the group list is the array of arrays passed in for each element of @groups,
506possibly with the addition of hypothetical readings.
507
508
509=cut
510
511sub perl_solver {
e59b8faa 512 my( $stemma, @groups ) = @_;
513 my $graph = $stemma->graph;
514 my @answer;
515 foreach my $g ( @groups ) {
516 push( @answer, _solve_variant_location( $graph, $g ) );
517 }
518 return \@answer;
fae07016 519}
520
e59b8faa 521sub _solve_variant_location {
522 my( $graph, $groups ) = @_;
fae07016 523 # Now do the work.
e59b8faa 524 my $contig = {};
525 my $subgraph = {};
526 my $is_conflicted;
527 my $conflict = {};
528
529 # Mark each ms as in its own group, first.
530 foreach my $g ( @$groups ) {
531 my $gst = wit_stringify( $g );
532 map { $contig->{$_} = $gst } @$g;
533 }
534
535 # Now for each unmarked node in the graph, initialize an array
536 # for possible group memberships. We will use this later to
537 # resolve potential conflicts.
538 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
539 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
540 my $gst = wit_stringify( $g ); # This is the group name
541 # Copy the graph, and delete all non-members from the new graph.
542 my $part = $graph->copy;
543 my @group_roots;
544 $part->delete_vertices(
545 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
546
547 # Now look to see if our group is connected.
548 if( @$g > 1 ) {
549 # We have to take directionality into account.
550 # How many root nodes do we have?
551 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
552 $part->predecessorless_vertices;
553 # Assuming that @$g > 1, find the first root node that has at
554 # least one successor belonging to our group. If this reading
555 # is genealogical, there should be only one, but we will check
556 # that implicitly later.
557 foreach my $root ( @roots ) {
558 # Prune the tree to get rid of extraneous hypotheticals.
559 $root = _prune_subtree( $part, $root, $contig );
560 next unless $root;
561 # Save this root for our group.
562 push( @group_roots, $root );
563 # Get all the successor nodes of our root.
564 }
565 } else {
566 # Dispense with the trivial case of one reading.
567 my $wit = $g->[0];
568 @group_roots = ( $wit );
569 foreach my $v ( $part->vertices ) {
570 $part->delete_vertex( $v ) unless $v eq $wit;
571 }
572 }
573
574 if( @group_roots > 1 ) {
575 $conflict->{$gst} = 1;
576 $is_conflicted = 1;
577 }
578 # Paint the 'hypotheticals' with our group.
579 foreach my $wit ( $part->vertices ) {
580 if( ref( $contig->{$wit} ) ) {
581 push( @{$contig->{$wit}}, $gst );
582 } elsif( $contig->{$wit} ne $gst ) {
583 warn "How did we get here?";
584 }
585 }
586
587
588 # Save the relevant subgraph.
589 $subgraph->{$gst} = $part;
590 }
591
592 # For each of our hypothetical readings, flatten its 'contig' array if
593 # the array contains zero or one group. If we have any unflattened arrays,
594 # we may need to run the resolution process. If the reading is already known
595 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
596 # it.
597 my @resolve;
598 foreach my $wit ( keys %$contig ) {
599 next unless ref( $contig->{$wit} );
600 if( @{$contig->{$wit}} > 1 ) {
601 if( $is_conflicted ) {
602 $contig->{$wit} = ''; # We aren't going to decide.
603 } else {
604 push( @resolve, $wit );
605 }
606 } else {
607 my $gst = pop @{$contig->{$wit}};
608 $contig->{$wit} = $gst || '';
609 }
610 }
611
612 if( @resolve ) {
613 my $still_contig = {};
614 foreach my $h ( @resolve ) {
615 # For each of the hypothetical readings with more than one possibility,
616 # try deleting it from each of its member subgraphs in turn, and see
617 # if that breaks the contiguous grouping.
618 # TODO This can still break in a corner case where group A can use
619 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
620 # Revisit this if necessary; it could get brute-force nasty.
621 foreach my $gst ( @{$contig->{$h}} ) {
622 my $gpart = $subgraph->{$gst}->copy();
623 # If we have come this far, there is only one root and everything
624 # is reachable from it.
625 my( $root ) = $gpart->predecessorless_vertices;
626 my $reachable = {};
627 map { $reachable->{$_} = 1 } $gpart->vertices;
628
629 # Try deleting the hypothetical node.
630 $gpart->delete_vertex( $h );
631 if( $h eq $root ) {
632 # See if we still have a single root.
633 my @roots = $gpart->predecessorless_vertices;
634 warn "This shouldn't have happened" unless @roots;
635 if( @roots > 1 ) {
636 # $h is needed by this group.
637 if( exists( $still_contig->{$h} ) ) {
638 # Conflict!
639 $conflict->{$gst} = 1;
640 $still_contig->{$h} = '';
641 } else {
642 $still_contig->{$h} = $gst;
643 }
644 }
645 } else {
646 # $h is somewhere in the middle. See if everything
647 # else can still be reached from the root.
648 my %still_reachable = ( $root => 1 );
649 map { $still_reachable{$_} = 1 }
650 $gpart->all_successors( $root );
651 foreach my $v ( keys %$reachable ) {
652 next if $v eq $h;
653 if( !$still_reachable{$v}
654 && ( $contig->{$v} eq $gst
655 || ( exists $still_contig->{$v}
656 && $still_contig->{$v} eq $gst ) ) ) {
657 # We need $h.
658 if( exists $still_contig->{$h} ) {
659 # Conflict!
660 $conflict->{$gst} = 1;
661 $still_contig->{$h} = '';
662 } else {
663 $still_contig->{$h} = $gst;
664 }
665 last;
666 } # else we don't need $h in this group.
667 } # end foreach $v
668 } # endif $h eq $root
669 } # end foreach $gst
670 } # end foreach $h
671
672 # Now we have some hypothetical vertices in $still_contig that are the
673 # "real" group memberships. Replace these in $contig.
674 foreach my $v ( keys %$contig ) {
675 next unless ref $contig->{$v};
676 $contig->{$v} = $still_contig->{$v};
677 }
678 } # end if @resolve
679
680 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
681 my $variant_row = [ [], $is_genealogical ];
682 # Fill in the groupings from $contig.
683 foreach my $g ( @$groups ) {
684 my $gst = wit_stringify( $g );
685 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
686 push( @{$variant_row->[0]}, \@realgroup );
687 }
688 return $variant_row;
689}
fae07016 690
7f52eac8 691sub _prune_subtree {
231d71fc 692 my( $tree, $root, $contighash ) = @_;
693 # First, delete hypothetical leaves / orphans until there are none left.
694 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
695 $tree->successorless_vertices;
696 while( @orphan_hypotheticals ) {
697 $tree->delete_vertices( @orphan_hypotheticals );
698 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
699 $tree->successorless_vertices;
700 }
701 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 702 # root to the first child that has no other predecessors.
231d71fc 703 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
704 my @nextroot = $tree->successors( $root );
705 $tree->delete_vertex( $root );
bebec0e9 706 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 707 }
708 # The tree has been modified in place, but we need to know the new root.
bebec0e9 709 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 710 return $root;
711}
d71100ed 712# Add the variant, subject to a.c. representation logic.
713# This assumes that we will see the 'main' version before the a.c. version.
714sub add_variant_wit {
715 my( $arr, $wit, $acstr ) = @_;
716 my $skip;
717 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
718 my $real = $1;
719 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
720 }
721 push( @$arr, $wit ) unless $skip;
722}
723
5be0cdeb 724sub _useful_variant {
725 my( $group_readings, $graph, $acstr ) = @_;
726
727 # TODO Decide what to do with AC witnesses
728
729 # Sort by group size and return
730 my $is_useful = 0;
731 my( @readings, @groups ); # The sorted groups for our answer.
732 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
733 keys %$group_readings ) {
734 push( @readings, $rdg );
735 push( @groups, $group_readings->{$rdg} );
736 if( @{$group_readings->{$rdg}} > 1 ) {
737 $is_useful++;
738 } else {
739 my( $wit ) = @{$group_readings->{$rdg}};
740 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
741 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
742 }
743 }
744 if( $is_useful > 1 ) {
745 return( \@readings, \@groups );
746 } else {
747 return( [], [] );
748 }
749}
750
7f52eac8 751=head2 wit_stringify( $groups )
752
753Takes an array of witness groupings and produces a string like
754['A','B'] / ['C','D','E'] / ['F']
d71100ed 755
7f52eac8 756=cut
d71100ed 757
758sub wit_stringify {
759 my $groups = shift;
760 my @gst;
761 # If we were passed an array of witnesses instead of an array of
762 # groupings, then "group" the witnesses first.
763 unless( ref( $groups->[0] ) ) {
764 my $mkgrp = [ $groups ];
765 $groups = $mkgrp;
766 }
767 foreach my $g ( @$groups ) {
768 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
769 }
770 return join( ' / ', @gst );
771}
7f52eac8 772
5be0cdeb 773# Helper function to ensure that X and X a.c. never appear in the same list.
774sub _add_to_witlist {
775 my( $wit, $list, $acstr ) = @_;
776 my %inlist;
777 my $idx = 0;
778 map { $inlist{$_} = $idx++ } @$list;
779 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
780 my $acwit = $1;
781 unless( exists $inlist{$acwit} ) {
782 push( @$list, $acwit.$acstr );
783 }
784 } else {
785 if( exists( $inlist{$wit.$acstr} ) ) {
786 # Replace the a.c. version with the main witness
787 my $i = $inlist{$wit.$acstr};
788 $list->[$i] = $wit;
789 } else {
790 push( @$list, $wit );
791 }
792 }
793}
794
bebec0e9 795sub _symmdiff {
796 my( $lista, $listb ) = @_;
7f52eac8 797 my %union;
798 my %scalars;
799 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
800 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 801 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 802 return map { $scalars{$_} } @set;
803}
804
8051;
806
807=head1 LICENSE
808
809This package is free software and is provided "as is" without express
810or implied warranty. You can redistribute it and/or modify it under
811the same terms as Perl itself.
812
813=head1 AUTHOR
814
815Tara L Andrews E<lt>aurum@cpan.orgE<gt>