allow lookup by name as well as ID
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
59=back
7f52eac8 60
61=begin testing
62
63use Text::Tradition;
64use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
65
66my $datafile = 't/data/florilegium_tei_ps.xml';
67my $tradition = Text::Tradition->new( 'input' => 'TEI',
68 'name' => 'test0',
69 'file' => $datafile );
70my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
71is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
72
f00cefe8 73my %expected_genealogical = (
a44aaf2a 74 1 => 0,
f00cefe8 75 2 => 1,
a44aaf2a 76 3 => 0,
77 5 => 0,
78 7 => 0,
79 8 => 0,
80 10 => 0,
f00cefe8 81 13 => 1,
a44aaf2a 82 33 => 0,
83 34 => 0,
84 37 => 0,
85 60 => 0,
f00cefe8 86 81 => 1,
a44aaf2a 87 84 => 0,
88 87 => 0,
89 101 => 0,
90 102 => 0,
f00cefe8 91 122 => 1,
a44aaf2a 92 157 => 0,
f00cefe8 93 166 => 1,
94 169 => 1,
a44aaf2a 95 200 => 0,
f00cefe8 96 216 => 1,
97 217 => 1,
98 219 => 1,
99 241 => 1,
100 242 => 1,
101 243 => 1,
102);
103
7f52eac8 104my $data = run_analysis( $tradition );
f00cefe8 105foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 106 # Account for rows that used to be "not useful"
107 unless( exists $expected_genealogical{$row->{'id'}} ) {
108 $expected_genealogical{$row->{'id'}} = 1;
109 }
6d25a3a0 110 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
f00cefe8 111 "Got correct genealogical flag for row " . $row->{'id'} );
112}
a44aaf2a 113is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 114# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 115
116=end testing
117
118=cut
119
d71100ed 120sub run_analysis {
88a6bac5 121 my( $tradition, %opts ) = @_;
f00cefe8 122 my $c = $tradition->collation;
88a6bac5 123
124 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 125 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
126 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 127
128 # Get the stemma
129 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 130
88a6bac5 131 # Figure out which witnesses we are working with
132 my @lacunose = $stemma->hypotheticals;
fae07016 133 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
b4cb2d60 134 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
fae07016 135 $tradition->witnesses;
136 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 137
138 # Find and mark 'common' ranks for exclusion, unless they were
139 # explicitly specified.
140 unless( @ranks ) {
141 my %common_rank;
a44aaf2a 142 foreach my $rdg ( $c->common_readings ) {
88a6bac5 143 $common_rank{$rdg->rank} = 1;
144 }
145 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 146 }
7f52eac8 147
88a6bac5 148 # Group the variants to send to the solver
149 my @groups;
a44aaf2a 150 my %lacunae;
88a6bac5 151 foreach my $rank ( @ranks ) {
a44aaf2a 152 my $missing = [ @lacunose ];
ef2c9703 153 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
a44aaf2a 154 $lacunae{$rank} = $missing;
d71100ed 155 }
b4cb2d60 156 $DB::single = 1;
88a6bac5 157 # Parse the answer
e59b8faa 158 my $answer = solve_variants( $stemma, @groups );
fae07016 159
88a6bac5 160 # Do further analysis on the answer
a44aaf2a 161 my $conflict_count = 0;
88a6bac5 162 foreach my $idx ( 0 .. $#ranks ) {
163 my $location = $answer->{'variants'}->[$idx];
164 # Add the rank back in
165 $location->{'id'} = $ranks[$idx];
a44aaf2a 166 # Add the lacunae back in
167 $location->{'missing'} = $lacunae{$ranks[$idx]};
88a6bac5 168 # Run the extra analysis we need.
88a6bac5 169 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 170 # Add the reading text back in
171 foreach my $rdghash ( @{$location->{'readings'}} ) {
172 $conflict_count++
173 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
174 my $rdg = $c->reading( $rdghash->{'readingid'} );
175 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
176 }
88a6bac5 177 }
a44aaf2a 178 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 179
88a6bac5 180 return $answer;
d71100ed 181}
182
7f52eac8 183=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
184
185Groups the variants at the given $rank of the collation, treating any
186relationships in @merge_relationship_types as equivalent. $lacunose should
187be a reference to an array, to which the sigla of lacunose witnesses at this
188rank will be appended.
189
ef2c9703 190Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
191by the witnesses listed in $groups->[$n].
7f52eac8 192
193=cut
194
195# Return group_readings, groups, lacunose
d1348d38 196sub group_variants {
7f52eac8 197 my( $tradition, $rank, $lacunose, $collapse ) = @_;
198 my $c = $tradition->collation;
b4cb2d60 199 my $aclabel = $c->ac_label;
7f52eac8 200 # Get the alignment table readings
201 my %readings_at_rank;
202 my @gap_wits;
1d73ecad 203 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 204 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 205 my $wit = $tablewit->{'witness'};
7f52eac8 206 if( $rdg && $rdg->{'t'}->is_lacuna ) {
b4cb2d60 207 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 208 } elsif( $rdg ) {
209 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
210 } else {
b4cb2d60 211 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 212 }
213 }
d1348d38 214
7f52eac8 215 # Group the readings, collapsing groups by relationship if needed
216 my %grouped_readings;
b4cb2d60 217 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
218 values %readings_at_rank ) {
7f52eac8 219 # Skip readings that have been collapsed into others.
f00cefe8 220 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 221 my @wits = $rdg->witnesses;
222 if( $collapse ) {
223 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
224 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 225 my @otherwits = $other->witnesses;
fae07016 226 push( @wits, @otherwits );
f00cefe8 227 $grouped_readings{$other->id} = 0;
d1348d38 228 }
229 }
f00cefe8 230 $grouped_readings{$rdg->id} = \@wits;
7f52eac8 231 }
232 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
233 # Get rid of our collapsed readings
234 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
235 keys %grouped_readings
236 if $collapse;
237
5be0cdeb 238 return \%grouped_readings;
d1348d38 239}
240
88a6bac5 241=head2 solve_variants( $graph, @groups )
242
243Sends the set of groups to the external graph solver service and returns
244a cleaned-up answer, adding the rank IDs back where they belong.
245
246The JSON has the form
247 { "graph": [ stemmagraph DOT string without newlines ],
248 "groupings": [ array of arrays of groups, one per rank ] }
249
250The answer has the form
251 { "variants" => [ array of variant location structures ],
252 "variant_count" => total,
253 "conflict_count" => number of conflicts detected,
254 "genealogical_count" => number of solutions found }
255
256=cut
257
258sub solve_variants {
e59b8faa 259 my( $stemma, @groups ) = @_;
88a6bac5 260
261 # Make the json with stemma + groups
b4cb2d60 262 my $groupings = [];
88a6bac5 263 foreach my $ghash ( @groups ) {
264 my @grouping;
6d25a3a0 265 foreach my $k ( keys %$ghash ) {
88a6bac5 266 push( @grouping, $ghash->{$k} );
267 }
b4cb2d60 268 push( @$groupings, \@grouping );
88a6bac5 269 }
b4cb2d60 270 ## Witness map is a HACK to get around limitations in node names from IDP
271 my $witness_map = {};
272 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
88a6bac5 273
274 # Send it off and get the result
275 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
276 my $ua = LWP::UserAgent->new();
277 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
278 'Content' => $json );
279
280 my $answer;
6d25a3a0 281 my $used_idp;
88a6bac5 282 if( $resp->is_success ) {
b4cb2d60 283 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
6d25a3a0 284 $used_idp = 1;
88a6bac5 285 } else {
fae07016 286 # Fall back to the old method.
287 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
288 . "; falling back to perl method";
b4cb2d60 289 $answer = perl_solver( $stemma, @$groupings );
88a6bac5 290 }
291
292 # Fold the result back into what we know about the groups.
293 my $variants = [];
294 my $genealogical = 0;
295 foreach my $idx ( 0 .. $#groups ) {
296 my( $calc_groups, $result ) = @{$answer->[$idx]};
6d25a3a0 297 if( $result ) {
298 $genealogical++;
299 # Prune the calculated groups, in case the IDP solver failed to.
300 if( $used_idp ) {
301 my @pruned_groups;
302 foreach my $cg ( @$calc_groups ) {
303 my @pg = _prune_group( $cg, $stemma );
304 push( @pruned_groups, \@pg );
305 }
306 $calc_groups = \@pruned_groups;
307 }
308 }
88a6bac5 309 my $input_group = $groups[$idx];
310 foreach my $k ( sort keys %$input_group ) {
311 my $cg = shift @$calc_groups;
312 $input_group->{$k} = $cg;
313 }
314 my $vstruct = {
315 'genealogical' => $result,
316 'readings' => [],
fae07016 317 };
88a6bac5 318 foreach my $k ( keys %$input_group ) {
319 push( @{$vstruct->{'readings'}},
fae07016 320 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 321 }
322 push( @$variants, $vstruct );
323 }
324
325 return { 'variants' => $variants,
326 'variant_count' => scalar @$variants,
327 'genealogical_count' => $genealogical };
328}
329
b4cb2d60 330#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
331
332sub _safe_wit_strings {
333 my( $stemma, $groupings, $witness_map ) = @_;
334 my $safegraph = Graph->new();
335 # Convert the graph to a safe representation and store the conversion.
336 foreach my $n ( $stemma->graph->vertices ) {
337 my $sn = _safe_witstr( $n );
338 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
339 if exists $witness_map->{$sn};
340 $witness_map->{$sn} = $n;
341 $safegraph->add_vertex( $sn );
342 $safegraph->set_vertex_attributes( $sn,
343 $stemma->graph->get_vertex_attributes( $n ) );
344 }
345 foreach my $e ( $stemma->graph->edges ) {
346 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
347 $safegraph->add_edge( @safe_e );
348 }
349 my $safe_stemma = Text::Tradition::Stemma->new(
350 'collation' => $stemma->collation, 'graph' => $safegraph );
351
352 # Now convert the witness groupings to a safe representation.
353 my $safe_groupings = [];
354 foreach my $grouping ( @$groupings ) {
355 my $safe_grouping = [];
356 foreach my $group ( @$grouping ) {
357 my $safe_group = [];
358 foreach my $n ( @$group ) {
359 my $sn = _safe_witstr( $n );
360 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
361 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
362 $witness_map->{$sn} = $n;
363 push( @$safe_group, $sn );
364 }
365 push( @$safe_grouping, $safe_group );
366 }
367 push( @$safe_groupings, $safe_grouping );
368 }
369
370 # Return it all in the struct we expect. We have stored the reductions
371 # in the $witness_map that we were passed.
372 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
373}
374
375sub _safe_witstr {
376 my $witstr = shift;
377 $witstr =~ s/\s+/_/g;
378 $witstr =~ s/[^\w\d-]//g;
379 return $witstr;
380}
381
382sub _desanitize_names {
383 my( $jsonstruct, $witness_map ) = @_;
384 my $result = [];
385 foreach my $grouping ( @$jsonstruct ) {
386 my $real_grouping = [];
387 foreach my $element ( @$grouping ) {
388 if( ref( $element ) eq 'ARRAY' ) {
389 # it's the groupset.
390 my $real_groupset = [];
391 foreach my $group ( @$element ) {
392 my $real_group = [];
393 foreach my $n ( @$group ) {
394 my $rn = $witness_map->{$n};
395 push( @$real_group, $rn );
396 }
397 push( @$real_groupset, $real_group );
398 }
399 push( @$real_grouping, $real_groupset );
400 } else {
401 # It is the boolean, not actually a group.
402 push( @$real_grouping, $element );
403 }
404 }
405 push( @$result, $real_grouping );
406 }
407 return $result;
408}
409
410### END HACKERY ###
411
fae07016 412=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 413
fae07016 414Given the tradition, its stemma graph, and the solution from the graph solver,
415work out the rest of the information we want. For each reading we need missing,
416conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 417
418=cut
732152b1 419
fae07016 420sub analyze_location {
421 my ( $tradition, $graph, $variant_row ) = @_;
422
423 # Make a hash of all known node memberships, and make the subgraphs.
424 my $contig = {};
425 my $reading_roots = {};
426 my $subgraph = {};
427 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
428 my $rid = $rdghash->{'readingid'};
429 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 430
fae07016 431 # Make the subgraph.
432 my $part = $graph->copy;
433 my %these_vertices;
434 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
435 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
436 $subgraph->{$rid} = $part;
437 # Get the reading roots.
438 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 439 }
440
fae07016 441 # Now that we have all the node group memberships, calculate followed/
bebec0e9 442 # non-followed/unknown values for each reading. Also figure out the
443 # reading's evident parent(s).
444 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 445 # Group string key - TODO do we need this?
446 my $gst = wit_stringify( $rdghash->{'group'} );
447 my $rid = $rdghash->{'readingid'};
448 # Get the subgraph
449 my $part = $subgraph->{$rid};
450
451 # Start figuring things out.
bebec0e9 452 my @roots = $part->predecessorless_vertices;
453 $rdghash->{'independent_occurrence'} = scalar @roots;
454 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
455 # Find the parent readings, if any, of this reading.
f00cefe8 456 my %rdgparents;
bebec0e9 457 foreach my $wit ( @roots ) {
f00cefe8 458 # Look in the main stemma to find this witness's extant or known-reading
459 # immediate ancestor(s), and look up the reading that each ancestor olds.
460 my @check = $graph->predecessors( $wit );
461 while( @check ) {
462 my @next;
463 foreach my $wparent( @check ) {
fae07016 464 my $preading = $contig->{$wparent};
465 if( $preading ) {
466 $rdgparents{$preading} = 1;
f00cefe8 467 } else {
468 push( @next, $graph->predecessors( $wparent ) );
469 }
470 }
471 @check = @next;
472 }
bebec0e9 473 }
f00cefe8 474 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 475
476 # Find the number of times this reading was altered, and the number of
477 # times we're not sure.
478 my( %nofollow, %unknownfollow );
479 foreach my $wit ( $part->vertices ) {
480 foreach my $wchild ( $graph->successors( $wit ) ) {
481 next if $part->has_vertex( $wchild );
fae07016 482 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 483 # It definitely changed here.
484 $nofollow{$wchild} = 1;
485 } elsif( !($contig->{$wchild}) ) {
486 # The child is a hypothetical node not definitely in
487 # any group. Answer is unknown.
488 $unknownfollow{$wchild} = 1;
489 } # else it's a non-root node in a known group, and therefore
490 # is presumed to have its reading from its group, not this link.
491 }
492 }
493 $rdghash->{'not_followed'} = keys %nofollow;
494 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 495
496 # Now say whether this reading represents a conflict.
497 unless( $variant_row->{'genealogical'} ) {
498 $rdghash->{'conflict'} = @roots != 1;
499 }
c4a4fb1b 500 }
d71100ed 501}
502
fae07016 503
504=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
505
506** NOTE ** This method should hopefully not be called - it is not guaranteed
507to be correct. Serves as a backup for the real solver.
508
509Runs an analysis of the given tradition, at the location given in $rank,
510against the graph of the stemma specified in $stemma_id. The argument
511@merge_relationship_types is an optional list of relationship types for
512which readings so related should be treated as equivalent.
513
514Returns a nested array data structure as follows:
515
516 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
517
518where the group list is the array of arrays passed in for each element of @groups,
519possibly with the addition of hypothetical readings.
520
521
522=cut
523
524sub perl_solver {
e59b8faa 525 my( $stemma, @groups ) = @_;
526 my $graph = $stemma->graph;
527 my @answer;
528 foreach my $g ( @groups ) {
529 push( @answer, _solve_variant_location( $graph, $g ) );
530 }
531 return \@answer;
fae07016 532}
533
e59b8faa 534sub _solve_variant_location {
535 my( $graph, $groups ) = @_;
fae07016 536 # Now do the work.
e59b8faa 537 my $contig = {};
538 my $subgraph = {};
539 my $is_conflicted;
540 my $conflict = {};
541
542 # Mark each ms as in its own group, first.
543 foreach my $g ( @$groups ) {
544 my $gst = wit_stringify( $g );
545 map { $contig->{$_} = $gst } @$g;
546 }
547
548 # Now for each unmarked node in the graph, initialize an array
549 # for possible group memberships. We will use this later to
550 # resolve potential conflicts.
551 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
552 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
553 my $gst = wit_stringify( $g ); # This is the group name
554 # Copy the graph, and delete all non-members from the new graph.
555 my $part = $graph->copy;
556 my @group_roots;
557 $part->delete_vertices(
558 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
559
560 # Now look to see if our group is connected.
561 if( @$g > 1 ) {
562 # We have to take directionality into account.
563 # How many root nodes do we have?
564 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
565 $part->predecessorless_vertices;
566 # Assuming that @$g > 1, find the first root node that has at
567 # least one successor belonging to our group. If this reading
568 # is genealogical, there should be only one, but we will check
569 # that implicitly later.
570 foreach my $root ( @roots ) {
571 # Prune the tree to get rid of extraneous hypotheticals.
572 $root = _prune_subtree( $part, $root, $contig );
573 next unless $root;
574 # Save this root for our group.
575 push( @group_roots, $root );
576 # Get all the successor nodes of our root.
577 }
578 } else {
579 # Dispense with the trivial case of one reading.
580 my $wit = $g->[0];
581 @group_roots = ( $wit );
582 foreach my $v ( $part->vertices ) {
583 $part->delete_vertex( $v ) unless $v eq $wit;
584 }
585 }
586
587 if( @group_roots > 1 ) {
588 $conflict->{$gst} = 1;
589 $is_conflicted = 1;
590 }
591 # Paint the 'hypotheticals' with our group.
592 foreach my $wit ( $part->vertices ) {
593 if( ref( $contig->{$wit} ) ) {
594 push( @{$contig->{$wit}}, $gst );
595 } elsif( $contig->{$wit} ne $gst ) {
596 warn "How did we get here?";
597 }
598 }
599
600
601 # Save the relevant subgraph.
602 $subgraph->{$gst} = $part;
603 }
604
605 # For each of our hypothetical readings, flatten its 'contig' array if
606 # the array contains zero or one group. If we have any unflattened arrays,
607 # we may need to run the resolution process. If the reading is already known
608 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
609 # it.
610 my @resolve;
611 foreach my $wit ( keys %$contig ) {
612 next unless ref( $contig->{$wit} );
613 if( @{$contig->{$wit}} > 1 ) {
614 if( $is_conflicted ) {
615 $contig->{$wit} = ''; # We aren't going to decide.
616 } else {
617 push( @resolve, $wit );
618 }
619 } else {
620 my $gst = pop @{$contig->{$wit}};
621 $contig->{$wit} = $gst || '';
622 }
623 }
624
625 if( @resolve ) {
626 my $still_contig = {};
627 foreach my $h ( @resolve ) {
628 # For each of the hypothetical readings with more than one possibility,
629 # try deleting it from each of its member subgraphs in turn, and see
630 # if that breaks the contiguous grouping.
631 # TODO This can still break in a corner case where group A can use
632 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
633 # Revisit this if necessary; it could get brute-force nasty.
634 foreach my $gst ( @{$contig->{$h}} ) {
635 my $gpart = $subgraph->{$gst}->copy();
636 # If we have come this far, there is only one root and everything
637 # is reachable from it.
638 my( $root ) = $gpart->predecessorless_vertices;
639 my $reachable = {};
640 map { $reachable->{$_} = 1 } $gpart->vertices;
641
642 # Try deleting the hypothetical node.
643 $gpart->delete_vertex( $h );
644 if( $h eq $root ) {
645 # See if we still have a single root.
646 my @roots = $gpart->predecessorless_vertices;
647 warn "This shouldn't have happened" unless @roots;
648 if( @roots > 1 ) {
649 # $h is needed by this group.
650 if( exists( $still_contig->{$h} ) ) {
651 # Conflict!
652 $conflict->{$gst} = 1;
653 $still_contig->{$h} = '';
654 } else {
655 $still_contig->{$h} = $gst;
656 }
657 }
658 } else {
659 # $h is somewhere in the middle. See if everything
660 # else can still be reached from the root.
661 my %still_reachable = ( $root => 1 );
662 map { $still_reachable{$_} = 1 }
663 $gpart->all_successors( $root );
664 foreach my $v ( keys %$reachable ) {
665 next if $v eq $h;
666 if( !$still_reachable{$v}
667 && ( $contig->{$v} eq $gst
668 || ( exists $still_contig->{$v}
669 && $still_contig->{$v} eq $gst ) ) ) {
670 # We need $h.
671 if( exists $still_contig->{$h} ) {
672 # Conflict!
673 $conflict->{$gst} = 1;
674 $still_contig->{$h} = '';
675 } else {
676 $still_contig->{$h} = $gst;
677 }
678 last;
679 } # else we don't need $h in this group.
680 } # end foreach $v
681 } # endif $h eq $root
682 } # end foreach $gst
683 } # end foreach $h
684
685 # Now we have some hypothetical vertices in $still_contig that are the
686 # "real" group memberships. Replace these in $contig.
687 foreach my $v ( keys %$contig ) {
688 next unless ref $contig->{$v};
689 $contig->{$v} = $still_contig->{$v};
690 }
691 } # end if @resolve
692
693 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
694 my $variant_row = [ [], $is_genealogical ];
695 # Fill in the groupings from $contig.
696 foreach my $g ( @$groups ) {
697 my $gst = wit_stringify( $g );
698 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
699 push( @{$variant_row->[0]}, \@realgroup );
700 }
701 return $variant_row;
702}
fae07016 703
6d25a3a0 704sub _prune_group {
705 my( $group, $stemma ) = @_;
706 # Get these into a form prune_subtree will recognize. Make a "contighash"
707 my $hypohash = {};
708 map { $hypohash->{$_} = 1 } @$group;
709 # ...with reference values for hypotheticals.
710 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
711 # Make our subgraph
712 my $subgraph = $stemma->graph->copy;
713 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
714 $subgraph->vertices;
715 # ...and find the root.
716 my( $root ) = $subgraph->predecessorless_vertices;
717 # Now prune and return the remaining vertices.
718 _prune_subtree( $subgraph, $root, $hypohash );
719 return $subgraph->vertices;
720}
721
7f52eac8 722sub _prune_subtree {
231d71fc 723 my( $tree, $root, $contighash ) = @_;
724 # First, delete hypothetical leaves / orphans until there are none left.
725 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
726 $tree->successorless_vertices;
727 while( @orphan_hypotheticals ) {
728 $tree->delete_vertices( @orphan_hypotheticals );
729 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
730 $tree->successorless_vertices;
731 }
732 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 733 # root to the first child that has no other predecessors.
231d71fc 734 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
735 my @nextroot = $tree->successors( $root );
736 $tree->delete_vertex( $root );
bebec0e9 737 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 738 }
739 # The tree has been modified in place, but we need to know the new root.
bebec0e9 740 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 741 return $root;
742}
d71100ed 743# Add the variant, subject to a.c. representation logic.
744# This assumes that we will see the 'main' version before the a.c. version.
745sub add_variant_wit {
746 my( $arr, $wit, $acstr ) = @_;
747 my $skip;
748 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
749 my $real = $1;
750 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
751 }
752 push( @$arr, $wit ) unless $skip;
753}
754
5be0cdeb 755sub _useful_variant {
756 my( $group_readings, $graph, $acstr ) = @_;
757
758 # TODO Decide what to do with AC witnesses
759
760 # Sort by group size and return
761 my $is_useful = 0;
762 my( @readings, @groups ); # The sorted groups for our answer.
763 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
764 keys %$group_readings ) {
765 push( @readings, $rdg );
766 push( @groups, $group_readings->{$rdg} );
767 if( @{$group_readings->{$rdg}} > 1 ) {
768 $is_useful++;
769 } else {
770 my( $wit ) = @{$group_readings->{$rdg}};
771 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
772 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
773 }
774 }
775 if( $is_useful > 1 ) {
776 return( \@readings, \@groups );
777 } else {
778 return( [], [] );
779 }
780}
781
7f52eac8 782=head2 wit_stringify( $groups )
783
784Takes an array of witness groupings and produces a string like
785['A','B'] / ['C','D','E'] / ['F']
d71100ed 786
7f52eac8 787=cut
d71100ed 788
789sub wit_stringify {
790 my $groups = shift;
791 my @gst;
792 # If we were passed an array of witnesses instead of an array of
793 # groupings, then "group" the witnesses first.
794 unless( ref( $groups->[0] ) ) {
795 my $mkgrp = [ $groups ];
796 $groups = $mkgrp;
797 }
798 foreach my $g ( @$groups ) {
799 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
800 }
801 return join( ' / ', @gst );
802}
7f52eac8 803
5be0cdeb 804# Helper function to ensure that X and X a.c. never appear in the same list.
805sub _add_to_witlist {
806 my( $wit, $list, $acstr ) = @_;
807 my %inlist;
808 my $idx = 0;
809 map { $inlist{$_} = $idx++ } @$list;
810 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
811 my $acwit = $1;
812 unless( exists $inlist{$acwit} ) {
813 push( @$list, $acwit.$acstr );
814 }
815 } else {
816 if( exists( $inlist{$wit.$acstr} ) ) {
817 # Replace the a.c. version with the main witness
818 my $i = $inlist{$wit.$acstr};
819 $list->[$i] = $wit;
820 } else {
821 push( @$list, $wit );
822 }
823 }
824}
825
bebec0e9 826sub _symmdiff {
827 my( $lista, $listb ) = @_;
7f52eac8 828 my %union;
829 my %scalars;
830 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
831 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 832 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 833 return map { $scalars{$_} } @set;
834}
835
8361;
837
838=head1 LICENSE
839
840This package is free software and is provided "as is" without express
841or implied warranty. You can redistribute it and/or modify it under
842the same terms as Perl itself.
843
844=head1 AUTHOR
845
846Tara L Andrews E<lt>aurum@cpan.orgE<gt>