Merge branch 'master' of github.com:tla/stemmatology
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
f2fec47d 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
f00cefe8 107foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 108 # Account for rows that used to be "not useful"
109 unless( exists $expected_genealogical{$row->{'id'}} ) {
110 $expected_genealogical{$row->{'id'}} = 1;
111 }
6d25a3a0 112 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
f00cefe8 113 "Got correct genealogical flag for row " . $row->{'id'} );
114}
a44aaf2a 115is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 116# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 117
118=end testing
119
120=cut
121
d71100ed 122sub run_analysis {
88a6bac5 123 my( $tradition, %opts ) = @_;
f00cefe8 124 my $c = $tradition->collation;
88a6bac5 125
126 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 127 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
128 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 129
130 # Get the stemma
131 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 132
88a6bac5 133 # Figure out which witnesses we are working with
134 my @lacunose = $stemma->hypotheticals;
fae07016 135 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
b4cb2d60 136 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
fae07016 137 $tradition->witnesses;
138 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 139
140 # Find and mark 'common' ranks for exclusion, unless they were
141 # explicitly specified.
142 unless( @ranks ) {
143 my %common_rank;
a44aaf2a 144 foreach my $rdg ( $c->common_readings ) {
88a6bac5 145 $common_rank{$rdg->rank} = 1;
146 }
147 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 148 }
7f52eac8 149
88a6bac5 150 # Group the variants to send to the solver
151 my @groups;
a44aaf2a 152 my %lacunae;
88a6bac5 153 foreach my $rank ( @ranks ) {
a44aaf2a 154 my $missing = [ @lacunose ];
f2fec47d 155 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
156 if( $opts{'exclude_type1'} ) {
157 # Check to see whether this is a "useful" group.
158 my( $rdgs, $grps ) = _useful_variant( $rankgroup );
159 next unless @$rdgs;
160 }
161 push( @groups, $rankgroup );
a44aaf2a 162 $lacunae{$rank} = $missing;
d71100ed 163 }
b4cb2d60 164 $DB::single = 1;
88a6bac5 165 # Parse the answer
e59b8faa 166 my $answer = solve_variants( $stemma, @groups );
fae07016 167
88a6bac5 168 # Do further analysis on the answer
a44aaf2a 169 my $conflict_count = 0;
88a6bac5 170 foreach my $idx ( 0 .. $#ranks ) {
171 my $location = $answer->{'variants'}->[$idx];
172 # Add the rank back in
173 $location->{'id'} = $ranks[$idx];
a44aaf2a 174 # Add the lacunae back in
175 $location->{'missing'} = $lacunae{$ranks[$idx]};
88a6bac5 176 # Run the extra analysis we need.
88a6bac5 177 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 178 # Add the reading text back in
179 foreach my $rdghash ( @{$location->{'readings'}} ) {
180 $conflict_count++
181 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
182 my $rdg = $c->reading( $rdghash->{'readingid'} );
183 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
184 }
88a6bac5 185 }
a44aaf2a 186 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 187
88a6bac5 188 return $answer;
d71100ed 189}
190
7f52eac8 191=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
192
193Groups the variants at the given $rank of the collation, treating any
194relationships in @merge_relationship_types as equivalent. $lacunose should
195be a reference to an array, to which the sigla of lacunose witnesses at this
196rank will be appended.
197
f2fec47d 198Returns a hash $group_readings where $rdg is attested by the witnesses listed
199in $group_readings->{$rdg}.
7f52eac8 200
201=cut
202
203# Return group_readings, groups, lacunose
d1348d38 204sub group_variants {
7f52eac8 205 my( $tradition, $rank, $lacunose, $collapse ) = @_;
206 my $c = $tradition->collation;
b4cb2d60 207 my $aclabel = $c->ac_label;
7f52eac8 208 # Get the alignment table readings
209 my %readings_at_rank;
210 my @gap_wits;
1d73ecad 211 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 212 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 213 my $wit = $tablewit->{'witness'};
7f52eac8 214 if( $rdg && $rdg->{'t'}->is_lacuna ) {
b4cb2d60 215 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 216 } elsif( $rdg ) {
217 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
218 } else {
b4cb2d60 219 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 220 }
221 }
d1348d38 222
7f52eac8 223 # Group the readings, collapsing groups by relationship if needed
224 my %grouped_readings;
b4cb2d60 225 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
226 values %readings_at_rank ) {
7f52eac8 227 # Skip readings that have been collapsed into others.
f00cefe8 228 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 229 my @wits = $rdg->witnesses;
230 if( $collapse ) {
231 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
232 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 233 my @otherwits = $other->witnesses;
fae07016 234 push( @wits, @otherwits );
f00cefe8 235 $grouped_readings{$other->id} = 0;
d1348d38 236 }
237 }
f00cefe8 238 $grouped_readings{$rdg->id} = \@wits;
7f52eac8 239 }
240 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
241 # Get rid of our collapsed readings
242 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
243 keys %grouped_readings
244 if $collapse;
245
5be0cdeb 246 return \%grouped_readings;
d1348d38 247}
248
88a6bac5 249=head2 solve_variants( $graph, @groups )
250
251Sends the set of groups to the external graph solver service and returns
252a cleaned-up answer, adding the rank IDs back where they belong.
253
254The JSON has the form
255 { "graph": [ stemmagraph DOT string without newlines ],
256 "groupings": [ array of arrays of groups, one per rank ] }
257
258The answer has the form
259 { "variants" => [ array of variant location structures ],
260 "variant_count" => total,
261 "conflict_count" => number of conflicts detected,
262 "genealogical_count" => number of solutions found }
263
264=cut
265
266sub solve_variants {
e59b8faa 267 my( $stemma, @groups ) = @_;
88a6bac5 268
269 # Make the json with stemma + groups
b4cb2d60 270 my $groupings = [];
88a6bac5 271 foreach my $ghash ( @groups ) {
272 my @grouping;
6d25a3a0 273 foreach my $k ( keys %$ghash ) {
88a6bac5 274 push( @grouping, $ghash->{$k} );
275 }
b4cb2d60 276 push( @$groupings, \@grouping );
88a6bac5 277 }
b4cb2d60 278 ## Witness map is a HACK to get around limitations in node names from IDP
279 my $witness_map = {};
280 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
88a6bac5 281
282 # Send it off and get the result
283 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
284 my $ua = LWP::UserAgent->new();
285 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
286 'Content' => $json );
287
288 my $answer;
6d25a3a0 289 my $used_idp;
88a6bac5 290 if( $resp->is_success ) {
b4cb2d60 291 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
6d25a3a0 292 $used_idp = 1;
88a6bac5 293 } else {
fae07016 294 # Fall back to the old method.
295 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
296 . "; falling back to perl method";
b4cb2d60 297 $answer = perl_solver( $stemma, @$groupings );
88a6bac5 298 }
299
300 # Fold the result back into what we know about the groups.
301 my $variants = [];
302 my $genealogical = 0;
303 foreach my $idx ( 0 .. $#groups ) {
304 my( $calc_groups, $result ) = @{$answer->[$idx]};
6d25a3a0 305 if( $result ) {
306 $genealogical++;
307 # Prune the calculated groups, in case the IDP solver failed to.
308 if( $used_idp ) {
309 my @pruned_groups;
310 foreach my $cg ( @$calc_groups ) {
311 my @pg = _prune_group( $cg, $stemma );
312 push( @pruned_groups, \@pg );
313 }
314 $calc_groups = \@pruned_groups;
315 }
316 }
88a6bac5 317 my $input_group = $groups[$idx];
318 foreach my $k ( sort keys %$input_group ) {
319 my $cg = shift @$calc_groups;
320 $input_group->{$k} = $cg;
321 }
322 my $vstruct = {
323 'genealogical' => $result,
324 'readings' => [],
fae07016 325 };
88a6bac5 326 foreach my $k ( keys %$input_group ) {
327 push( @{$vstruct->{'readings'}},
fae07016 328 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 329 }
330 push( @$variants, $vstruct );
331 }
332
333 return { 'variants' => $variants,
334 'variant_count' => scalar @$variants,
335 'genealogical_count' => $genealogical };
336}
337
b4cb2d60 338#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
339
340sub _safe_wit_strings {
341 my( $stemma, $groupings, $witness_map ) = @_;
342 my $safegraph = Graph->new();
343 # Convert the graph to a safe representation and store the conversion.
344 foreach my $n ( $stemma->graph->vertices ) {
345 my $sn = _safe_witstr( $n );
346 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
347 if exists $witness_map->{$sn};
348 $witness_map->{$sn} = $n;
349 $safegraph->add_vertex( $sn );
350 $safegraph->set_vertex_attributes( $sn,
351 $stemma->graph->get_vertex_attributes( $n ) );
352 }
353 foreach my $e ( $stemma->graph->edges ) {
354 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
355 $safegraph->add_edge( @safe_e );
356 }
357 my $safe_stemma = Text::Tradition::Stemma->new(
358 'collation' => $stemma->collation, 'graph' => $safegraph );
359
360 # Now convert the witness groupings to a safe representation.
361 my $safe_groupings = [];
362 foreach my $grouping ( @$groupings ) {
363 my $safe_grouping = [];
364 foreach my $group ( @$grouping ) {
365 my $safe_group = [];
366 foreach my $n ( @$group ) {
367 my $sn = _safe_witstr( $n );
368 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
369 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
370 $witness_map->{$sn} = $n;
371 push( @$safe_group, $sn );
372 }
373 push( @$safe_grouping, $safe_group );
374 }
375 push( @$safe_groupings, $safe_grouping );
376 }
377
378 # Return it all in the struct we expect. We have stored the reductions
379 # in the $witness_map that we were passed.
380 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
381}
382
383sub _safe_witstr {
384 my $witstr = shift;
385 $witstr =~ s/\s+/_/g;
386 $witstr =~ s/[^\w\d-]//g;
387 return $witstr;
388}
389
390sub _desanitize_names {
391 my( $jsonstruct, $witness_map ) = @_;
392 my $result = [];
393 foreach my $grouping ( @$jsonstruct ) {
394 my $real_grouping = [];
395 foreach my $element ( @$grouping ) {
396 if( ref( $element ) eq 'ARRAY' ) {
397 # it's the groupset.
398 my $real_groupset = [];
399 foreach my $group ( @$element ) {
400 my $real_group = [];
401 foreach my $n ( @$group ) {
402 my $rn = $witness_map->{$n};
403 push( @$real_group, $rn );
404 }
405 push( @$real_groupset, $real_group );
406 }
407 push( @$real_grouping, $real_groupset );
408 } else {
409 # It is the boolean, not actually a group.
410 push( @$real_grouping, $element );
411 }
412 }
413 push( @$result, $real_grouping );
414 }
415 return $result;
416}
417
418### END HACKERY ###
419
fae07016 420=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 421
fae07016 422Given the tradition, its stemma graph, and the solution from the graph solver,
423work out the rest of the information we want. For each reading we need missing,
424conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 425
426=cut
732152b1 427
fae07016 428sub analyze_location {
429 my ( $tradition, $graph, $variant_row ) = @_;
430
431 # Make a hash of all known node memberships, and make the subgraphs.
432 my $contig = {};
433 my $reading_roots = {};
434 my $subgraph = {};
435 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
436 my $rid = $rdghash->{'readingid'};
437 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 438
fae07016 439 # Make the subgraph.
440 my $part = $graph->copy;
441 my %these_vertices;
442 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
443 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
444 $subgraph->{$rid} = $part;
445 # Get the reading roots.
446 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 447 }
448
fae07016 449 # Now that we have all the node group memberships, calculate followed/
bebec0e9 450 # non-followed/unknown values for each reading. Also figure out the
451 # reading's evident parent(s).
452 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 453 # Group string key - TODO do we need this?
454 my $gst = wit_stringify( $rdghash->{'group'} );
455 my $rid = $rdghash->{'readingid'};
456 # Get the subgraph
457 my $part = $subgraph->{$rid};
458
459 # Start figuring things out.
bebec0e9 460 my @roots = $part->predecessorless_vertices;
461 $rdghash->{'independent_occurrence'} = scalar @roots;
462 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
463 # Find the parent readings, if any, of this reading.
f00cefe8 464 my %rdgparents;
bebec0e9 465 foreach my $wit ( @roots ) {
f00cefe8 466 # Look in the main stemma to find this witness's extant or known-reading
467 # immediate ancestor(s), and look up the reading that each ancestor olds.
468 my @check = $graph->predecessors( $wit );
469 while( @check ) {
470 my @next;
471 foreach my $wparent( @check ) {
fae07016 472 my $preading = $contig->{$wparent};
473 if( $preading ) {
474 $rdgparents{$preading} = 1;
f00cefe8 475 } else {
476 push( @next, $graph->predecessors( $wparent ) );
477 }
478 }
479 @check = @next;
480 }
bebec0e9 481 }
f00cefe8 482 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 483
484 # Find the number of times this reading was altered, and the number of
485 # times we're not sure.
486 my( %nofollow, %unknownfollow );
487 foreach my $wit ( $part->vertices ) {
488 foreach my $wchild ( $graph->successors( $wit ) ) {
489 next if $part->has_vertex( $wchild );
fae07016 490 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 491 # It definitely changed here.
492 $nofollow{$wchild} = 1;
493 } elsif( !($contig->{$wchild}) ) {
494 # The child is a hypothetical node not definitely in
495 # any group. Answer is unknown.
496 $unknownfollow{$wchild} = 1;
497 } # else it's a non-root node in a known group, and therefore
498 # is presumed to have its reading from its group, not this link.
499 }
500 }
501 $rdghash->{'not_followed'} = keys %nofollow;
502 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 503
504 # Now say whether this reading represents a conflict.
505 unless( $variant_row->{'genealogical'} ) {
506 $rdghash->{'conflict'} = @roots != 1;
507 }
c4a4fb1b 508 }
d71100ed 509}
510
fae07016 511
512=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
513
514** NOTE ** This method should hopefully not be called - it is not guaranteed
515to be correct. Serves as a backup for the real solver.
516
517Runs an analysis of the given tradition, at the location given in $rank,
518against the graph of the stemma specified in $stemma_id. The argument
519@merge_relationship_types is an optional list of relationship types for
520which readings so related should be treated as equivalent.
521
522Returns a nested array data structure as follows:
523
524 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
525
526where the group list is the array of arrays passed in for each element of @groups,
527possibly with the addition of hypothetical readings.
528
529
530=cut
531
532sub perl_solver {
e59b8faa 533 my( $stemma, @groups ) = @_;
534 my $graph = $stemma->graph;
535 my @answer;
536 foreach my $g ( @groups ) {
537 push( @answer, _solve_variant_location( $graph, $g ) );
538 }
539 return \@answer;
fae07016 540}
541
e59b8faa 542sub _solve_variant_location {
543 my( $graph, $groups ) = @_;
fae07016 544 # Now do the work.
e59b8faa 545 my $contig = {};
546 my $subgraph = {};
547 my $is_conflicted;
548 my $conflict = {};
549
550 # Mark each ms as in its own group, first.
551 foreach my $g ( @$groups ) {
552 my $gst = wit_stringify( $g );
553 map { $contig->{$_} = $gst } @$g;
554 }
555
556 # Now for each unmarked node in the graph, initialize an array
557 # for possible group memberships. We will use this later to
558 # resolve potential conflicts.
559 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
560 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
561 my $gst = wit_stringify( $g ); # This is the group name
562 # Copy the graph, and delete all non-members from the new graph.
563 my $part = $graph->copy;
564 my @group_roots;
565 $part->delete_vertices(
566 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
567
568 # Now look to see if our group is connected.
569 if( @$g > 1 ) {
570 # We have to take directionality into account.
571 # How many root nodes do we have?
572 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
573 $part->predecessorless_vertices;
574 # Assuming that @$g > 1, find the first root node that has at
575 # least one successor belonging to our group. If this reading
576 # is genealogical, there should be only one, but we will check
577 # that implicitly later.
578 foreach my $root ( @roots ) {
579 # Prune the tree to get rid of extraneous hypotheticals.
580 $root = _prune_subtree( $part, $root, $contig );
581 next unless $root;
582 # Save this root for our group.
583 push( @group_roots, $root );
584 # Get all the successor nodes of our root.
585 }
586 } else {
587 # Dispense with the trivial case of one reading.
588 my $wit = $g->[0];
589 @group_roots = ( $wit );
590 foreach my $v ( $part->vertices ) {
591 $part->delete_vertex( $v ) unless $v eq $wit;
592 }
593 }
594
595 if( @group_roots > 1 ) {
596 $conflict->{$gst} = 1;
597 $is_conflicted = 1;
598 }
599 # Paint the 'hypotheticals' with our group.
600 foreach my $wit ( $part->vertices ) {
601 if( ref( $contig->{$wit} ) ) {
602 push( @{$contig->{$wit}}, $gst );
603 } elsif( $contig->{$wit} ne $gst ) {
604 warn "How did we get here?";
605 }
606 }
607
608
609 # Save the relevant subgraph.
610 $subgraph->{$gst} = $part;
611 }
612
613 # For each of our hypothetical readings, flatten its 'contig' array if
614 # the array contains zero or one group. If we have any unflattened arrays,
615 # we may need to run the resolution process. If the reading is already known
616 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
617 # it.
618 my @resolve;
619 foreach my $wit ( keys %$contig ) {
620 next unless ref( $contig->{$wit} );
621 if( @{$contig->{$wit}} > 1 ) {
622 if( $is_conflicted ) {
623 $contig->{$wit} = ''; # We aren't going to decide.
624 } else {
625 push( @resolve, $wit );
626 }
627 } else {
628 my $gst = pop @{$contig->{$wit}};
629 $contig->{$wit} = $gst || '';
630 }
631 }
632
633 if( @resolve ) {
634 my $still_contig = {};
635 foreach my $h ( @resolve ) {
636 # For each of the hypothetical readings with more than one possibility,
637 # try deleting it from each of its member subgraphs in turn, and see
638 # if that breaks the contiguous grouping.
639 # TODO This can still break in a corner case where group A can use
640 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
641 # Revisit this if necessary; it could get brute-force nasty.
642 foreach my $gst ( @{$contig->{$h}} ) {
643 my $gpart = $subgraph->{$gst}->copy();
644 # If we have come this far, there is only one root and everything
645 # is reachable from it.
646 my( $root ) = $gpart->predecessorless_vertices;
647 my $reachable = {};
648 map { $reachable->{$_} = 1 } $gpart->vertices;
649
650 # Try deleting the hypothetical node.
651 $gpart->delete_vertex( $h );
652 if( $h eq $root ) {
653 # See if we still have a single root.
654 my @roots = $gpart->predecessorless_vertices;
655 warn "This shouldn't have happened" unless @roots;
656 if( @roots > 1 ) {
657 # $h is needed by this group.
658 if( exists( $still_contig->{$h} ) ) {
659 # Conflict!
660 $conflict->{$gst} = 1;
661 $still_contig->{$h} = '';
662 } else {
663 $still_contig->{$h} = $gst;
664 }
665 }
666 } else {
667 # $h is somewhere in the middle. See if everything
668 # else can still be reached from the root.
669 my %still_reachable = ( $root => 1 );
670 map { $still_reachable{$_} = 1 }
671 $gpart->all_successors( $root );
672 foreach my $v ( keys %$reachable ) {
673 next if $v eq $h;
674 if( !$still_reachable{$v}
675 && ( $contig->{$v} eq $gst
676 || ( exists $still_contig->{$v}
677 && $still_contig->{$v} eq $gst ) ) ) {
678 # We need $h.
679 if( exists $still_contig->{$h} ) {
680 # Conflict!
681 $conflict->{$gst} = 1;
682 $still_contig->{$h} = '';
683 } else {
684 $still_contig->{$h} = $gst;
685 }
686 last;
687 } # else we don't need $h in this group.
688 } # end foreach $v
689 } # endif $h eq $root
690 } # end foreach $gst
691 } # end foreach $h
692
693 # Now we have some hypothetical vertices in $still_contig that are the
694 # "real" group memberships. Replace these in $contig.
695 foreach my $v ( keys %$contig ) {
696 next unless ref $contig->{$v};
697 $contig->{$v} = $still_contig->{$v};
698 }
699 } # end if @resolve
700
701 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
702 my $variant_row = [ [], $is_genealogical ];
703 # Fill in the groupings from $contig.
704 foreach my $g ( @$groups ) {
705 my $gst = wit_stringify( $g );
706 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
707 push( @{$variant_row->[0]}, \@realgroup );
708 }
709 return $variant_row;
710}
fae07016 711
6d25a3a0 712sub _prune_group {
713 my( $group, $stemma ) = @_;
714 # Get these into a form prune_subtree will recognize. Make a "contighash"
715 my $hypohash = {};
716 map { $hypohash->{$_} = 1 } @$group;
717 # ...with reference values for hypotheticals.
718 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
719 # Make our subgraph
720 my $subgraph = $stemma->graph->copy;
721 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
722 $subgraph->vertices;
723 # ...and find the root.
724 my( $root ) = $subgraph->predecessorless_vertices;
725 # Now prune and return the remaining vertices.
726 _prune_subtree( $subgraph, $root, $hypohash );
727 return $subgraph->vertices;
728}
729
7f52eac8 730sub _prune_subtree {
231d71fc 731 my( $tree, $root, $contighash ) = @_;
732 # First, delete hypothetical leaves / orphans until there are none left.
733 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
734 $tree->successorless_vertices;
735 while( @orphan_hypotheticals ) {
736 $tree->delete_vertices( @orphan_hypotheticals );
737 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
738 $tree->successorless_vertices;
739 }
740 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 741 # root to the first child that has no other predecessors.
231d71fc 742 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
743 my @nextroot = $tree->successors( $root );
744 $tree->delete_vertex( $root );
bebec0e9 745 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 746 }
747 # The tree has been modified in place, but we need to know the new root.
bebec0e9 748 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 749 return $root;
750}
d71100ed 751# Add the variant, subject to a.c. representation logic.
752# This assumes that we will see the 'main' version before the a.c. version.
753sub add_variant_wit {
754 my( $arr, $wit, $acstr ) = @_;
755 my $skip;
756 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
757 my $real = $1;
758 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
759 }
760 push( @$arr, $wit ) unless $skip;
761}
762
5be0cdeb 763sub _useful_variant {
764 my( $group_readings, $graph, $acstr ) = @_;
765
766 # TODO Decide what to do with AC witnesses
767
768 # Sort by group size and return
769 my $is_useful = 0;
770 my( @readings, @groups ); # The sorted groups for our answer.
771 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
772 keys %$group_readings ) {
773 push( @readings, $rdg );
774 push( @groups, $group_readings->{$rdg} );
775 if( @{$group_readings->{$rdg}} > 1 ) {
776 $is_useful++;
777 } else {
778 my( $wit ) = @{$group_readings->{$rdg}};
779 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
780 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
781 }
782 }
783 if( $is_useful > 1 ) {
784 return( \@readings, \@groups );
785 } else {
786 return( [], [] );
787 }
788}
789
7f52eac8 790=head2 wit_stringify( $groups )
791
792Takes an array of witness groupings and produces a string like
793['A','B'] / ['C','D','E'] / ['F']
d71100ed 794
7f52eac8 795=cut
d71100ed 796
797sub wit_stringify {
798 my $groups = shift;
799 my @gst;
800 # If we were passed an array of witnesses instead of an array of
801 # groupings, then "group" the witnesses first.
802 unless( ref( $groups->[0] ) ) {
803 my $mkgrp = [ $groups ];
804 $groups = $mkgrp;
805 }
806 foreach my $g ( @$groups ) {
807 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
808 }
809 return join( ' / ', @gst );
810}
7f52eac8 811
5be0cdeb 812# Helper function to ensure that X and X a.c. never appear in the same list.
813sub _add_to_witlist {
814 my( $wit, $list, $acstr ) = @_;
815 my %inlist;
816 my $idx = 0;
817 map { $inlist{$_} = $idx++ } @$list;
818 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
819 my $acwit = $1;
820 unless( exists $inlist{$acwit} ) {
821 push( @$list, $acwit.$acstr );
822 }
823 } else {
824 if( exists( $inlist{$wit.$acstr} ) ) {
825 # Replace the a.c. version with the main witness
826 my $i = $inlist{$wit.$acstr};
827 $list->[$i] = $wit;
828 } else {
829 push( @$list, $wit );
830 }
831 }
832}
833
bebec0e9 834sub _symmdiff {
835 my( $lista, $listb ) = @_;
7f52eac8 836 my %union;
837 my %scalars;
838 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
839 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 840 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 841 return map { $scalars{$_} } @set;
842}
843
8441;
845
846=head1 LICENSE
847
848This package is free software and is provided "as is" without express
849or implied warranty. You can redistribute it and/or modify it under
850the same terms as Perl itself.
851
852=head1 AUTHOR
853
854Tara L Andrews E<lt>aurum@cpan.orgE<gt>