make path labels do the right thing with a.c. witnesses
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
ffa22d6f 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
f00cefe8 107foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 108 # Account for rows that used to be "not useful"
109 unless( exists $expected_genealogical{$row->{'id'}} ) {
110 $expected_genealogical{$row->{'id'}} = 1;
111 }
6d25a3a0 112 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
f00cefe8 113 "Got correct genealogical flag for row " . $row->{'id'} );
114}
a44aaf2a 115is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 116# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 117
118=end testing
119
120=cut
121
d71100ed 122sub run_analysis {
88a6bac5 123 my( $tradition, %opts ) = @_;
f00cefe8 124 my $c = $tradition->collation;
88a6bac5 125
126 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 127 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
128 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 129
130 # Get the stemma
131 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 132
88a6bac5 133 # Figure out which witnesses we are working with
134 my @lacunose = $stemma->hypotheticals;
fae07016 135 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
b4cb2d60 136 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
fae07016 137 $tradition->witnesses;
138 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 139
140 # Find and mark 'common' ranks for exclusion, unless they were
141 # explicitly specified.
142 unless( @ranks ) {
143 my %common_rank;
a44aaf2a 144 foreach my $rdg ( $c->common_readings ) {
88a6bac5 145 $common_rank{$rdg->rank} = 1;
146 }
147 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 148 }
7f52eac8 149
88a6bac5 150 # Group the variants to send to the solver
151 my @groups;
f629cb3b 152 my @use_ranks;
a44aaf2a 153 my %lacunae;
88a6bac5 154 foreach my $rank ( @ranks ) {
f629cb3b 155 $DB::single = 1 if $rank == 1003;
a44aaf2a 156 my $missing = [ @lacunose ];
ffa22d6f 157 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
158 if( $opts{'exclude_type1'} ) {
159 # Check to see whether this is a "useful" group.
160 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
161 $stemma->graph, $c->ac_label );
162 next unless @$rdgs;
163 }
f629cb3b 164 push( @use_ranks, $rank );
ffa22d6f 165 push( @groups, $rankgroup );
a44aaf2a 166 $lacunae{$rank} = $missing;
d71100ed 167 }
88a6bac5 168 # Parse the answer
e59b8faa 169 my $answer = solve_variants( $stemma, @groups );
fae07016 170
88a6bac5 171 # Do further analysis on the answer
a44aaf2a 172 my $conflict_count = 0;
f629cb3b 173 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 174 my $location = $answer->{'variants'}->[$idx];
175 # Add the rank back in
f629cb3b 176 $location->{'id'} = $use_ranks[$idx];
a44aaf2a 177 # Add the lacunae back in
f629cb3b 178 $location->{'missing'} = $lacunae{$use_ranks[$idx]};
179 my %lmiss;
180 map { $lmiss{$_} = 1 } @{$location->{'missing'}};
88a6bac5 181 # Run the extra analysis we need.
88a6bac5 182 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 183 foreach my $rdghash ( @{$location->{'readings'}} ) {
184 $conflict_count++
185 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
f629cb3b 186 # Add the reading text back in
a44aaf2a 187 my $rdg = $c->reading( $rdghash->{'readingid'} );
188 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
f629cb3b 189 # Remove lacunose witnesses from this reading's list now that the
190 # analysis is done
191 my @realgroup;
192 map { push( @realgroup, $_ ) unless $lmiss{$_} } $rdghash->{'group'};
193 $rdghash->{'group'} = \@realgroup;
a44aaf2a 194 }
88a6bac5 195 }
a44aaf2a 196 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 197
88a6bac5 198 return $answer;
d71100ed 199}
200
7f52eac8 201=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
202
203Groups the variants at the given $rank of the collation, treating any
204relationships in @merge_relationship_types as equivalent. $lacunose should
205be a reference to an array, to which the sigla of lacunose witnesses at this
206rank will be appended.
207
ffa22d6f 208Returns a hash $group_readings where $rdg is attested by the witnesses listed
209in $group_readings->{$rdg}.
7f52eac8 210
211=cut
212
213# Return group_readings, groups, lacunose
d1348d38 214sub group_variants {
7f52eac8 215 my( $tradition, $rank, $lacunose, $collapse ) = @_;
216 my $c = $tradition->collation;
b4cb2d60 217 my $aclabel = $c->ac_label;
7f52eac8 218 # Get the alignment table readings
219 my %readings_at_rank;
ffa22d6f 220 my %is_lacunose; # lookup table for $lacunose
221 map { $is_lacunose{$_} = 1 } @$lacunose;
7f52eac8 222 my @gap_wits;
1d73ecad 223 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 224 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 225 my $wit = $tablewit->{'witness'};
ffa22d6f 226 # Exclude the witness if it is "lacunose" which if we got here
227 # means "not in the stemma".
228 next if $is_lacunose{$wit};
7f52eac8 229 if( $rdg && $rdg->{'t'}->is_lacuna ) {
b4cb2d60 230 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 231 } elsif( $rdg ) {
232 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
233 } else {
b4cb2d60 234 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 235 }
236 }
d1348d38 237
7f52eac8 238 # Group the readings, collapsing groups by relationship if needed
239 my %grouped_readings;
b4cb2d60 240 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
241 values %readings_at_rank ) {
7f52eac8 242 # Skip readings that have been collapsed into others.
f00cefe8 243 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 244 my @wits = $rdg->witnesses;
245 if( $collapse ) {
246 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
247 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 248 my @otherwits = $other->witnesses;
fae07016 249 push( @wits, @otherwits );
f00cefe8 250 $grouped_readings{$other->id} = 0;
d1348d38 251 }
252 }
ffa22d6f 253 my @use_wits = grep { !$is_lacunose{$_} } @wits;
254 $grouped_readings{$rdg->id} = \@use_wits;
7f52eac8 255 }
256 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
257 # Get rid of our collapsed readings
258 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
259 keys %grouped_readings
260 if $collapse;
261
5be0cdeb 262 return \%grouped_readings;
d1348d38 263}
264
88a6bac5 265=head2 solve_variants( $graph, @groups )
266
267Sends the set of groups to the external graph solver service and returns
268a cleaned-up answer, adding the rank IDs back where they belong.
269
270The JSON has the form
271 { "graph": [ stemmagraph DOT string without newlines ],
272 "groupings": [ array of arrays of groups, one per rank ] }
273
274The answer has the form
275 { "variants" => [ array of variant location structures ],
276 "variant_count" => total,
277 "conflict_count" => number of conflicts detected,
278 "genealogical_count" => number of solutions found }
279
280=cut
281
282sub solve_variants {
e59b8faa 283 my( $stemma, @groups ) = @_;
88a6bac5 284
285 # Make the json with stemma + groups
b4cb2d60 286 my $groupings = [];
88a6bac5 287 foreach my $ghash ( @groups ) {
288 my @grouping;
6d25a3a0 289 foreach my $k ( keys %$ghash ) {
88a6bac5 290 push( @grouping, $ghash->{$k} );
291 }
b4cb2d60 292 push( @$groupings, \@grouping );
88a6bac5 293 }
b4cb2d60 294 ## Witness map is a HACK to get around limitations in node names from IDP
295 my $witness_map = {};
296 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
88a6bac5 297
298 # Send it off and get the result
299 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
300 my $ua = LWP::UserAgent->new();
301 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
302 'Content' => $json );
303
304 my $answer;
6d25a3a0 305 my $used_idp;
88a6bac5 306 if( $resp->is_success ) {
b4cb2d60 307 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
6d25a3a0 308 $used_idp = 1;
88a6bac5 309 } else {
fae07016 310 # Fall back to the old method.
311 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
312 . "; falling back to perl method";
b4cb2d60 313 $answer = perl_solver( $stemma, @$groupings );
88a6bac5 314 }
315
316 # Fold the result back into what we know about the groups.
317 my $variants = [];
318 my $genealogical = 0;
319 foreach my $idx ( 0 .. $#groups ) {
320 my( $calc_groups, $result ) = @{$answer->[$idx]};
6d25a3a0 321 if( $result ) {
322 $genealogical++;
323 # Prune the calculated groups, in case the IDP solver failed to.
324 if( $used_idp ) {
325 my @pruned_groups;
326 foreach my $cg ( @$calc_groups ) {
327 my @pg = _prune_group( $cg, $stemma );
328 push( @pruned_groups, \@pg );
329 }
330 $calc_groups = \@pruned_groups;
331 }
332 }
88a6bac5 333 my $input_group = $groups[$idx];
f629cb3b 334 foreach my $k ( keys %$input_group ) {
88a6bac5 335 my $cg = shift @$calc_groups;
336 $input_group->{$k} = $cg;
337 }
338 my $vstruct = {
339 'genealogical' => $result,
340 'readings' => [],
fae07016 341 };
f629cb3b 342 foreach my $k ( sort { @{$input_group->{$b}} <=> @{$input_group->{$a}} }
343 keys %$input_group ) {
88a6bac5 344 push( @{$vstruct->{'readings'}},
fae07016 345 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 346 }
347 push( @$variants, $vstruct );
348 }
349
350 return { 'variants' => $variants,
351 'variant_count' => scalar @$variants,
352 'genealogical_count' => $genealogical };
353}
354
b4cb2d60 355#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
356
357sub _safe_wit_strings {
358 my( $stemma, $groupings, $witness_map ) = @_;
359 my $safegraph = Graph->new();
360 # Convert the graph to a safe representation and store the conversion.
361 foreach my $n ( $stemma->graph->vertices ) {
362 my $sn = _safe_witstr( $n );
363 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
364 if exists $witness_map->{$sn};
365 $witness_map->{$sn} = $n;
366 $safegraph->add_vertex( $sn );
367 $safegraph->set_vertex_attributes( $sn,
368 $stemma->graph->get_vertex_attributes( $n ) );
369 }
370 foreach my $e ( $stemma->graph->edges ) {
371 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
372 $safegraph->add_edge( @safe_e );
373 }
374 my $safe_stemma = Text::Tradition::Stemma->new(
375 'collation' => $stemma->collation, 'graph' => $safegraph );
376
377 # Now convert the witness groupings to a safe representation.
378 my $safe_groupings = [];
379 foreach my $grouping ( @$groupings ) {
380 my $safe_grouping = [];
381 foreach my $group ( @$grouping ) {
382 my $safe_group = [];
383 foreach my $n ( @$group ) {
384 my $sn = _safe_witstr( $n );
385 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
386 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
387 $witness_map->{$sn} = $n;
388 push( @$safe_group, $sn );
389 }
390 push( @$safe_grouping, $safe_group );
391 }
392 push( @$safe_groupings, $safe_grouping );
393 }
394
395 # Return it all in the struct we expect. We have stored the reductions
396 # in the $witness_map that we were passed.
397 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
398}
399
400sub _safe_witstr {
401 my $witstr = shift;
402 $witstr =~ s/\s+/_/g;
403 $witstr =~ s/[^\w\d-]//g;
404 return $witstr;
405}
406
407sub _desanitize_names {
408 my( $jsonstruct, $witness_map ) = @_;
409 my $result = [];
410 foreach my $grouping ( @$jsonstruct ) {
411 my $real_grouping = [];
412 foreach my $element ( @$grouping ) {
413 if( ref( $element ) eq 'ARRAY' ) {
414 # it's the groupset.
415 my $real_groupset = [];
416 foreach my $group ( @$element ) {
417 my $real_group = [];
418 foreach my $n ( @$group ) {
419 my $rn = $witness_map->{$n};
420 push( @$real_group, $rn );
421 }
422 push( @$real_groupset, $real_group );
423 }
424 push( @$real_grouping, $real_groupset );
425 } else {
426 # It is the boolean, not actually a group.
427 push( @$real_grouping, $element );
428 }
429 }
430 push( @$result, $real_grouping );
431 }
432 return $result;
433}
434
435### END HACKERY ###
436
fae07016 437=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 438
fae07016 439Given the tradition, its stemma graph, and the solution from the graph solver,
440work out the rest of the information we want. For each reading we need missing,
441conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 442
443=cut
732152b1 444
fae07016 445sub analyze_location {
446 my ( $tradition, $graph, $variant_row ) = @_;
447
448 # Make a hash of all known node memberships, and make the subgraphs.
449 my $contig = {};
450 my $reading_roots = {};
451 my $subgraph = {};
452 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
453 my $rid = $rdghash->{'readingid'};
454 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 455
fae07016 456 # Make the subgraph.
457 my $part = $graph->copy;
458 my %these_vertices;
459 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
460 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
461 $subgraph->{$rid} = $part;
462 # Get the reading roots.
463 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 464 }
465
fae07016 466 # Now that we have all the node group memberships, calculate followed/
bebec0e9 467 # non-followed/unknown values for each reading. Also figure out the
468 # reading's evident parent(s).
469 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 470 # Group string key - TODO do we need this?
471 my $gst = wit_stringify( $rdghash->{'group'} );
472 my $rid = $rdghash->{'readingid'};
473 # Get the subgraph
474 my $part = $subgraph->{$rid};
475
476 # Start figuring things out.
bebec0e9 477 my @roots = $part->predecessorless_vertices;
478 $rdghash->{'independent_occurrence'} = scalar @roots;
479 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
480 # Find the parent readings, if any, of this reading.
f00cefe8 481 my %rdgparents;
bebec0e9 482 foreach my $wit ( @roots ) {
f00cefe8 483 # Look in the main stemma to find this witness's extant or known-reading
484 # immediate ancestor(s), and look up the reading that each ancestor olds.
485 my @check = $graph->predecessors( $wit );
486 while( @check ) {
487 my @next;
488 foreach my $wparent( @check ) {
fae07016 489 my $preading = $contig->{$wparent};
490 if( $preading ) {
491 $rdgparents{$preading} = 1;
f00cefe8 492 } else {
493 push( @next, $graph->predecessors( $wparent ) );
494 }
495 }
496 @check = @next;
497 }
bebec0e9 498 }
f00cefe8 499 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 500
501 # Find the number of times this reading was altered, and the number of
502 # times we're not sure.
503 my( %nofollow, %unknownfollow );
504 foreach my $wit ( $part->vertices ) {
505 foreach my $wchild ( $graph->successors( $wit ) ) {
506 next if $part->has_vertex( $wchild );
fae07016 507 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 508 # It definitely changed here.
509 $nofollow{$wchild} = 1;
510 } elsif( !($contig->{$wchild}) ) {
511 # The child is a hypothetical node not definitely in
512 # any group. Answer is unknown.
513 $unknownfollow{$wchild} = 1;
514 } # else it's a non-root node in a known group, and therefore
515 # is presumed to have its reading from its group, not this link.
516 }
517 }
518 $rdghash->{'not_followed'} = keys %nofollow;
519 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 520
521 # Now say whether this reading represents a conflict.
522 unless( $variant_row->{'genealogical'} ) {
523 $rdghash->{'conflict'} = @roots != 1;
524 }
c4a4fb1b 525 }
d71100ed 526}
527
fae07016 528
529=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
530
531** NOTE ** This method should hopefully not be called - it is not guaranteed
532to be correct. Serves as a backup for the real solver.
533
534Runs an analysis of the given tradition, at the location given in $rank,
535against the graph of the stemma specified in $stemma_id. The argument
536@merge_relationship_types is an optional list of relationship types for
537which readings so related should be treated as equivalent.
538
539Returns a nested array data structure as follows:
540
541 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
542
543where the group list is the array of arrays passed in for each element of @groups,
544possibly with the addition of hypothetical readings.
545
546
547=cut
548
549sub perl_solver {
e59b8faa 550 my( $stemma, @groups ) = @_;
551 my $graph = $stemma->graph;
552 my @answer;
553 foreach my $g ( @groups ) {
554 push( @answer, _solve_variant_location( $graph, $g ) );
555 }
556 return \@answer;
fae07016 557}
558
e59b8faa 559sub _solve_variant_location {
560 my( $graph, $groups ) = @_;
fae07016 561 # Now do the work.
e59b8faa 562 my $contig = {};
563 my $subgraph = {};
564 my $is_conflicted;
565 my $conflict = {};
566
567 # Mark each ms as in its own group, first.
568 foreach my $g ( @$groups ) {
569 my $gst = wit_stringify( $g );
570 map { $contig->{$_} = $gst } @$g;
571 }
572
573 # Now for each unmarked node in the graph, initialize an array
574 # for possible group memberships. We will use this later to
575 # resolve potential conflicts.
576 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
577 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
578 my $gst = wit_stringify( $g ); # This is the group name
579 # Copy the graph, and delete all non-members from the new graph.
580 my $part = $graph->copy;
581 my @group_roots;
582 $part->delete_vertices(
583 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
584
585 # Now look to see if our group is connected.
586 if( @$g > 1 ) {
587 # We have to take directionality into account.
588 # How many root nodes do we have?
589 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
590 $part->predecessorless_vertices;
591 # Assuming that @$g > 1, find the first root node that has at
592 # least one successor belonging to our group. If this reading
593 # is genealogical, there should be only one, but we will check
594 # that implicitly later.
595 foreach my $root ( @roots ) {
596 # Prune the tree to get rid of extraneous hypotheticals.
597 $root = _prune_subtree( $part, $root, $contig );
598 next unless $root;
599 # Save this root for our group.
600 push( @group_roots, $root );
601 # Get all the successor nodes of our root.
602 }
603 } else {
604 # Dispense with the trivial case of one reading.
605 my $wit = $g->[0];
606 @group_roots = ( $wit );
607 foreach my $v ( $part->vertices ) {
608 $part->delete_vertex( $v ) unless $v eq $wit;
609 }
610 }
611
612 if( @group_roots > 1 ) {
613 $conflict->{$gst} = 1;
614 $is_conflicted = 1;
615 }
616 # Paint the 'hypotheticals' with our group.
617 foreach my $wit ( $part->vertices ) {
618 if( ref( $contig->{$wit} ) ) {
619 push( @{$contig->{$wit}}, $gst );
620 } elsif( $contig->{$wit} ne $gst ) {
621 warn "How did we get here?";
622 }
623 }
624
625
626 # Save the relevant subgraph.
627 $subgraph->{$gst} = $part;
628 }
629
630 # For each of our hypothetical readings, flatten its 'contig' array if
631 # the array contains zero or one group. If we have any unflattened arrays,
632 # we may need to run the resolution process. If the reading is already known
633 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
634 # it.
635 my @resolve;
636 foreach my $wit ( keys %$contig ) {
637 next unless ref( $contig->{$wit} );
638 if( @{$contig->{$wit}} > 1 ) {
639 if( $is_conflicted ) {
640 $contig->{$wit} = ''; # We aren't going to decide.
641 } else {
642 push( @resolve, $wit );
643 }
644 } else {
645 my $gst = pop @{$contig->{$wit}};
646 $contig->{$wit} = $gst || '';
647 }
648 }
649
650 if( @resolve ) {
651 my $still_contig = {};
652 foreach my $h ( @resolve ) {
653 # For each of the hypothetical readings with more than one possibility,
654 # try deleting it from each of its member subgraphs in turn, and see
655 # if that breaks the contiguous grouping.
656 # TODO This can still break in a corner case where group A can use
657 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
658 # Revisit this if necessary; it could get brute-force nasty.
659 foreach my $gst ( @{$contig->{$h}} ) {
660 my $gpart = $subgraph->{$gst}->copy();
661 # If we have come this far, there is only one root and everything
662 # is reachable from it.
663 my( $root ) = $gpart->predecessorless_vertices;
664 my $reachable = {};
665 map { $reachable->{$_} = 1 } $gpart->vertices;
666
667 # Try deleting the hypothetical node.
668 $gpart->delete_vertex( $h );
669 if( $h eq $root ) {
670 # See if we still have a single root.
671 my @roots = $gpart->predecessorless_vertices;
672 warn "This shouldn't have happened" unless @roots;
673 if( @roots > 1 ) {
674 # $h is needed by this group.
675 if( exists( $still_contig->{$h} ) ) {
676 # Conflict!
677 $conflict->{$gst} = 1;
678 $still_contig->{$h} = '';
679 } else {
680 $still_contig->{$h} = $gst;
681 }
682 }
683 } else {
684 # $h is somewhere in the middle. See if everything
685 # else can still be reached from the root.
686 my %still_reachable = ( $root => 1 );
687 map { $still_reachable{$_} = 1 }
688 $gpart->all_successors( $root );
689 foreach my $v ( keys %$reachable ) {
690 next if $v eq $h;
691 if( !$still_reachable{$v}
692 && ( $contig->{$v} eq $gst
693 || ( exists $still_contig->{$v}
694 && $still_contig->{$v} eq $gst ) ) ) {
695 # We need $h.
696 if( exists $still_contig->{$h} ) {
697 # Conflict!
698 $conflict->{$gst} = 1;
699 $still_contig->{$h} = '';
700 } else {
701 $still_contig->{$h} = $gst;
702 }
703 last;
704 } # else we don't need $h in this group.
705 } # end foreach $v
706 } # endif $h eq $root
707 } # end foreach $gst
708 } # end foreach $h
709
710 # Now we have some hypothetical vertices in $still_contig that are the
711 # "real" group memberships. Replace these in $contig.
712 foreach my $v ( keys %$contig ) {
713 next unless ref $contig->{$v};
714 $contig->{$v} = $still_contig->{$v};
715 }
716 } # end if @resolve
717
718 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
719 my $variant_row = [ [], $is_genealogical ];
720 # Fill in the groupings from $contig.
721 foreach my $g ( @$groups ) {
722 my $gst = wit_stringify( $g );
723 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
724 push( @{$variant_row->[0]}, \@realgroup );
725 }
726 return $variant_row;
727}
fae07016 728
6d25a3a0 729sub _prune_group {
730 my( $group, $stemma ) = @_;
731 # Get these into a form prune_subtree will recognize. Make a "contighash"
732 my $hypohash = {};
733 map { $hypohash->{$_} = 1 } @$group;
734 # ...with reference values for hypotheticals.
735 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
736 # Make our subgraph
737 my $subgraph = $stemma->graph->copy;
738 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
739 $subgraph->vertices;
740 # ...and find the root.
741 my( $root ) = $subgraph->predecessorless_vertices;
742 # Now prune and return the remaining vertices.
743 _prune_subtree( $subgraph, $root, $hypohash );
744 return $subgraph->vertices;
745}
746
7f52eac8 747sub _prune_subtree {
231d71fc 748 my( $tree, $root, $contighash ) = @_;
749 # First, delete hypothetical leaves / orphans until there are none left.
750 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
751 $tree->successorless_vertices;
752 while( @orphan_hypotheticals ) {
753 $tree->delete_vertices( @orphan_hypotheticals );
754 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
755 $tree->successorless_vertices;
756 }
757 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 758 # root to the first child that has no other predecessors.
231d71fc 759 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
760 my @nextroot = $tree->successors( $root );
761 $tree->delete_vertex( $root );
bebec0e9 762 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 763 }
764 # The tree has been modified in place, but we need to know the new root.
bebec0e9 765 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 766 return $root;
767}
d71100ed 768# Add the variant, subject to a.c. representation logic.
769# This assumes that we will see the 'main' version before the a.c. version.
770sub add_variant_wit {
771 my( $arr, $wit, $acstr ) = @_;
772 my $skip;
773 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
774 my $real = $1;
775 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
776 }
777 push( @$arr, $wit ) unless $skip;
778}
779
5be0cdeb 780sub _useful_variant {
781 my( $group_readings, $graph, $acstr ) = @_;
782
783 # TODO Decide what to do with AC witnesses
784
785 # Sort by group size and return
786 my $is_useful = 0;
787 my( @readings, @groups ); # The sorted groups for our answer.
788 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
789 keys %$group_readings ) {
790 push( @readings, $rdg );
791 push( @groups, $group_readings->{$rdg} );
792 if( @{$group_readings->{$rdg}} > 1 ) {
793 $is_useful++;
794 } else {
795 my( $wit ) = @{$group_readings->{$rdg}};
796 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
797 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
798 }
799 }
800 if( $is_useful > 1 ) {
801 return( \@readings, \@groups );
802 } else {
803 return( [], [] );
804 }
805}
806
7f52eac8 807=head2 wit_stringify( $groups )
808
809Takes an array of witness groupings and produces a string like
810['A','B'] / ['C','D','E'] / ['F']
d71100ed 811
7f52eac8 812=cut
d71100ed 813
814sub wit_stringify {
815 my $groups = shift;
816 my @gst;
817 # If we were passed an array of witnesses instead of an array of
818 # groupings, then "group" the witnesses first.
819 unless( ref( $groups->[0] ) ) {
820 my $mkgrp = [ $groups ];
821 $groups = $mkgrp;
822 }
823 foreach my $g ( @$groups ) {
824 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
825 }
826 return join( ' / ', @gst );
827}
7f52eac8 828
5be0cdeb 829# Helper function to ensure that X and X a.c. never appear in the same list.
830sub _add_to_witlist {
831 my( $wit, $list, $acstr ) = @_;
832 my %inlist;
833 my $idx = 0;
834 map { $inlist{$_} = $idx++ } @$list;
835 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
836 my $acwit = $1;
837 unless( exists $inlist{$acwit} ) {
838 push( @$list, $acwit.$acstr );
839 }
840 } else {
841 if( exists( $inlist{$wit.$acstr} ) ) {
842 # Replace the a.c. version with the main witness
843 my $i = $inlist{$wit.$acstr};
844 $list->[$i] = $wit;
845 } else {
846 push( @$list, $wit );
847 }
848 }
849}
850
bebec0e9 851sub _symmdiff {
852 my( $lista, $listb ) = @_;
7f52eac8 853 my %union;
854 my %scalars;
855 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
856 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 857 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 858 return map { $scalars{$_} } @set;
859}
860
8611;
862
863=head1 LICENSE
864
865This package is free software and is provided "as is" without express
866or implied warranty. You can redistribute it and/or modify it under
867the same terms as Perl itself.
868
869=head1 AUTHOR
870
871Tara L Andrews E<lt>aurum@cpan.orgE<gt>