only show 'interesting' variants in the stexaminer; send only witnesses
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
ffa22d6f 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
f00cefe8 107foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 108 # Account for rows that used to be "not useful"
109 unless( exists $expected_genealogical{$row->{'id'}} ) {
110 $expected_genealogical{$row->{'id'}} = 1;
111 }
6d25a3a0 112 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
f00cefe8 113 "Got correct genealogical flag for row " . $row->{'id'} );
114}
a44aaf2a 115is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 116# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 117
118=end testing
119
120=cut
121
d71100ed 122sub run_analysis {
88a6bac5 123 my( $tradition, %opts ) = @_;
f00cefe8 124 my $c = $tradition->collation;
88a6bac5 125
126 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 127 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
128 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 129
130 # Get the stemma
131 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 132
88a6bac5 133 # Figure out which witnesses we are working with
134 my @lacunose = $stemma->hypotheticals;
fae07016 135 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
b4cb2d60 136 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
fae07016 137 $tradition->witnesses;
138 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 139
140 # Find and mark 'common' ranks for exclusion, unless they were
141 # explicitly specified.
142 unless( @ranks ) {
143 my %common_rank;
a44aaf2a 144 foreach my $rdg ( $c->common_readings ) {
88a6bac5 145 $common_rank{$rdg->rank} = 1;
146 }
147 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 148 }
7f52eac8 149
88a6bac5 150 # Group the variants to send to the solver
151 my @groups;
a44aaf2a 152 my %lacunae;
88a6bac5 153 foreach my $rank ( @ranks ) {
a44aaf2a 154 my $missing = [ @lacunose ];
ffa22d6f 155 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
156 if( $opts{'exclude_type1'} ) {
157 # Check to see whether this is a "useful" group.
158 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
159 $stemma->graph, $c->ac_label );
160 next unless @$rdgs;
161 }
162 push( @groups, $rankgroup );
a44aaf2a 163 $lacunae{$rank} = $missing;
d71100ed 164 }
b4cb2d60 165 $DB::single = 1;
88a6bac5 166 # Parse the answer
e59b8faa 167 my $answer = solve_variants( $stemma, @groups );
fae07016 168
88a6bac5 169 # Do further analysis on the answer
a44aaf2a 170 my $conflict_count = 0;
88a6bac5 171 foreach my $idx ( 0 .. $#ranks ) {
172 my $location = $answer->{'variants'}->[$idx];
173 # Add the rank back in
174 $location->{'id'} = $ranks[$idx];
a44aaf2a 175 # Add the lacunae back in
176 $location->{'missing'} = $lacunae{$ranks[$idx]};
88a6bac5 177 # Run the extra analysis we need.
88a6bac5 178 analyze_location( $tradition, $stemma->graph, $location );
a44aaf2a 179 # Add the reading text back in
180 foreach my $rdghash ( @{$location->{'readings'}} ) {
181 $conflict_count++
182 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
183 my $rdg = $c->reading( $rdghash->{'readingid'} );
184 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
185 }
88a6bac5 186 }
a44aaf2a 187 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 188
88a6bac5 189 return $answer;
d71100ed 190}
191
7f52eac8 192=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
193
194Groups the variants at the given $rank of the collation, treating any
195relationships in @merge_relationship_types as equivalent. $lacunose should
196be a reference to an array, to which the sigla of lacunose witnesses at this
197rank will be appended.
198
ffa22d6f 199Returns a hash $group_readings where $rdg is attested by the witnesses listed
200in $group_readings->{$rdg}.
7f52eac8 201
202=cut
203
204# Return group_readings, groups, lacunose
d1348d38 205sub group_variants {
7f52eac8 206 my( $tradition, $rank, $lacunose, $collapse ) = @_;
207 my $c = $tradition->collation;
b4cb2d60 208 my $aclabel = $c->ac_label;
7f52eac8 209 # Get the alignment table readings
210 my %readings_at_rank;
ffa22d6f 211 my %is_lacunose; # lookup table for $lacunose
212 map { $is_lacunose{$_} = 1 } @$lacunose;
7f52eac8 213 my @gap_wits;
1d73ecad 214 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 215 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 216 my $wit = $tablewit->{'witness'};
ffa22d6f 217 # Exclude the witness if it is "lacunose" which if we got here
218 # means "not in the stemma".
219 next if $is_lacunose{$wit};
7f52eac8 220 if( $rdg && $rdg->{'t'}->is_lacuna ) {
b4cb2d60 221 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 222 } elsif( $rdg ) {
223 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
224 } else {
b4cb2d60 225 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 226 }
227 }
d1348d38 228
7f52eac8 229 # Group the readings, collapsing groups by relationship if needed
230 my %grouped_readings;
b4cb2d60 231 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
232 values %readings_at_rank ) {
7f52eac8 233 # Skip readings that have been collapsed into others.
f00cefe8 234 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 235 my @wits = $rdg->witnesses;
236 if( $collapse ) {
237 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
238 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 239 my @otherwits = $other->witnesses;
fae07016 240 push( @wits, @otherwits );
f00cefe8 241 $grouped_readings{$other->id} = 0;
d1348d38 242 }
243 }
ffa22d6f 244 my @use_wits = grep { !$is_lacunose{$_} } @wits;
245 $grouped_readings{$rdg->id} = \@use_wits;
7f52eac8 246 }
247 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
248 # Get rid of our collapsed readings
249 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
250 keys %grouped_readings
251 if $collapse;
252
5be0cdeb 253 return \%grouped_readings;
d1348d38 254}
255
88a6bac5 256=head2 solve_variants( $graph, @groups )
257
258Sends the set of groups to the external graph solver service and returns
259a cleaned-up answer, adding the rank IDs back where they belong.
260
261The JSON has the form
262 { "graph": [ stemmagraph DOT string without newlines ],
263 "groupings": [ array of arrays of groups, one per rank ] }
264
265The answer has the form
266 { "variants" => [ array of variant location structures ],
267 "variant_count" => total,
268 "conflict_count" => number of conflicts detected,
269 "genealogical_count" => number of solutions found }
270
271=cut
272
273sub solve_variants {
e59b8faa 274 my( $stemma, @groups ) = @_;
88a6bac5 275
276 # Make the json with stemma + groups
b4cb2d60 277 my $groupings = [];
88a6bac5 278 foreach my $ghash ( @groups ) {
279 my @grouping;
6d25a3a0 280 foreach my $k ( keys %$ghash ) {
88a6bac5 281 push( @grouping, $ghash->{$k} );
282 }
b4cb2d60 283 push( @$groupings, \@grouping );
88a6bac5 284 }
b4cb2d60 285 ## Witness map is a HACK to get around limitations in node names from IDP
286 my $witness_map = {};
287 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
88a6bac5 288
289 # Send it off and get the result
290 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
291 my $ua = LWP::UserAgent->new();
292 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
293 'Content' => $json );
294
295 my $answer;
6d25a3a0 296 my $used_idp;
88a6bac5 297 if( $resp->is_success ) {
b4cb2d60 298 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
6d25a3a0 299 $used_idp = 1;
88a6bac5 300 } else {
fae07016 301 # Fall back to the old method.
302 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
303 . "; falling back to perl method";
b4cb2d60 304 $answer = perl_solver( $stemma, @$groupings );
88a6bac5 305 }
306
307 # Fold the result back into what we know about the groups.
308 my $variants = [];
309 my $genealogical = 0;
310 foreach my $idx ( 0 .. $#groups ) {
311 my( $calc_groups, $result ) = @{$answer->[$idx]};
6d25a3a0 312 if( $result ) {
313 $genealogical++;
314 # Prune the calculated groups, in case the IDP solver failed to.
315 if( $used_idp ) {
316 my @pruned_groups;
317 foreach my $cg ( @$calc_groups ) {
318 my @pg = _prune_group( $cg, $stemma );
319 push( @pruned_groups, \@pg );
320 }
321 $calc_groups = \@pruned_groups;
322 }
323 }
88a6bac5 324 my $input_group = $groups[$idx];
325 foreach my $k ( sort keys %$input_group ) {
326 my $cg = shift @$calc_groups;
327 $input_group->{$k} = $cg;
328 }
329 my $vstruct = {
330 'genealogical' => $result,
331 'readings' => [],
fae07016 332 };
88a6bac5 333 foreach my $k ( keys %$input_group ) {
334 push( @{$vstruct->{'readings'}},
fae07016 335 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 336 }
337 push( @$variants, $vstruct );
338 }
339
340 return { 'variants' => $variants,
341 'variant_count' => scalar @$variants,
342 'genealogical_count' => $genealogical };
343}
344
b4cb2d60 345#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
346
347sub _safe_wit_strings {
348 my( $stemma, $groupings, $witness_map ) = @_;
349 my $safegraph = Graph->new();
350 # Convert the graph to a safe representation and store the conversion.
351 foreach my $n ( $stemma->graph->vertices ) {
352 my $sn = _safe_witstr( $n );
353 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
354 if exists $witness_map->{$sn};
355 $witness_map->{$sn} = $n;
356 $safegraph->add_vertex( $sn );
357 $safegraph->set_vertex_attributes( $sn,
358 $stemma->graph->get_vertex_attributes( $n ) );
359 }
360 foreach my $e ( $stemma->graph->edges ) {
361 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
362 $safegraph->add_edge( @safe_e );
363 }
364 my $safe_stemma = Text::Tradition::Stemma->new(
365 'collation' => $stemma->collation, 'graph' => $safegraph );
366
367 # Now convert the witness groupings to a safe representation.
368 my $safe_groupings = [];
369 foreach my $grouping ( @$groupings ) {
370 my $safe_grouping = [];
371 foreach my $group ( @$grouping ) {
372 my $safe_group = [];
373 foreach my $n ( @$group ) {
374 my $sn = _safe_witstr( $n );
375 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
376 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
377 $witness_map->{$sn} = $n;
378 push( @$safe_group, $sn );
379 }
380 push( @$safe_grouping, $safe_group );
381 }
382 push( @$safe_groupings, $safe_grouping );
383 }
384
385 # Return it all in the struct we expect. We have stored the reductions
386 # in the $witness_map that we were passed.
387 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
388}
389
390sub _safe_witstr {
391 my $witstr = shift;
392 $witstr =~ s/\s+/_/g;
393 $witstr =~ s/[^\w\d-]//g;
394 return $witstr;
395}
396
397sub _desanitize_names {
398 my( $jsonstruct, $witness_map ) = @_;
399 my $result = [];
400 foreach my $grouping ( @$jsonstruct ) {
401 my $real_grouping = [];
402 foreach my $element ( @$grouping ) {
403 if( ref( $element ) eq 'ARRAY' ) {
404 # it's the groupset.
405 my $real_groupset = [];
406 foreach my $group ( @$element ) {
407 my $real_group = [];
408 foreach my $n ( @$group ) {
409 my $rn = $witness_map->{$n};
410 push( @$real_group, $rn );
411 }
412 push( @$real_groupset, $real_group );
413 }
414 push( @$real_grouping, $real_groupset );
415 } else {
416 # It is the boolean, not actually a group.
417 push( @$real_grouping, $element );
418 }
419 }
420 push( @$result, $real_grouping );
421 }
422 return $result;
423}
424
425### END HACKERY ###
426
fae07016 427=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 428
fae07016 429Given the tradition, its stemma graph, and the solution from the graph solver,
430work out the rest of the information we want. For each reading we need missing,
431conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 432
433=cut
732152b1 434
fae07016 435sub analyze_location {
436 my ( $tradition, $graph, $variant_row ) = @_;
437
438 # Make a hash of all known node memberships, and make the subgraphs.
439 my $contig = {};
440 my $reading_roots = {};
441 my $subgraph = {};
442 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
443 my $rid = $rdghash->{'readingid'};
444 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 445
fae07016 446 # Make the subgraph.
447 my $part = $graph->copy;
448 my %these_vertices;
449 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
450 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
451 $subgraph->{$rid} = $part;
452 # Get the reading roots.
453 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 454 }
455
fae07016 456 # Now that we have all the node group memberships, calculate followed/
bebec0e9 457 # non-followed/unknown values for each reading. Also figure out the
458 # reading's evident parent(s).
459 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 460 # Group string key - TODO do we need this?
461 my $gst = wit_stringify( $rdghash->{'group'} );
462 my $rid = $rdghash->{'readingid'};
463 # Get the subgraph
464 my $part = $subgraph->{$rid};
465
466 # Start figuring things out.
bebec0e9 467 my @roots = $part->predecessorless_vertices;
468 $rdghash->{'independent_occurrence'} = scalar @roots;
469 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
470 # Find the parent readings, if any, of this reading.
f00cefe8 471 my %rdgparents;
bebec0e9 472 foreach my $wit ( @roots ) {
f00cefe8 473 # Look in the main stemma to find this witness's extant or known-reading
474 # immediate ancestor(s), and look up the reading that each ancestor olds.
475 my @check = $graph->predecessors( $wit );
476 while( @check ) {
477 my @next;
478 foreach my $wparent( @check ) {
fae07016 479 my $preading = $contig->{$wparent};
480 if( $preading ) {
481 $rdgparents{$preading} = 1;
f00cefe8 482 } else {
483 push( @next, $graph->predecessors( $wparent ) );
484 }
485 }
486 @check = @next;
487 }
bebec0e9 488 }
f00cefe8 489 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 490
491 # Find the number of times this reading was altered, and the number of
492 # times we're not sure.
493 my( %nofollow, %unknownfollow );
494 foreach my $wit ( $part->vertices ) {
495 foreach my $wchild ( $graph->successors( $wit ) ) {
496 next if $part->has_vertex( $wchild );
fae07016 497 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 498 # It definitely changed here.
499 $nofollow{$wchild} = 1;
500 } elsif( !($contig->{$wchild}) ) {
501 # The child is a hypothetical node not definitely in
502 # any group. Answer is unknown.
503 $unknownfollow{$wchild} = 1;
504 } # else it's a non-root node in a known group, and therefore
505 # is presumed to have its reading from its group, not this link.
506 }
507 }
508 $rdghash->{'not_followed'} = keys %nofollow;
509 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 510
511 # Now say whether this reading represents a conflict.
512 unless( $variant_row->{'genealogical'} ) {
513 $rdghash->{'conflict'} = @roots != 1;
514 }
c4a4fb1b 515 }
d71100ed 516}
517
fae07016 518
519=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
520
521** NOTE ** This method should hopefully not be called - it is not guaranteed
522to be correct. Serves as a backup for the real solver.
523
524Runs an analysis of the given tradition, at the location given in $rank,
525against the graph of the stemma specified in $stemma_id. The argument
526@merge_relationship_types is an optional list of relationship types for
527which readings so related should be treated as equivalent.
528
529Returns a nested array data structure as follows:
530
531 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
532
533where the group list is the array of arrays passed in for each element of @groups,
534possibly with the addition of hypothetical readings.
535
536
537=cut
538
539sub perl_solver {
e59b8faa 540 my( $stemma, @groups ) = @_;
541 my $graph = $stemma->graph;
542 my @answer;
543 foreach my $g ( @groups ) {
544 push( @answer, _solve_variant_location( $graph, $g ) );
545 }
546 return \@answer;
fae07016 547}
548
e59b8faa 549sub _solve_variant_location {
550 my( $graph, $groups ) = @_;
fae07016 551 # Now do the work.
e59b8faa 552 my $contig = {};
553 my $subgraph = {};
554 my $is_conflicted;
555 my $conflict = {};
556
557 # Mark each ms as in its own group, first.
558 foreach my $g ( @$groups ) {
559 my $gst = wit_stringify( $g );
560 map { $contig->{$_} = $gst } @$g;
561 }
562
563 # Now for each unmarked node in the graph, initialize an array
564 # for possible group memberships. We will use this later to
565 # resolve potential conflicts.
566 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
567 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
568 my $gst = wit_stringify( $g ); # This is the group name
569 # Copy the graph, and delete all non-members from the new graph.
570 my $part = $graph->copy;
571 my @group_roots;
572 $part->delete_vertices(
573 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
574
575 # Now look to see if our group is connected.
576 if( @$g > 1 ) {
577 # We have to take directionality into account.
578 # How many root nodes do we have?
579 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
580 $part->predecessorless_vertices;
581 # Assuming that @$g > 1, find the first root node that has at
582 # least one successor belonging to our group. If this reading
583 # is genealogical, there should be only one, but we will check
584 # that implicitly later.
585 foreach my $root ( @roots ) {
586 # Prune the tree to get rid of extraneous hypotheticals.
587 $root = _prune_subtree( $part, $root, $contig );
588 next unless $root;
589 # Save this root for our group.
590 push( @group_roots, $root );
591 # Get all the successor nodes of our root.
592 }
593 } else {
594 # Dispense with the trivial case of one reading.
595 my $wit = $g->[0];
596 @group_roots = ( $wit );
597 foreach my $v ( $part->vertices ) {
598 $part->delete_vertex( $v ) unless $v eq $wit;
599 }
600 }
601
602 if( @group_roots > 1 ) {
603 $conflict->{$gst} = 1;
604 $is_conflicted = 1;
605 }
606 # Paint the 'hypotheticals' with our group.
607 foreach my $wit ( $part->vertices ) {
608 if( ref( $contig->{$wit} ) ) {
609 push( @{$contig->{$wit}}, $gst );
610 } elsif( $contig->{$wit} ne $gst ) {
611 warn "How did we get here?";
612 }
613 }
614
615
616 # Save the relevant subgraph.
617 $subgraph->{$gst} = $part;
618 }
619
620 # For each of our hypothetical readings, flatten its 'contig' array if
621 # the array contains zero or one group. If we have any unflattened arrays,
622 # we may need to run the resolution process. If the reading is already known
623 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
624 # it.
625 my @resolve;
626 foreach my $wit ( keys %$contig ) {
627 next unless ref( $contig->{$wit} );
628 if( @{$contig->{$wit}} > 1 ) {
629 if( $is_conflicted ) {
630 $contig->{$wit} = ''; # We aren't going to decide.
631 } else {
632 push( @resolve, $wit );
633 }
634 } else {
635 my $gst = pop @{$contig->{$wit}};
636 $contig->{$wit} = $gst || '';
637 }
638 }
639
640 if( @resolve ) {
641 my $still_contig = {};
642 foreach my $h ( @resolve ) {
643 # For each of the hypothetical readings with more than one possibility,
644 # try deleting it from each of its member subgraphs in turn, and see
645 # if that breaks the contiguous grouping.
646 # TODO This can still break in a corner case where group A can use
647 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
648 # Revisit this if necessary; it could get brute-force nasty.
649 foreach my $gst ( @{$contig->{$h}} ) {
650 my $gpart = $subgraph->{$gst}->copy();
651 # If we have come this far, there is only one root and everything
652 # is reachable from it.
653 my( $root ) = $gpart->predecessorless_vertices;
654 my $reachable = {};
655 map { $reachable->{$_} = 1 } $gpart->vertices;
656
657 # Try deleting the hypothetical node.
658 $gpart->delete_vertex( $h );
659 if( $h eq $root ) {
660 # See if we still have a single root.
661 my @roots = $gpart->predecessorless_vertices;
662 warn "This shouldn't have happened" unless @roots;
663 if( @roots > 1 ) {
664 # $h is needed by this group.
665 if( exists( $still_contig->{$h} ) ) {
666 # Conflict!
667 $conflict->{$gst} = 1;
668 $still_contig->{$h} = '';
669 } else {
670 $still_contig->{$h} = $gst;
671 }
672 }
673 } else {
674 # $h is somewhere in the middle. See if everything
675 # else can still be reached from the root.
676 my %still_reachable = ( $root => 1 );
677 map { $still_reachable{$_} = 1 }
678 $gpart->all_successors( $root );
679 foreach my $v ( keys %$reachable ) {
680 next if $v eq $h;
681 if( !$still_reachable{$v}
682 && ( $contig->{$v} eq $gst
683 || ( exists $still_contig->{$v}
684 && $still_contig->{$v} eq $gst ) ) ) {
685 # We need $h.
686 if( exists $still_contig->{$h} ) {
687 # Conflict!
688 $conflict->{$gst} = 1;
689 $still_contig->{$h} = '';
690 } else {
691 $still_contig->{$h} = $gst;
692 }
693 last;
694 } # else we don't need $h in this group.
695 } # end foreach $v
696 } # endif $h eq $root
697 } # end foreach $gst
698 } # end foreach $h
699
700 # Now we have some hypothetical vertices in $still_contig that are the
701 # "real" group memberships. Replace these in $contig.
702 foreach my $v ( keys %$contig ) {
703 next unless ref $contig->{$v};
704 $contig->{$v} = $still_contig->{$v};
705 }
706 } # end if @resolve
707
708 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
709 my $variant_row = [ [], $is_genealogical ];
710 # Fill in the groupings from $contig.
711 foreach my $g ( @$groups ) {
712 my $gst = wit_stringify( $g );
713 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
714 push( @{$variant_row->[0]}, \@realgroup );
715 }
716 return $variant_row;
717}
fae07016 718
6d25a3a0 719sub _prune_group {
720 my( $group, $stemma ) = @_;
721 # Get these into a form prune_subtree will recognize. Make a "contighash"
722 my $hypohash = {};
723 map { $hypohash->{$_} = 1 } @$group;
724 # ...with reference values for hypotheticals.
725 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
726 # Make our subgraph
727 my $subgraph = $stemma->graph->copy;
728 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
729 $subgraph->vertices;
730 # ...and find the root.
731 my( $root ) = $subgraph->predecessorless_vertices;
732 # Now prune and return the remaining vertices.
733 _prune_subtree( $subgraph, $root, $hypohash );
734 return $subgraph->vertices;
735}
736
7f52eac8 737sub _prune_subtree {
231d71fc 738 my( $tree, $root, $contighash ) = @_;
739 # First, delete hypothetical leaves / orphans until there are none left.
740 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
741 $tree->successorless_vertices;
742 while( @orphan_hypotheticals ) {
743 $tree->delete_vertices( @orphan_hypotheticals );
744 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
745 $tree->successorless_vertices;
746 }
747 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 748 # root to the first child that has no other predecessors.
231d71fc 749 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
750 my @nextroot = $tree->successors( $root );
751 $tree->delete_vertex( $root );
bebec0e9 752 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 753 }
754 # The tree has been modified in place, but we need to know the new root.
bebec0e9 755 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 756 return $root;
757}
d71100ed 758# Add the variant, subject to a.c. representation logic.
759# This assumes that we will see the 'main' version before the a.c. version.
760sub add_variant_wit {
761 my( $arr, $wit, $acstr ) = @_;
762 my $skip;
763 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
764 my $real = $1;
765 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
766 }
767 push( @$arr, $wit ) unless $skip;
768}
769
5be0cdeb 770sub _useful_variant {
771 my( $group_readings, $graph, $acstr ) = @_;
772
773 # TODO Decide what to do with AC witnesses
774
775 # Sort by group size and return
776 my $is_useful = 0;
777 my( @readings, @groups ); # The sorted groups for our answer.
778 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
779 keys %$group_readings ) {
780 push( @readings, $rdg );
781 push( @groups, $group_readings->{$rdg} );
782 if( @{$group_readings->{$rdg}} > 1 ) {
783 $is_useful++;
784 } else {
785 my( $wit ) = @{$group_readings->{$rdg}};
786 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
787 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
788 }
789 }
790 if( $is_useful > 1 ) {
791 return( \@readings, \@groups );
792 } else {
793 return( [], [] );
794 }
795}
796
7f52eac8 797=head2 wit_stringify( $groups )
798
799Takes an array of witness groupings and produces a string like
800['A','B'] / ['C','D','E'] / ['F']
d71100ed 801
7f52eac8 802=cut
d71100ed 803
804sub wit_stringify {
805 my $groups = shift;
806 my @gst;
807 # If we were passed an array of witnesses instead of an array of
808 # groupings, then "group" the witnesses first.
809 unless( ref( $groups->[0] ) ) {
810 my $mkgrp = [ $groups ];
811 $groups = $mkgrp;
812 }
813 foreach my $g ( @$groups ) {
814 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
815 }
816 return join( ' / ', @gst );
817}
7f52eac8 818
5be0cdeb 819# Helper function to ensure that X and X a.c. never appear in the same list.
820sub _add_to_witlist {
821 my( $wit, $list, $acstr ) = @_;
822 my %inlist;
823 my $idx = 0;
824 map { $inlist{$_} = $idx++ } @$list;
825 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
826 my $acwit = $1;
827 unless( exists $inlist{$acwit} ) {
828 push( @$list, $acwit.$acstr );
829 }
830 } else {
831 if( exists( $inlist{$wit.$acstr} ) ) {
832 # Replace the a.c. version with the main witness
833 my $i = $inlist{$wit.$acstr};
834 $list->[$i] = $wit;
835 } else {
836 push( @$list, $wit );
837 }
838 }
839}
840
bebec0e9 841sub _symmdiff {
842 my( $lista, $listb ) = @_;
7f52eac8 843 my %union;
844 my %scalars;
845 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
846 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 847 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 848 return map { $scalars{$_} } @set;
849}
850
8511;
852
853=head1 LICENSE
854
855This package is free software and is provided "as is" without express
856or implied warranty. You can redistribute it and/or modify it under
857the same terms as Perl itself.
858
859=head1 AUTHOR
860
861Tara L Andrews E<lt>aurum@cpan.orgE<gt>