1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
9 use JSON qw/ encode_json decode_json /;
12 use Text::Tradition::Stemma;
14 use vars qw/ @EXPORT_OK /;
15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
37 Text::Tradition is a library for representation and analysis of collated
38 texts, particularly medieval ones. The Collation is the central feature of
39 a Tradition, where the text, its sequence of readings, and its relationships
40 between readings are actually kept.
44 =head2 run_analysis( $tradition, %opts )
46 Runs the analysis described in analyze_variant_location on every location in the
47 collation of the given tradition, with the given options. These include:
51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
52 is 0 (i.e. the first).
54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
56 =item * merge_types - Specify a list of relationship types, where related readings
57 should be treated as identical for the purposes of analysis.
59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
68 my $datafile = 't/data/florilegium_tei_ps.xml';
69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
71 'file' => $datafile );
72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
75 my %expected_genealogical = (
106 my $data = run_analysis( $tradition );
107 my $c = $tradition->collation;
108 foreach my $row ( @{$data->{'variants'}} ) {
109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
113 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
114 "Got correct genealogical flag for row " . $row->{'id'} );
115 # Check that we have the right row with the right groups
116 my $rank = $row->{'id'};
117 foreach my $rdghash ( @{$row->{'readings'}} ) {
118 # Skip 'readings' that aren't really
119 next unless $c->reading( $rdghash->{'readingid'} );
121 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
122 "Got correct reading rank" );
123 # Check the witnesses
124 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
125 my @sgrp = sort @{$rdghash->{'group'}};
126 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
129 is( $data->{'variant_count'}, 58, "Got right total variant number" );
130 # TODO Make something meaningful of conflict count, maybe test other bits
137 my( $tradition, %opts ) = @_;
138 my $c = $tradition->collation;
140 my $stemma_id = $opts{'stemma_id'} || 0;
141 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
142 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
145 my $stemma = $tradition->stemma( $stemma_id );
147 # Figure out which witnesses we are working with - that is, the ones that
148 # appear both in the stemma and in the tradition. All others are 'lacunose'
150 my @lacunose = $stemma->hypotheticals;
151 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
152 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
154 # Find and mark 'common' ranks for exclusion, unless they were
155 # explicitly specified.
158 foreach my $rdg ( $c->common_readings ) {
159 $common_rank{$rdg->rank} = 1;
161 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
164 # Group the variants to send to the solver
168 foreach my $rank ( @ranks ) {
169 my $missing = [ @lacunose ];
170 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
171 if( $opts{'exclude_type1'} ) {
172 # Check to see whether this is a "useful" group.
173 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
174 $stemma->graph, $c->ac_label );
177 push( @use_ranks, $rank );
178 push( @groups, $rankgroup );
179 $lacunae{$rank} = $missing;
182 my $answer = solve_variants( $stemma, @groups );
184 # Do further analysis on the answer
185 my $conflict_count = 0;
186 my $aclabel = $c->ac_label;
187 foreach my $idx ( 0 .. $#use_ranks ) {
188 my $location = $answer->{'variants'}->[$idx];
189 # Add the rank back in
190 $location->{'id'} = $use_ranks[$idx];
191 # Note what our lacunae are
193 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
194 # Run through the reading groups and add as 'lacunae' any redundant
195 # a.c. witnesses (yes, we have to do this before the analysis, thus
196 # identical loops before and after. Boo.)
197 # TODO Consider making these callbacks to analyze_location
198 foreach my $rdghash ( @{$location->{'readings'}} ) {
200 map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
201 foreach my $rw ( keys %rwits ) {
202 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
203 if( exists $rwits{$1} ) {
209 $rdghash->{'group'} = [ keys %rwits ];
211 $location->{'missing'} = [ keys %lmiss ];
213 # Run the extra analysis we need.
214 analyze_location( $tradition, $stemma->graph, $location );
216 # Do the final post-analysis tidying up of the data.
217 foreach my $rdghash ( @{$location->{'readings'}} ) {
219 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
220 # Add the reading text back in
221 my $rdg = $c->reading( $rdghash->{'readingid'} );
222 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
223 # Remove lacunose witnesses from this reading's list now that the
226 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
227 $rdghash->{'group'} = \@realgroup;
228 # TODO Record hypotheticals used to create group, if we end up
232 $answer->{'conflict_count'} = $conflict_count;
237 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
239 Groups the variants at the given $rank of the collation, treating any
240 relationships in @merge_relationship_types as equivalent. $lacunose should
241 be a reference to an array, to which the sigla of lacunose witnesses at this
242 rank will be appended.
244 Returns a hash $group_readings where $rdg is attested by the witnesses listed
245 in $group_readings->{$rdg}.
249 # Return group_readings, groups, lacunose
251 my( $tradition, $rank, $lacunose, $collapse ) = @_;
252 my $c = $tradition->collation;
253 my $aclabel = $c->ac_label;
255 # Get the alignment table readings
256 my %readings_at_rank;
257 my %is_lacunose; # lookup table for $lacunose
258 map { $is_lacunose{$_} = 1 } @$lacunose;
260 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
261 my $rdg = $tablewit->{'tokens'}->[$rank-1];
262 my $wit = $tablewit->{'witness'};
263 # Exclude the witness if it is "lacunose" which if we got here
264 # means "not in the stemma".
265 next if $is_lacunose{$wit};
266 if( $rdg && $rdg->{'t'}->is_lacuna ) {
267 _add_to_witlist( $wit, $lacunose, $aclabel );
269 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
271 _add_to_witlist( $wit, \@gap_wits, $aclabel );
275 # Group the readings, collapsing groups by relationship if needed
276 my %grouped_readings;
277 foreach my $rdg ( values %readings_at_rank ) {
278 # Skip readings that have been collapsed into others.
279 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
280 # Get the witness list, including from readings collapsed into this one.
281 my @wits = $rdg->witnesses;
283 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
284 foreach my $other ( $rdg->related_readings( $filter ) ) {
285 my @otherwits = $other->witnesses;
286 push( @wits, @otherwits );
287 $grouped_readings{$other->id} = 0;
290 # Filter the group to those witnesses in the stemma
292 foreach my $wit ( @wits ) {
293 next if $is_lacunose{$wit};
294 push( @use_wits, $wit );
296 $grouped_readings{$rdg->id} = \@use_wits;
298 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
299 # Get rid of our collapsed readings
300 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
301 keys %grouped_readings
305 return \%grouped_readings;
308 # Helper function to ensure that X and X a.c. never appear in the same list.
309 sub _add_to_witlist {
310 my( $wit, $list, $acstr ) = @_;
313 map { $inlist{$_} = $idx++ } @$list;
314 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
316 unless( exists $inlist{$acwit} ) {
317 push( @$list, $acwit.$acstr );
320 if( exists( $inlist{$wit.$acstr} ) ) {
321 # Replace the a.c. version with the main witness
322 my $i = $inlist{$wit.$acstr};
325 push( @$list, $wit );
330 =head2 solve_variants( $graph, @groups )
332 Sends the set of groups to the external graph solver service and returns
333 a cleaned-up answer, adding the rank IDs back where they belong.
335 The JSON has the form
336 { "graph": [ stemmagraph DOT string without newlines ],
337 "groupings": [ array of arrays of groups, one per rank ] }
339 The answer has the form
340 { "variants" => [ array of variant location structures ],
341 "variant_count" => total,
342 "conflict_count" => number of conflicts detected,
343 "genealogical_count" => number of solutions found }
348 my( $stemma, @groups ) = @_;
349 my $aclabel = $stemma->collation->ac_label;
351 # Filter the groups down to distinct groups, and work out what graph
352 # should be used in the calculation of each group. We want to send each
353 # distinct problem to the solver only once.
354 # We need a whole bunch of lookup tables for this.
355 my $index_groupkeys = {}; # Save the order of readings
356 my $group_indices = {}; # Save the indices that have a given grouping
357 my $graph_problems = {}; # Save the groupings for the given graph
359 foreach my $idx ( 0..$#groups ) {
360 my $ghash = $groups[$idx];
362 # Sort the groupings from big to little, and scan for a.c. witnesses
363 # that would need an extended graph.
364 my @acwits; # note which AC witnesses crop up at this rank
365 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
367 foreach my $rdg ( @idxkeys ) {
368 my @sg = sort @{$ghash->{$rdg}};
369 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
370 push( @grouping, \@sg );
372 # Save the reading order
373 $index_groupkeys->{$idx} = \@idxkeys;
375 # Now associate the distinct group with this index
376 my $gstr = wit_stringify( \@grouping );
377 push( @{$group_indices->{$gstr}}, $idx );
379 # Finally, add the group to the list to be calculated for this graph.
380 map { s/\Q$aclabel\E$// } @acwits;
381 my $graph = $stemma->extend_graph( \@acwits );
382 unless( exists $graph_problems->{"$graph"} ) {
383 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
385 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
388 ## For each distinct graph, send its groups to the solver.
390 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
391 my $ua = LWP::UserAgent->new();
392 ## Witness map is a HACK to get around limitations in node names from IDP
393 my $witness_map = {};
394 ## Variables to store answers as they come back
395 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
396 my $genealogical = 0;
397 foreach my $graphkey ( keys %$graph_problems ) {
398 my $graph = $graph_problems->{$graphkey}->{'object'};
399 my $groupings = $graph_problems->{$graphkey}->{'groups'};
400 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
401 $groupings, $witness_map ) );
402 # Send it off and get the result
403 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
404 'Content' => $json );
407 if( $resp->is_success ) {
408 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
411 # Fall back to the old method.
412 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
413 . "; falling back to perl method";
414 $answer = perl_solver( $graph, @$groupings );
416 ## The answer is the evaluated groupings, plus a boolean for whether
417 ## they were genealogical. Reconstruct our original groups.
418 foreach my $gidx ( 0 .. $#{$groupings} ) {
419 my( $calc_groups, $result ) = @{$answer->[$gidx]};
422 # Prune the calculated groups, in case the IDP solver failed to.
425 foreach my $cg ( @$calc_groups ) {
426 # This is a little wasteful but the path of least
427 # resistance. Send both the stemma, which knows what
428 # its hypotheticals are, and the actual graph used.
429 my @pg = _prune_group( $cg, $stemma, $graph );
430 push( @pruned_groups, \@pg );
432 $calc_groups = \@pruned_groups;
435 # Retrieve the key for the original group that went to the solver
436 my $input_group = wit_stringify( $groupings->[$gidx] );
437 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
438 my @readings = @{$index_groupkeys->{$oidx}};
440 'genealogical' => $result,
443 foreach my $ridx ( 0 .. $#readings ) {
444 push( @{$vstruct->{'readings'}},
445 { 'readingid' => $readings[$ridx],
446 'group' => $calc_groups->[$ridx] } );
448 $variants->[$oidx] = $vstruct;
453 return { 'variants' => $variants,
454 'variant_count' => scalar @$variants,
455 'genealogical_count' => $genealogical };
458 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
460 sub _safe_wit_strings {
461 my( $graph, $c, $groupings, $witness_map ) = @_;
462 # Parse the graph we were given into a stemma.
463 my $safegraph = Graph->new();
464 # Convert the graph to a safe representation and store the conversion.
465 foreach my $n ( $graph->vertices ) {
466 my $sn = _safe_witstr( $n );
467 if( exists $witness_map->{$sn} ) {
468 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
469 if $witness_map->{$sn} ne $n;
471 $witness_map->{$sn} = $n;
473 $safegraph->add_vertex( $sn );
474 $safegraph->set_vertex_attributes( $sn,
475 $graph->get_vertex_attributes( $n ) );
477 foreach my $e ( $graph->edges ) {
478 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
479 $safegraph->add_edge( @safe_e );
481 my $safe_stemma = Text::Tradition::Stemma->new(
482 'collation' => $c, 'graph' => $safegraph );
484 # Now convert the witness groupings to a safe representation.
485 my $safe_groupings = [];
486 foreach my $grouping ( @$groupings ) {
487 my $safe_grouping = [];
488 foreach my $group ( @$grouping ) {
490 foreach my $n ( @$group ) {
491 my $sn = _safe_witstr( $n );
492 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
493 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
494 $witness_map->{$sn} = $n;
495 push( @$safe_group, $sn );
497 push( @$safe_grouping, $safe_group );
499 push( @$safe_groupings, $safe_grouping );
502 # Return it all in the struct we expect. We have stored the reductions
503 # in the $witness_map that we were passed.
504 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
505 'groupings' => $safe_groupings };
510 $witstr =~ s/\s+/_/g;
511 $witstr =~ s/[^\w\d-]//g;
515 sub _desanitize_names {
516 my( $jsonstruct, $witness_map ) = @_;
518 foreach my $grouping ( @$jsonstruct ) {
519 my $real_grouping = [];
520 foreach my $element ( @$grouping ) {
521 if( ref( $element ) eq 'ARRAY' ) {
523 my $real_groupset = [];
524 foreach my $group ( @$element ) {
526 foreach my $n ( @$group ) {
527 my $rn = $witness_map->{$n};
528 push( @$real_group, $rn );
530 push( @$real_groupset, $real_group );
532 push( @$real_grouping, $real_groupset );
534 # It is the boolean, not actually a group.
535 push( @$real_grouping, $element );
538 push( @$result, $real_grouping );
545 =head2 analyze_location ( $tradition, $graph, $location_hash )
547 Given the tradition, its stemma graph, and the solution from the graph solver,
548 work out the rest of the information we want. For each reading we need missing,
549 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
553 sub analyze_location {
554 my ( $tradition, $graph, $variant_row ) = @_;
556 # Make a hash of all known node memberships, and make the subgraphs.
558 my $reading_roots = {};
560 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
561 my $rid = $rdghash->{'readingid'};
562 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
565 my $part = $graph->copy;
567 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
568 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
569 $subgraph->{$rid} = $part;
570 # Get the reading roots.
571 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
574 # Now that we have all the node group memberships, calculate followed/
575 # non-followed/unknown values for each reading. Also figure out the
576 # reading's evident parent(s).
577 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
578 # Group string key - TODO do we need this?
579 my $gst = wit_stringify( $rdghash->{'group'} );
580 my $rid = $rdghash->{'readingid'};
582 my $part = $subgraph->{$rid};
584 # Start figuring things out.
585 my @roots = $part->predecessorless_vertices;
586 $rdghash->{'independent_occurrence'} = scalar @roots;
587 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
588 # Find the parent readings, if any, of this reading.
590 foreach my $wit ( @roots ) {
591 # Look in the main stemma to find this witness's extant or known-reading
592 # immediate ancestor(s), and look up the reading that each ancestor olds.
593 my @check = $graph->predecessors( $wit );
596 foreach my $wparent( @check ) {
597 my $preading = $contig->{$wparent};
599 $rdgparents{$preading} = 1;
601 push( @next, $graph->predecessors( $wparent ) );
607 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
609 # Find the number of times this reading was altered, and the number of
610 # times we're not sure.
611 my( %nofollow, %unknownfollow );
612 foreach my $wit ( $part->vertices ) {
613 foreach my $wchild ( $graph->successors( $wit ) ) {
614 next if $part->has_vertex( $wchild );
615 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
616 # It definitely changed here.
617 $nofollow{$wchild} = 1;
618 } elsif( !($contig->{$wchild}) ) {
619 # The child is a hypothetical node not definitely in
620 # any group. Answer is unknown.
621 $unknownfollow{$wchild} = 1;
622 } # else it's a non-root node in a known group, and therefore
623 # is presumed to have its reading from its group, not this link.
626 $rdghash->{'not_followed'} = keys %nofollow;
627 $rdghash->{'follow_unknown'} = keys %unknownfollow;
629 # Now say whether this reading represents a conflict.
630 unless( $variant_row->{'genealogical'} ) {
631 $rdghash->{'conflict'} = @roots != 1;
637 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
639 ** NOTE ** This method should hopefully not be called - it is not guaranteed
640 to be correct. Serves as a backup for the real solver.
642 Runs an analysis of the given tradition, at the location given in $rank,
643 against the graph of the stemma specified in $stemma_id. The argument
644 @merge_relationship_types is an optional list of relationship types for
645 which readings so related should be treated as equivalent.
647 Returns a nested array data structure as follows:
649 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
651 where the group list is the array of arrays passed in for each element of @groups,
652 possibly with the addition of hypothetical readings.
658 my( $graph, @groups ) = @_;
660 foreach my $g ( @groups ) {
661 push( @answer, _solve_variant_location( $graph, $g ) );
666 sub _solve_variant_location {
667 my( $graph, $groups ) = @_;
674 # Mark each ms as in its own group, first.
675 foreach my $g ( @$groups ) {
676 my $gst = wit_stringify( $g );
677 map { $contig->{$_} = $gst } @$g;
680 # Now for each unmarked node in the graph, initialize an array
681 # for possible group memberships. We will use this later to
682 # resolve potential conflicts.
683 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
684 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
685 my $gst = wit_stringify( $g ); # This is the group name
686 # Copy the graph, and delete all non-members from the new graph.
687 my $part = $graph->copy;
689 $part->delete_vertices(
690 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
692 # Now look to see if our group is connected.
694 # We have to take directionality into account.
695 # How many root nodes do we have?
696 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
697 $part->predecessorless_vertices;
698 # Assuming that @$g > 1, find the first root node that has at
699 # least one successor belonging to our group. If this reading
700 # is genealogical, there should be only one, but we will check
701 # that implicitly later.
702 foreach my $root ( @roots ) {
703 # Prune the tree to get rid of extraneous hypotheticals.
704 $root = _prune_subtree( $part, $root, $contig );
706 # Save this root for our group.
707 push( @group_roots, $root );
708 # Get all the successor nodes of our root.
711 # Dispense with the trivial case of one reading.
713 @group_roots = ( $wit );
714 foreach my $v ( $part->vertices ) {
715 $part->delete_vertex( $v ) unless $v eq $wit;
719 if( @group_roots > 1 ) {
720 $conflict->{$gst} = 1;
723 # Paint the 'hypotheticals' with our group.
724 foreach my $wit ( $part->vertices ) {
725 if( ref( $contig->{$wit} ) ) {
726 push( @{$contig->{$wit}}, $gst );
727 } elsif( $contig->{$wit} ne $gst ) {
728 warn "How did we get here?";
733 # Save the relevant subgraph.
734 $subgraph->{$gst} = $part;
737 # For each of our hypothetical readings, flatten its 'contig' array if
738 # the array contains zero or one group. If we have any unflattened arrays,
739 # we may need to run the resolution process. If the reading is already known
740 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
743 foreach my $wit ( keys %$contig ) {
744 next unless ref( $contig->{$wit} );
745 if( @{$contig->{$wit}} > 1 ) {
746 if( $is_conflicted ) {
747 $contig->{$wit} = ''; # We aren't going to decide.
749 push( @resolve, $wit );
752 my $gst = pop @{$contig->{$wit}};
753 $contig->{$wit} = $gst || '';
758 my $still_contig = {};
759 foreach my $h ( @resolve ) {
760 # For each of the hypothetical readings with more than one possibility,
761 # try deleting it from each of its member subgraphs in turn, and see
762 # if that breaks the contiguous grouping.
763 # TODO This can still break in a corner case where group A can use
764 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
765 # Revisit this if necessary; it could get brute-force nasty.
766 foreach my $gst ( @{$contig->{$h}} ) {
767 my $gpart = $subgraph->{$gst}->copy();
768 # If we have come this far, there is only one root and everything
769 # is reachable from it.
770 my( $root ) = $gpart->predecessorless_vertices;
772 map { $reachable->{$_} = 1 } $gpart->vertices;
774 # Try deleting the hypothetical node.
775 $gpart->delete_vertex( $h );
777 # See if we still have a single root.
778 my @roots = $gpart->predecessorless_vertices;
779 warn "This shouldn't have happened" unless @roots;
781 # $h is needed by this group.
782 if( exists( $still_contig->{$h} ) ) {
784 $conflict->{$gst} = 1;
785 $still_contig->{$h} = '';
787 $still_contig->{$h} = $gst;
791 # $h is somewhere in the middle. See if everything
792 # else can still be reached from the root.
793 my %still_reachable = ( $root => 1 );
794 map { $still_reachable{$_} = 1 }
795 $gpart->all_successors( $root );
796 foreach my $v ( keys %$reachable ) {
798 if( !$still_reachable{$v}
799 && ( $contig->{$v} eq $gst
800 || ( exists $still_contig->{$v}
801 && $still_contig->{$v} eq $gst ) ) ) {
803 if( exists $still_contig->{$h} ) {
805 $conflict->{$gst} = 1;
806 $still_contig->{$h} = '';
808 $still_contig->{$h} = $gst;
811 } # else we don't need $h in this group.
813 } # endif $h eq $root
817 # Now we have some hypothetical vertices in $still_contig that are the
818 # "real" group memberships. Replace these in $contig.
819 foreach my $v ( keys %$contig ) {
820 next unless ref $contig->{$v};
821 $contig->{$v} = $still_contig->{$v};
825 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
826 my $variant_row = [ [], $is_genealogical ];
827 # Fill in the groupings from $contig.
828 foreach my $g ( @$groups ) {
829 my $gst = wit_stringify( $g );
830 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
831 push( @{$variant_row->[0]}, \@realgroup );
837 my( $group, $stemma, $graph ) = @_;
838 # Get these into a form prune_subtree will recognize. Make a "contighash"
840 map { $hypohash->{$_} = 1 } @$group;
841 # ...with reference values for hypotheticals.
842 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
844 my $subgraph = $graph->copy;
845 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
847 # ...and find the root.
848 my( $root ) = $subgraph->predecessorless_vertices;
849 # Now prune and return the remaining vertices.
850 _prune_subtree( $subgraph, $root, $hypohash );
851 return $subgraph->vertices;
855 my( $tree, $root, $contighash ) = @_;
856 # First, delete hypothetical leaves / orphans until there are none left.
857 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
858 $tree->successorless_vertices;
859 while( @orphan_hypotheticals ) {
860 $tree->delete_vertices( @orphan_hypotheticals );
861 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
862 $tree->successorless_vertices;
864 # Then delete a hypothetical root with only one successor, moving the
865 # root to the first child that has no other predecessors.
866 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
867 my @nextroot = $tree->successors( $root );
868 $tree->delete_vertex( $root );
869 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
871 # The tree has been modified in place, but we need to know the new root.
872 $root = undef unless $root && $tree->has_vertex( $root );
875 # Add the variant, subject to a.c. representation logic.
876 # This assumes that we will see the 'main' version before the a.c. version.
877 sub add_variant_wit {
878 my( $arr, $wit, $acstr ) = @_;
880 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
882 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
884 push( @$arr, $wit ) unless $skip;
887 sub _useful_variant {
888 my( $group_readings, $graph, $acstr ) = @_;
890 # TODO Decide what to do with AC witnesses
892 # Sort by group size and return
894 my( @readings, @groups ); # The sorted groups for our answer.
895 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
896 keys %$group_readings ) {
897 push( @readings, $rdg );
898 push( @groups, $group_readings->{$rdg} );
899 if( @{$group_readings->{$rdg}} > 1 ) {
902 my( $wit ) = @{$group_readings->{$rdg}};
903 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
904 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
907 if( $is_useful > 1 ) {
908 return( \@readings, \@groups );
914 =head2 wit_stringify( $groups )
916 Takes an array of witness groupings and produces a string like
917 ['A','B'] / ['C','D','E'] / ['F']
924 # If we were passed an array of witnesses instead of an array of
925 # groupings, then "group" the witnesses first.
926 unless( ref( $groups->[0] ) ) {
927 my $mkgrp = [ $groups ];
930 foreach my $g ( @$groups ) {
931 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
933 return join( ' / ', @gst );
937 my( $lista, $listb ) = @_;
940 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
941 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
942 my @set = grep { $union{$_} == 1 } keys %union;
943 return map { $scalars{$_} } @set;
950 This package is free software and is provided "as is" without express
951 or implied warranty. You can redistribute it and/or modify it under
952 the same terms as Perl itself.
956 Tara L Andrews E<lt>aurum@cpan.orgE<gt>