1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
9 use JSON qw/ encode_json decode_json /;
12 use Text::Tradition::Stemma;
14 use vars qw/ @EXPORT_OK /;
15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
37 Text::Tradition is a library for representation and analysis of collated
38 texts, particularly medieval ones. The Collation is the central feature of
39 a Tradition, where the text, its sequence of readings, and its relationships
40 between readings are actually kept.
44 =head2 run_analysis( $tradition, %opts )
46 Runs the analysis described in analyze_variant_location on every location in the
47 collation of the given tradition, with the given options. These include:
51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
52 is 0 (i.e. the first).
54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
56 =item * merge_types - Specify a list of relationship types, where related readings
57 should be treated as identical for the purposes of analysis.
59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
68 my $datafile = 't/data/florilegium_tei_ps.xml';
69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
71 'file' => $datafile );
72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
75 my %expected_genealogical = (
106 my $data = run_analysis( $tradition );
107 my $c = $tradition->collation;
108 foreach my $row ( @{$data->{'variants'}} ) {
109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
113 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
114 "Got correct genealogical flag for row " . $row->{'id'} );
115 # Check that we have the right row with the right groups
116 my $rank = $row->{'id'};
117 foreach my $rdghash ( @{$row->{'readings'}} ) {
118 # Skip 'readings' that aren't really
119 next unless $c->reading( $rdghash->{'readingid'} );
121 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
122 "Got correct reading rank" );
123 # Check the witnesses
124 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
125 my @sgrp = sort @{$rdghash->{'group'}};
126 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
129 is( $data->{'variant_count'}, 58, "Got right total variant number" );
130 # TODO Make something meaningful of conflict count, maybe test other bits
137 my( $tradition, %opts ) = @_;
138 my $c = $tradition->collation;
140 my $stemma_id = $opts{'stemma_id'} || 0;
141 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
142 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
145 my $stemma = $tradition->stemma( $stemma_id );
147 # Figure out which witnesses we are working with
148 my @lacunose = $stemma->hypotheticals;
149 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
150 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
151 $tradition->witnesses;
152 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
154 # Find and mark 'common' ranks for exclusion, unless they were
155 # explicitly specified.
158 foreach my $rdg ( $c->common_readings ) {
159 $common_rank{$rdg->rank} = 1;
161 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
164 # Group the variants to send to the solver
168 foreach my $rank ( @ranks ) {
169 my $missing = [ @lacunose ];
170 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
171 if( $opts{'exclude_type1'} ) {
172 # Check to see whether this is a "useful" group.
173 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
174 $stemma->graph, $c->ac_label );
177 push( @use_ranks, $rank );
178 push( @groups, $rankgroup );
179 $lacunae{$rank} = $missing;
182 my $answer = solve_variants( $stemma, @groups );
184 # Do further analysis on the answer
185 my $conflict_count = 0;
186 my $aclabel = $c->ac_label;
187 foreach my $idx ( 0 .. $#use_ranks ) {
188 my $location = $answer->{'variants'}->[$idx];
189 # Add the rank back in
190 $location->{'id'} = $use_ranks[$idx];
191 # Note what our lacunae are
193 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
194 # Run through the reading groups and add as 'lacunae' any redundant
195 # a.c. witnesses (yes, we have to do this before the analysis, thus
196 # identical loops before and after. Boo.)
197 # TODO Consider making these callbacks to analyze_location
198 foreach my $rdghash ( @{$location->{'readings'}} ) {
200 map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
201 foreach my $rw ( keys %rwits ) {
202 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
203 if( exists $rwits{$1} ) {
209 $rdghash->{'group'} = [ keys %rwits ];
211 $location->{'missing'} = [ keys %lmiss ];
213 # Run the extra analysis we need.
214 analyze_location( $tradition, $stemma->graph, $location );
216 # Do the final post-analysis tidying up of the data.
217 foreach my $rdghash ( @{$location->{'readings'}} ) {
219 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
220 # Add the reading text back in
221 my $rdg = $c->reading( $rdghash->{'readingid'} );
222 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
223 # Remove lacunose witnesses from this reading's list now that the
226 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
227 $rdghash->{'group'} = \@realgroup;
228 # TODO Record hypotheticals used to create group, if we end up
232 $answer->{'conflict_count'} = $conflict_count;
237 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
239 Groups the variants at the given $rank of the collation, treating any
240 relationships in @merge_relationship_types as equivalent. $lacunose should
241 be a reference to an array, to which the sigla of lacunose witnesses at this
242 rank will be appended.
244 Returns a hash $group_readings where $rdg is attested by the witnesses listed
245 in $group_readings->{$rdg}.
249 # Return group_readings, groups, lacunose
251 my( $tradition, $rank, $lacunose, $collapse ) = @_;
252 my $c = $tradition->collation;
253 my $aclabel = $c->ac_label;
254 # Get the alignment table readings
255 my %readings_at_rank;
256 my %is_lacunose; # lookup table for $lacunose
257 map { $is_lacunose{$_} = 1 } @$lacunose;
259 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
260 my $rdg = $tablewit->{'tokens'}->[$rank-1];
261 my $wit = $tablewit->{'witness'};
262 # Exclude the witness if it is "lacunose" which if we got here
263 # means "not in the stemma".
264 next if $is_lacunose{$wit};
265 if( $rdg && $rdg->{'t'}->is_lacuna ) {
266 _add_to_witlist( $wit, $lacunose, $aclabel );
268 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
270 _add_to_witlist( $wit, \@gap_wits, $aclabel );
274 # Group the readings, collapsing groups by relationship if needed
275 my %grouped_readings;
276 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
277 values %readings_at_rank ) {
278 # Skip readings that have been collapsed into others.
279 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
280 my @wits = $rdg->witnesses;
282 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
283 foreach my $other ( $rdg->related_readings( $filter ) ) {
284 my @otherwits = $other->witnesses;
285 push( @wits, @otherwits );
286 $grouped_readings{$other->id} = 0;
289 my @use_wits = grep { !$is_lacunose{$_} } @wits;
290 $grouped_readings{$rdg->id} = \@use_wits;
292 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
293 # Get rid of our collapsed readings
294 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
295 keys %grouped_readings
298 return \%grouped_readings;
301 =head2 solve_variants( $graph, @groups )
303 Sends the set of groups to the external graph solver service and returns
304 a cleaned-up answer, adding the rank IDs back where they belong.
306 The JSON has the form
307 { "graph": [ stemmagraph DOT string without newlines ],
308 "groupings": [ array of arrays of groups, one per rank ] }
310 The answer has the form
311 { "variants" => [ array of variant location structures ],
312 "variant_count" => total,
313 "conflict_count" => number of conflicts detected,
314 "genealogical_count" => number of solutions found }
319 my( $stemma, @groups ) = @_;
321 # Make the json with stemma + groups
323 foreach my $ghash ( @groups ) {
325 foreach my $k ( sort keys %$ghash ) {
326 push( @grouping, $ghash->{$k} );
328 push( @$groupings, \@grouping );
330 ## Witness map is a HACK to get around limitations in node names from IDP
331 my $witness_map = {};
332 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
334 # Send it off and get the result
335 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
336 my $ua = LWP::UserAgent->new();
337 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
338 'Content' => $json );
342 if( $resp->is_success ) {
343 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
346 # Fall back to the old method.
347 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
348 . "; falling back to perl method";
349 $answer = perl_solver( $stemma, @$groupings );
352 # Fold the result back into what we know about the groups.
354 my $genealogical = 0;
355 foreach my $idx ( 0 .. $#groups ) {
356 my( $calc_groups, $result ) = @{$answer->[$idx]};
359 # Prune the calculated groups, in case the IDP solver failed to.
362 foreach my $cg ( @$calc_groups ) {
363 my @pg = _prune_group( $cg, $stemma );
364 push( @pruned_groups, \@pg );
366 $calc_groups = \@pruned_groups;
369 my $input_group = $groups[$idx];
370 foreach my $k ( sort keys %$input_group ) {
371 my $cg = shift @$calc_groups;
372 $input_group->{$k} = $cg;
375 'genealogical' => $result,
378 foreach my $k ( sort { @{$input_group->{$b}} <=> @{$input_group->{$a}} }
379 keys %$input_group ) {
380 push( @{$vstruct->{'readings'}},
381 { 'readingid' => $k, 'group' => $input_group->{$k}} );
383 push( @$variants, $vstruct );
386 return { 'variants' => $variants,
387 'variant_count' => scalar @$variants,
388 'genealogical_count' => $genealogical };
391 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
393 sub _safe_wit_strings {
394 my( $stemma, $groupings, $witness_map ) = @_;
395 my $safegraph = Graph->new();
396 # Convert the graph to a safe representation and store the conversion.
397 foreach my $n ( $stemma->graph->vertices ) {
398 my $sn = _safe_witstr( $n );
399 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
400 if exists $witness_map->{$sn};
401 $witness_map->{$sn} = $n;
402 $safegraph->add_vertex( $sn );
403 $safegraph->set_vertex_attributes( $sn,
404 $stemma->graph->get_vertex_attributes( $n ) );
406 foreach my $e ( $stemma->graph->edges ) {
407 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
408 $safegraph->add_edge( @safe_e );
410 my $safe_stemma = Text::Tradition::Stemma->new(
411 'collation' => $stemma->collation, 'graph' => $safegraph );
413 # Now convert the witness groupings to a safe representation.
414 my $safe_groupings = [];
415 foreach my $grouping ( @$groupings ) {
416 my $safe_grouping = [];
417 foreach my $group ( @$grouping ) {
419 foreach my $n ( @$group ) {
420 my $sn = _safe_witstr( $n );
421 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
422 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
423 $witness_map->{$sn} = $n;
424 push( @$safe_group, $sn );
426 push( @$safe_grouping, $safe_group );
428 push( @$safe_groupings, $safe_grouping );
431 # Return it all in the struct we expect. We have stored the reductions
432 # in the $witness_map that we were passed.
433 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
438 $witstr =~ s/\s+/_/g;
439 $witstr =~ s/[^\w\d-]//g;
443 sub _desanitize_names {
444 my( $jsonstruct, $witness_map ) = @_;
446 foreach my $grouping ( @$jsonstruct ) {
447 my $real_grouping = [];
448 foreach my $element ( @$grouping ) {
449 if( ref( $element ) eq 'ARRAY' ) {
451 my $real_groupset = [];
452 foreach my $group ( @$element ) {
454 foreach my $n ( @$group ) {
455 my $rn = $witness_map->{$n};
456 push( @$real_group, $rn );
458 push( @$real_groupset, $real_group );
460 push( @$real_grouping, $real_groupset );
462 # It is the boolean, not actually a group.
463 push( @$real_grouping, $element );
466 push( @$result, $real_grouping );
473 =head2 analyze_location ( $tradition, $graph, $location_hash )
475 Given the tradition, its stemma graph, and the solution from the graph solver,
476 work out the rest of the information we want. For each reading we need missing,
477 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
481 sub analyze_location {
482 my ( $tradition, $graph, $variant_row ) = @_;
484 # Make a hash of all known node memberships, and make the subgraphs.
486 my $reading_roots = {};
488 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
489 my $rid = $rdghash->{'readingid'};
490 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
493 my $part = $graph->copy;
495 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
496 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
497 $subgraph->{$rid} = $part;
498 # Get the reading roots.
499 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
502 # Now that we have all the node group memberships, calculate followed/
503 # non-followed/unknown values for each reading. Also figure out the
504 # reading's evident parent(s).
505 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
506 # Group string key - TODO do we need this?
507 my $gst = wit_stringify( $rdghash->{'group'} );
508 my $rid = $rdghash->{'readingid'};
510 my $part = $subgraph->{$rid};
512 # Start figuring things out.
513 my @roots = $part->predecessorless_vertices;
514 $rdghash->{'independent_occurrence'} = scalar @roots;
515 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
516 # Find the parent readings, if any, of this reading.
518 foreach my $wit ( @roots ) {
519 # Look in the main stemma to find this witness's extant or known-reading
520 # immediate ancestor(s), and look up the reading that each ancestor olds.
521 my @check = $graph->predecessors( $wit );
524 foreach my $wparent( @check ) {
525 my $preading = $contig->{$wparent};
527 $rdgparents{$preading} = 1;
529 push( @next, $graph->predecessors( $wparent ) );
535 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
537 # Find the number of times this reading was altered, and the number of
538 # times we're not sure.
539 my( %nofollow, %unknownfollow );
540 foreach my $wit ( $part->vertices ) {
541 foreach my $wchild ( $graph->successors( $wit ) ) {
542 next if $part->has_vertex( $wchild );
543 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
544 # It definitely changed here.
545 $nofollow{$wchild} = 1;
546 } elsif( !($contig->{$wchild}) ) {
547 # The child is a hypothetical node not definitely in
548 # any group. Answer is unknown.
549 $unknownfollow{$wchild} = 1;
550 } # else it's a non-root node in a known group, and therefore
551 # is presumed to have its reading from its group, not this link.
554 $rdghash->{'not_followed'} = keys %nofollow;
555 $rdghash->{'follow_unknown'} = keys %unknownfollow;
557 # Now say whether this reading represents a conflict.
558 unless( $variant_row->{'genealogical'} ) {
559 $rdghash->{'conflict'} = @roots != 1;
565 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
567 ** NOTE ** This method should hopefully not be called - it is not guaranteed
568 to be correct. Serves as a backup for the real solver.
570 Runs an analysis of the given tradition, at the location given in $rank,
571 against the graph of the stemma specified in $stemma_id. The argument
572 @merge_relationship_types is an optional list of relationship types for
573 which readings so related should be treated as equivalent.
575 Returns a nested array data structure as follows:
577 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
579 where the group list is the array of arrays passed in for each element of @groups,
580 possibly with the addition of hypothetical readings.
586 my( $stemma, @groups ) = @_;
587 my $graph = $stemma->graph;
589 foreach my $g ( @groups ) {
590 push( @answer, _solve_variant_location( $graph, $g ) );
595 sub _solve_variant_location {
596 my( $graph, $groups ) = @_;
603 # Mark each ms as in its own group, first.
604 foreach my $g ( @$groups ) {
605 my $gst = wit_stringify( $g );
606 map { $contig->{$_} = $gst } @$g;
609 # Now for each unmarked node in the graph, initialize an array
610 # for possible group memberships. We will use this later to
611 # resolve potential conflicts.
612 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
613 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
614 my $gst = wit_stringify( $g ); # This is the group name
615 # Copy the graph, and delete all non-members from the new graph.
616 my $part = $graph->copy;
618 $part->delete_vertices(
619 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
621 # Now look to see if our group is connected.
623 # We have to take directionality into account.
624 # How many root nodes do we have?
625 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
626 $part->predecessorless_vertices;
627 # Assuming that @$g > 1, find the first root node that has at
628 # least one successor belonging to our group. If this reading
629 # is genealogical, there should be only one, but we will check
630 # that implicitly later.
631 foreach my $root ( @roots ) {
632 # Prune the tree to get rid of extraneous hypotheticals.
633 $root = _prune_subtree( $part, $root, $contig );
635 # Save this root for our group.
636 push( @group_roots, $root );
637 # Get all the successor nodes of our root.
640 # Dispense with the trivial case of one reading.
642 @group_roots = ( $wit );
643 foreach my $v ( $part->vertices ) {
644 $part->delete_vertex( $v ) unless $v eq $wit;
648 if( @group_roots > 1 ) {
649 $conflict->{$gst} = 1;
652 # Paint the 'hypotheticals' with our group.
653 foreach my $wit ( $part->vertices ) {
654 if( ref( $contig->{$wit} ) ) {
655 push( @{$contig->{$wit}}, $gst );
656 } elsif( $contig->{$wit} ne $gst ) {
657 warn "How did we get here?";
662 # Save the relevant subgraph.
663 $subgraph->{$gst} = $part;
666 # For each of our hypothetical readings, flatten its 'contig' array if
667 # the array contains zero or one group. If we have any unflattened arrays,
668 # we may need to run the resolution process. If the reading is already known
669 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
672 foreach my $wit ( keys %$contig ) {
673 next unless ref( $contig->{$wit} );
674 if( @{$contig->{$wit}} > 1 ) {
675 if( $is_conflicted ) {
676 $contig->{$wit} = ''; # We aren't going to decide.
678 push( @resolve, $wit );
681 my $gst = pop @{$contig->{$wit}};
682 $contig->{$wit} = $gst || '';
687 my $still_contig = {};
688 foreach my $h ( @resolve ) {
689 # For each of the hypothetical readings with more than one possibility,
690 # try deleting it from each of its member subgraphs in turn, and see
691 # if that breaks the contiguous grouping.
692 # TODO This can still break in a corner case where group A can use
693 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
694 # Revisit this if necessary; it could get brute-force nasty.
695 foreach my $gst ( @{$contig->{$h}} ) {
696 my $gpart = $subgraph->{$gst}->copy();
697 # If we have come this far, there is only one root and everything
698 # is reachable from it.
699 my( $root ) = $gpart->predecessorless_vertices;
701 map { $reachable->{$_} = 1 } $gpart->vertices;
703 # Try deleting the hypothetical node.
704 $gpart->delete_vertex( $h );
706 # See if we still have a single root.
707 my @roots = $gpart->predecessorless_vertices;
708 warn "This shouldn't have happened" unless @roots;
710 # $h is needed by this group.
711 if( exists( $still_contig->{$h} ) ) {
713 $conflict->{$gst} = 1;
714 $still_contig->{$h} = '';
716 $still_contig->{$h} = $gst;
720 # $h is somewhere in the middle. See if everything
721 # else can still be reached from the root.
722 my %still_reachable = ( $root => 1 );
723 map { $still_reachable{$_} = 1 }
724 $gpart->all_successors( $root );
725 foreach my $v ( keys %$reachable ) {
727 if( !$still_reachable{$v}
728 && ( $contig->{$v} eq $gst
729 || ( exists $still_contig->{$v}
730 && $still_contig->{$v} eq $gst ) ) ) {
732 if( exists $still_contig->{$h} ) {
734 $conflict->{$gst} = 1;
735 $still_contig->{$h} = '';
737 $still_contig->{$h} = $gst;
740 } # else we don't need $h in this group.
742 } # endif $h eq $root
746 # Now we have some hypothetical vertices in $still_contig that are the
747 # "real" group memberships. Replace these in $contig.
748 foreach my $v ( keys %$contig ) {
749 next unless ref $contig->{$v};
750 $contig->{$v} = $still_contig->{$v};
754 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
755 my $variant_row = [ [], $is_genealogical ];
756 # Fill in the groupings from $contig.
757 foreach my $g ( @$groups ) {
758 my $gst = wit_stringify( $g );
759 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
760 push( @{$variant_row->[0]}, \@realgroup );
766 my( $group, $stemma ) = @_;
767 # Get these into a form prune_subtree will recognize. Make a "contighash"
769 map { $hypohash->{$_} = 1 } @$group;
770 # ...with reference values for hypotheticals.
771 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
773 my $subgraph = $stemma->graph->copy;
774 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
776 # ...and find the root.
777 my( $root ) = $subgraph->predecessorless_vertices;
778 # Now prune and return the remaining vertices.
779 _prune_subtree( $subgraph, $root, $hypohash );
780 return $subgraph->vertices;
784 my( $tree, $root, $contighash ) = @_;
785 # First, delete hypothetical leaves / orphans until there are none left.
786 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
787 $tree->successorless_vertices;
788 while( @orphan_hypotheticals ) {
789 $tree->delete_vertices( @orphan_hypotheticals );
790 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
791 $tree->successorless_vertices;
793 # Then delete a hypothetical root with only one successor, moving the
794 # root to the first child that has no other predecessors.
795 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
796 my @nextroot = $tree->successors( $root );
797 $tree->delete_vertex( $root );
798 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
800 # The tree has been modified in place, but we need to know the new root.
801 $root = undef unless $root && $tree->has_vertex( $root );
804 # Add the variant, subject to a.c. representation logic.
805 # This assumes that we will see the 'main' version before the a.c. version.
806 sub add_variant_wit {
807 my( $arr, $wit, $acstr ) = @_;
809 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
811 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
813 push( @$arr, $wit ) unless $skip;
816 sub _useful_variant {
817 my( $group_readings, $graph, $acstr ) = @_;
819 # TODO Decide what to do with AC witnesses
821 # Sort by group size and return
823 my( @readings, @groups ); # The sorted groups for our answer.
824 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
825 keys %$group_readings ) {
826 push( @readings, $rdg );
827 push( @groups, $group_readings->{$rdg} );
828 if( @{$group_readings->{$rdg}} > 1 ) {
831 my( $wit ) = @{$group_readings->{$rdg}};
832 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
833 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
836 if( $is_useful > 1 ) {
837 return( \@readings, \@groups );
843 =head2 wit_stringify( $groups )
845 Takes an array of witness groupings and produces a string like
846 ['A','B'] / ['C','D','E'] / ['F']
853 # If we were passed an array of witnesses instead of an array of
854 # groupings, then "group" the witnesses first.
855 unless( ref( $groups->[0] ) ) {
856 my $mkgrp = [ $groups ];
859 foreach my $g ( @$groups ) {
860 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
862 return join( ' / ', @gst );
865 # Helper function to ensure that X and X a.c. never appear in the same list.
866 sub _add_to_witlist {
867 my( $wit, $list, $acstr ) = @_;
870 map { $inlist{$_} = $idx++ } @$list;
871 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
873 unless( exists $inlist{$acwit} ) {
874 push( @$list, $acwit.$acstr );
877 if( exists( $inlist{$wit.$acstr} ) ) {
878 # Replace the a.c. version with the main witness
879 my $i = $inlist{$wit.$acstr};
882 push( @$list, $wit );
888 my( $lista, $listb ) = @_;
891 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
892 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
893 my @set = grep { $union{$_} == 1 } keys %union;
894 return map { $scalars{$_} } @set;
901 This package is free software and is provided "as is" without express
902 or implied warranty. You can redistribute it and/or modify it under
903 the same terms as Perl itself.
907 Tara L Andrews E<lt>aurum@cpan.orgE<gt>