1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
9 use JSON qw/ encode_json decode_json /;
12 use Text::Tradition::Stemma;
14 use vars qw/ @EXPORT_OK /;
15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
37 Text::Tradition is a library for representation and analysis of collated
38 texts, particularly medieval ones. The Collation is the central feature of
39 a Tradition, where the text, its sequence of readings, and its relationships
40 between readings are actually kept.
44 =head2 run_analysis( $tradition, %opts )
46 Runs the analysis described in analyze_variant_location on every location in the
47 collation of the given tradition, with the given options. These include:
51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
52 is 0 (i.e. the first).
54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
56 =item * merge_types - Specify a list of relationship types, where related readings
57 should be treated as identical for the purposes of analysis.
59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
68 my $datafile = 't/data/florilegium_tei_ps.xml';
69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
71 'file' => $datafile );
72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
75 my %expected_genealogical = (
106 my $data = run_analysis( $tradition );
107 my $c = $tradition->collation;
108 foreach my $row ( @{$data->{'variants'}} ) {
109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
113 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
114 is( $gen_bool, $expected_genealogical{$row->{'id'}},
115 "Got correct genealogical flag for row " . $row->{'id'} );
116 # Check that we have the right row with the right groups
117 my $rank = $row->{'id'};
118 foreach my $rdghash ( @{$row->{'readings'}} ) {
119 # Skip 'readings' that aren't really
120 next unless $c->reading( $rdghash->{'readingid'} );
122 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
123 "Got correct reading rank" );
124 # Check the witnesses
125 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
126 my @sgrp = sort @{$rdghash->{'group'}};
127 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
130 is( $data->{'variant_count'}, 58, "Got right total variant number" );
131 # TODO Make something meaningful of conflict count, maybe test other bits
138 my( $tradition, %opts ) = @_;
139 my $c = $tradition->collation;
141 my $stemma_id = $opts{'stemma_id'} || 0;
142 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
143 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
146 my $stemma = $tradition->stemma( $stemma_id );
148 # Figure out which witnesses we are working with - that is, the ones that
149 # appear both in the stemma and in the tradition. All others are 'lacunose'
151 my @lacunose = $stemma->hypotheticals;
152 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
153 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
155 # Find and mark 'common' ranks for exclusion, unless they were
156 # explicitly specified.
159 foreach my $rdg ( $c->common_readings ) {
160 $common_rank{$rdg->rank} = 1;
162 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
165 # Group the variants to send to the solver
169 foreach my $rank ( @ranks ) {
170 my $missing = [ @lacunose ];
171 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
172 if( $opts{'exclude_type1'} ) {
173 # Check to see whether this is a "useful" group.
174 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
175 $stemma->graph, $c->ac_label );
178 push( @use_ranks, $rank );
179 push( @groups, $rankgroup );
180 $lacunae{$rank} = $missing;
183 my $answer = solve_variants( $stemma, @groups );
185 # Do further analysis on the answer
186 my $conflict_count = 0;
187 my $aclabel = $c->ac_label;
188 foreach my $idx ( 0 .. $#use_ranks ) {
189 my $location = $answer->{'variants'}->[$idx];
190 # Add the rank back in
191 $location->{'id'} = $use_ranks[$idx];
192 # Note what our lacunae are
194 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
195 # Run through the reading groups and add as 'lacunae' any redundant
196 # a.c. witnesses (yes, we have to do this before the analysis, thus
197 # identical loops before and after. Boo.)
198 # TODO Consider making these callbacks to analyze_location
199 foreach my $rdghash ( @{$location->{'readings'}} ) {
201 map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
202 foreach my $rw ( keys %rwits ) {
203 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
204 if( exists $rwits{$1} ) {
210 $rdghash->{'group'} = [ keys %rwits ];
212 $location->{'missing'} = [ keys %lmiss ];
214 # Run the extra analysis we need.
215 analyze_location( $tradition, $stemma->graph, $location );
217 # Do the final post-analysis tidying up of the data.
218 foreach my $rdghash ( @{$location->{'readings'}} ) {
220 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
221 # Add the reading text back in
222 my $rdg = $c->reading( $rdghash->{'readingid'} );
223 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
224 # Remove lacunose witnesses from this reading's list now that the
227 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
228 $rdghash->{'group'} = \@realgroup;
229 # TODO Record hypotheticals used to create group, if we end up
233 $answer->{'conflict_count'} = $conflict_count;
238 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
240 Groups the variants at the given $rank of the collation, treating any
241 relationships in @merge_relationship_types as equivalent. $lacunose should
242 be a reference to an array, to which the sigla of lacunose witnesses at this
243 rank will be appended.
245 Returns a hash $group_readings where $rdg is attested by the witnesses listed
246 in $group_readings->{$rdg}.
250 # Return group_readings, groups, lacunose
252 my( $tradition, $rank, $lacunose, $collapse ) = @_;
253 my $c = $tradition->collation;
254 my $aclabel = $c->ac_label;
256 # Get the alignment table readings
257 my %readings_at_rank;
258 my %is_lacunose; # lookup table for $lacunose
259 map { $is_lacunose{$_} = 1 } @$lacunose;
261 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
262 my $rdg = $tablewit->{'tokens'}->[$rank-1];
263 my $wit = $tablewit->{'witness'};
264 # Exclude the witness if it is "lacunose" which if we got here
265 # means "not in the stemma".
266 next if $is_lacunose{$wit};
267 if( $rdg && $rdg->{'t'}->is_lacuna ) {
268 _add_to_witlist( $wit, $lacunose, $aclabel );
270 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
272 _add_to_witlist( $wit, \@gap_wits, $aclabel );
276 # Group the readings, collapsing groups by relationship if needed
277 my %grouped_readings;
278 foreach my $rdg ( values %readings_at_rank ) {
279 # Skip readings that have been collapsed into others.
280 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
281 # Get the witness list, including from readings collapsed into this one.
282 my @wits = $rdg->witnesses;
284 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
285 foreach my $other ( $rdg->related_readings( $filter ) ) {
286 my @otherwits = $other->witnesses;
287 push( @wits, @otherwits );
288 $grouped_readings{$other->id} = 0;
291 # Filter the group to those witnesses in the stemma
293 foreach my $wit ( @wits ) {
294 next if $is_lacunose{$wit};
295 push( @use_wits, $wit );
297 $grouped_readings{$rdg->id} = \@use_wits;
299 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
300 # Get rid of our collapsed readings
301 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
302 keys %grouped_readings
306 return \%grouped_readings;
309 # Helper function to ensure that X and X a.c. never appear in the same list.
310 sub _add_to_witlist {
311 my( $wit, $list, $acstr ) = @_;
314 map { $inlist{$_} = $idx++ } @$list;
315 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
317 unless( exists $inlist{$acwit} ) {
318 push( @$list, $acwit.$acstr );
321 if( exists( $inlist{$wit.$acstr} ) ) {
322 # Replace the a.c. version with the main witness
323 my $i = $inlist{$wit.$acstr};
326 push( @$list, $wit );
331 =head2 solve_variants( $graph, @groups )
333 Sends the set of groups to the external graph solver service and returns
334 a cleaned-up answer, adding the rank IDs back where they belong.
336 The JSON has the form
337 { "graph": [ stemmagraph DOT string without newlines ],
338 "groupings": [ array of arrays of groups, one per rank ] }
340 The answer has the form
341 { "variants" => [ array of variant location structures ],
342 "variant_count" => total,
343 "conflict_count" => number of conflicts detected,
344 "genealogical_count" => number of solutions found }
349 my( $stemma, @groups ) = @_;
350 my $aclabel = $stemma->collation->ac_label;
352 # Filter the groups down to distinct groups, and work out what graph
353 # should be used in the calculation of each group. We want to send each
354 # distinct problem to the solver only once.
355 # We need a whole bunch of lookup tables for this.
356 my $index_groupkeys = {}; # Save the order of readings
357 my $group_indices = {}; # Save the indices that have a given grouping
358 my $graph_problems = {}; # Save the groupings for the given graph
360 foreach my $idx ( 0..$#groups ) {
361 my $ghash = $groups[$idx];
363 # Sort the groupings from big to little, and scan for a.c. witnesses
364 # that would need an extended graph.
365 my @acwits; # note which AC witnesses crop up at this rank
366 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
368 foreach my $rdg ( @idxkeys ) {
369 my @sg = sort @{$ghash->{$rdg}};
370 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
371 push( @grouping, \@sg );
373 # Save the reading order
374 $index_groupkeys->{$idx} = \@idxkeys;
376 # Now associate the distinct group with this index
377 my $gstr = wit_stringify( \@grouping );
378 push( @{$group_indices->{$gstr}}, $idx );
380 # Finally, add the group to the list to be calculated for this graph.
381 map { s/\Q$aclabel\E$// } @acwits;
382 my $graph = $stemma->extend_graph( \@acwits );
383 unless( exists $graph_problems->{"$graph"} ) {
384 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
386 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
389 ## For each distinct graph, send its groups to the solver.
391 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
392 my $ua = LWP::UserAgent->new();
393 ## Witness map is a HACK to get around limitations in node names from IDP
394 my $witness_map = {};
395 ## Variables to store answers as they come back
396 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
397 my $genealogical = 0;
398 foreach my $graphkey ( keys %$graph_problems ) {
399 my $graph = $graph_problems->{$graphkey}->{'object'};
400 my $groupings = $graph_problems->{$graphkey}->{'groups'};
401 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
402 $groupings, $witness_map ) );
403 # Send it off and get the result
404 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
405 'Content' => $json );
408 if( $resp->is_success ) {
409 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
412 # Fall back to the old method.
413 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
414 . "; falling back to perl method";
415 $answer = perl_solver( $graph, @$groupings );
417 ## The answer is the evaluated groupings, plus a boolean for whether
418 ## they were genealogical. Reconstruct our original groups.
419 foreach my $gidx ( 0 .. $#{$groupings} ) {
420 my( $calc_groups, $result ) = @{$answer->[$gidx]};
423 # Prune the calculated groups, in case the IDP solver failed to.
426 foreach my $cg ( @$calc_groups ) {
427 # This is a little wasteful but the path of least
428 # resistance. Send both the stemma, which knows what
429 # its hypotheticals are, and the actual graph used.
430 my @pg = _prune_group( $cg, $stemma, $graph );
431 push( @pruned_groups, \@pg );
433 $calc_groups = \@pruned_groups;
436 # Retrieve the key for the original group that went to the solver
437 my $input_group = wit_stringify( $groupings->[$gidx] );
438 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
439 my @readings = @{$index_groupkeys->{$oidx}};
441 'genealogical' => $result,
444 foreach my $ridx ( 0 .. $#readings ) {
445 push( @{$vstruct->{'readings'}},
446 { 'readingid' => $readings[$ridx],
447 'group' => $calc_groups->[$ridx] } );
449 $variants->[$oidx] = $vstruct;
454 return { 'variants' => $variants,
455 'variant_count' => scalar @$variants,
456 'genealogical_count' => $genealogical };
459 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
461 sub _safe_wit_strings {
462 my( $graph, $c, $groupings, $witness_map ) = @_;
463 # Parse the graph we were given into a stemma.
464 my $safegraph = Graph->new();
465 # Convert the graph to a safe representation and store the conversion.
466 foreach my $n ( $graph->vertices ) {
467 my $sn = _safe_witstr( $n );
468 if( exists $witness_map->{$sn} ) {
469 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
470 if $witness_map->{$sn} ne $n;
472 $witness_map->{$sn} = $n;
474 $safegraph->add_vertex( $sn );
475 $safegraph->set_vertex_attributes( $sn,
476 $graph->get_vertex_attributes( $n ) );
478 foreach my $e ( $graph->edges ) {
479 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
480 $safegraph->add_edge( @safe_e );
482 my $safe_stemma = Text::Tradition::Stemma->new(
483 'collation' => $c, 'graph' => $safegraph );
485 # Now convert the witness groupings to a safe representation.
486 my $safe_groupings = [];
487 foreach my $grouping ( @$groupings ) {
488 my $safe_grouping = [];
489 foreach my $group ( @$grouping ) {
491 foreach my $n ( @$group ) {
492 my $sn = _safe_witstr( $n );
493 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
494 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
495 $witness_map->{$sn} = $n;
496 push( @$safe_group, $sn );
498 push( @$safe_grouping, $safe_group );
500 push( @$safe_groupings, $safe_grouping );
503 # Return it all in the struct we expect. We have stored the reductions
504 # in the $witness_map that we were passed.
505 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
506 'groupings' => $safe_groupings };
511 $witstr =~ s/\s+/_/g;
512 $witstr =~ s/[^\w\d-]//g;
516 sub _desanitize_names {
517 my( $jsonstruct, $witness_map ) = @_;
519 foreach my $grouping ( @$jsonstruct ) {
520 my $real_grouping = [];
521 foreach my $element ( @$grouping ) {
522 if( ref( $element ) eq 'ARRAY' ) {
524 my $real_groupset = [];
525 foreach my $group ( @$element ) {
527 foreach my $n ( @$group ) {
528 my $rn = $witness_map->{$n};
529 push( @$real_group, $rn );
531 push( @$real_groupset, $real_group );
533 push( @$real_grouping, $real_groupset );
535 # It is the boolean, not actually a group.
536 push( @$real_grouping, $element );
539 push( @$result, $real_grouping );
546 =head2 analyze_location ( $tradition, $graph, $location_hash )
548 Given the tradition, its stemma graph, and the solution from the graph solver,
549 work out the rest of the information we want. For each reading we need missing,
550 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
554 sub analyze_location {
555 my ( $tradition, $graph, $variant_row ) = @_;
557 # Make a hash of all known node memberships, and make the subgraphs.
559 my $reading_roots = {};
561 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
562 my $rid = $rdghash->{'readingid'};
563 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
566 my $part = $graph->copy;
568 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
569 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
570 $subgraph->{$rid} = $part;
571 # Get the reading roots.
572 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
575 # Now that we have all the node group memberships, calculate followed/
576 # non-followed/unknown values for each reading. Also figure out the
577 # reading's evident parent(s).
578 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
579 # Group string key - TODO do we need this?
580 my $gst = wit_stringify( $rdghash->{'group'} );
581 my $rid = $rdghash->{'readingid'};
583 my $part = $subgraph->{$rid};
585 # Start figuring things out.
586 my @roots = $part->predecessorless_vertices;
587 $rdghash->{'independent_occurrence'} = scalar @roots;
588 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
589 # Find the parent readings, if any, of this reading.
591 foreach my $wit ( @roots ) {
592 # Look in the main stemma to find this witness's extant or known-reading
593 # immediate ancestor(s), and look up the reading that each ancestor olds.
594 my @check = $graph->predecessors( $wit );
597 foreach my $wparent( @check ) {
598 my $preading = $contig->{$wparent};
600 $rdgparents{$preading} = 1;
602 push( @next, $graph->predecessors( $wparent ) );
608 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
610 # Find the number of times this reading was altered, and the number of
611 # times we're not sure.
612 my( %nofollow, %unknownfollow );
613 foreach my $wit ( $part->vertices ) {
614 foreach my $wchild ( $graph->successors( $wit ) ) {
615 next if $part->has_vertex( $wchild );
616 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
617 # It definitely changed here.
618 $nofollow{$wchild} = 1;
619 } elsif( !($contig->{$wchild}) ) {
620 # The child is a hypothetical node not definitely in
621 # any group. Answer is unknown.
622 $unknownfollow{$wchild} = 1;
623 } # else it's a non-root node in a known group, and therefore
624 # is presumed to have its reading from its group, not this link.
627 $rdghash->{'not_followed'} = keys %nofollow;
628 $rdghash->{'follow_unknown'} = keys %unknownfollow;
630 # Now say whether this reading represents a conflict.
631 unless( $variant_row->{'genealogical'} ) {
632 $rdghash->{'conflict'} = @roots != 1;
638 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
640 ** NOTE ** This method should hopefully not be called - it is not guaranteed
641 to be correct. Serves as a backup for the real solver.
643 Runs an analysis of the given tradition, at the location given in $rank,
644 against the graph of the stemma specified in $stemma_id. The argument
645 @merge_relationship_types is an optional list of relationship types for
646 which readings so related should be treated as equivalent.
648 Returns a nested array data structure as follows:
650 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
652 where the group list is the array of arrays passed in for each element of @groups,
653 possibly with the addition of hypothetical readings.
659 my( $graph, @groups ) = @_;
661 foreach my $g ( @groups ) {
662 push( @answer, _solve_variant_location( $graph, $g ) );
667 sub _solve_variant_location {
668 my( $graph, $groups ) = @_;
675 # Mark each ms as in its own group, first.
676 foreach my $g ( @$groups ) {
677 my $gst = wit_stringify( $g );
678 map { $contig->{$_} = $gst } @$g;
681 # Now for each unmarked node in the graph, initialize an array
682 # for possible group memberships. We will use this later to
683 # resolve potential conflicts.
684 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
685 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
686 my $gst = wit_stringify( $g ); # This is the group name
687 # Copy the graph, and delete all non-members from the new graph.
688 my $part = $graph->copy;
690 $part->delete_vertices(
691 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
693 # Now look to see if our group is connected.
695 # We have to take directionality into account.
696 # How many root nodes do we have?
697 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
698 $part->predecessorless_vertices;
699 # Assuming that @$g > 1, find the first root node that has at
700 # least one successor belonging to our group. If this reading
701 # is genealogical, there should be only one, but we will check
702 # that implicitly later.
703 foreach my $root ( @roots ) {
704 # Prune the tree to get rid of extraneous hypotheticals.
705 $root = _prune_subtree( $part, $root, $contig );
707 # Save this root for our group.
708 push( @group_roots, $root );
709 # Get all the successor nodes of our root.
712 # Dispense with the trivial case of one reading.
714 @group_roots = ( $wit );
715 foreach my $v ( $part->vertices ) {
716 $part->delete_vertex( $v ) unless $v eq $wit;
720 if( @group_roots > 1 ) {
721 $conflict->{$gst} = 1;
724 # Paint the 'hypotheticals' with our group.
725 foreach my $wit ( $part->vertices ) {
726 if( ref( $contig->{$wit} ) ) {
727 push( @{$contig->{$wit}}, $gst );
728 } elsif( $contig->{$wit} ne $gst ) {
729 warn "How did we get here?";
734 # Save the relevant subgraph.
735 $subgraph->{$gst} = $part;
738 # For each of our hypothetical readings, flatten its 'contig' array if
739 # the array contains zero or one group. If we have any unflattened arrays,
740 # we may need to run the resolution process. If the reading is already known
741 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
744 foreach my $wit ( keys %$contig ) {
745 next unless ref( $contig->{$wit} );
746 if( @{$contig->{$wit}} > 1 ) {
747 if( $is_conflicted ) {
748 $contig->{$wit} = ''; # We aren't going to decide.
750 push( @resolve, $wit );
753 my $gst = pop @{$contig->{$wit}};
754 $contig->{$wit} = $gst || '';
759 my $still_contig = {};
760 foreach my $h ( @resolve ) {
761 # For each of the hypothetical readings with more than one possibility,
762 # try deleting it from each of its member subgraphs in turn, and see
763 # if that breaks the contiguous grouping.
764 # TODO This can still break in a corner case where group A can use
765 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
766 # Revisit this if necessary; it could get brute-force nasty.
767 foreach my $gst ( @{$contig->{$h}} ) {
768 my $gpart = $subgraph->{$gst}->copy();
769 # If we have come this far, there is only one root and everything
770 # is reachable from it.
771 my( $root ) = $gpart->predecessorless_vertices;
773 map { $reachable->{$_} = 1 } $gpart->vertices;
775 # Try deleting the hypothetical node.
776 $gpart->delete_vertex( $h );
778 # See if we still have a single root.
779 my @roots = $gpart->predecessorless_vertices;
780 warn "This shouldn't have happened" unless @roots;
782 # $h is needed by this group.
783 if( exists( $still_contig->{$h} ) ) {
785 $conflict->{$gst} = 1;
786 $still_contig->{$h} = '';
788 $still_contig->{$h} = $gst;
792 # $h is somewhere in the middle. See if everything
793 # else can still be reached from the root.
794 my %still_reachable = ( $root => 1 );
795 map { $still_reachable{$_} = 1 }
796 $gpart->all_successors( $root );
797 foreach my $v ( keys %$reachable ) {
799 if( !$still_reachable{$v}
800 && ( $contig->{$v} eq $gst
801 || ( exists $still_contig->{$v}
802 && $still_contig->{$v} eq $gst ) ) ) {
804 if( exists $still_contig->{$h} ) {
806 $conflict->{$gst} = 1;
807 $still_contig->{$h} = '';
809 $still_contig->{$h} = $gst;
812 } # else we don't need $h in this group.
814 } # endif $h eq $root
818 # Now we have some hypothetical vertices in $still_contig that are the
819 # "real" group memberships. Replace these in $contig.
820 foreach my $v ( keys %$contig ) {
821 next unless ref $contig->{$v};
822 $contig->{$v} = $still_contig->{$v};
826 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
827 my $variant_row = [ [], $is_genealogical ];
828 # Fill in the groupings from $contig.
829 foreach my $g ( @$groups ) {
830 my $gst = wit_stringify( $g );
831 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
832 push( @{$variant_row->[0]}, \@realgroup );
838 my( $group, $stemma, $graph ) = @_;
839 # Get these into a form prune_subtree will recognize. Make a "contighash"
841 map { $hypohash->{$_} = 1 } @$group;
842 # ...with reference values for hypotheticals.
843 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
845 my $subgraph = $graph->copy;
846 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
848 # ...and find the root.
849 my( $root ) = $subgraph->predecessorless_vertices;
850 # Now prune and return the remaining vertices.
851 _prune_subtree( $subgraph, $root, $hypohash );
852 return $subgraph->vertices;
856 my( $tree, $root, $contighash ) = @_;
857 # First, delete hypothetical leaves / orphans until there are none left.
858 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
859 $tree->successorless_vertices;
860 while( @orphan_hypotheticals ) {
861 $tree->delete_vertices( @orphan_hypotheticals );
862 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
863 $tree->successorless_vertices;
865 # Then delete a hypothetical root with only one successor, moving the
866 # root to the first child that has no other predecessors.
867 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
868 my @nextroot = $tree->successors( $root );
869 $tree->delete_vertex( $root );
870 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
872 # The tree has been modified in place, but we need to know the new root.
873 $root = undef unless $root && $tree->has_vertex( $root );
876 # Add the variant, subject to a.c. representation logic.
877 # This assumes that we will see the 'main' version before the a.c. version.
878 sub add_variant_wit {
879 my( $arr, $wit, $acstr ) = @_;
881 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
883 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
885 push( @$arr, $wit ) unless $skip;
888 sub _useful_variant {
889 my( $group_readings, $graph, $acstr ) = @_;
891 # TODO Decide what to do with AC witnesses
893 # Sort by group size and return
895 my( @readings, @groups ); # The sorted groups for our answer.
896 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
897 keys %$group_readings ) {
898 push( @readings, $rdg );
899 push( @groups, $group_readings->{$rdg} );
900 if( @{$group_readings->{$rdg}} > 1 ) {
903 my( $wit ) = @{$group_readings->{$rdg}};
904 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
905 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
908 if( $is_useful > 1 ) {
909 return( \@readings, \@groups );
915 =head2 wit_stringify( $groups )
917 Takes an array of witness groupings and produces a string like
918 ['A','B'] / ['C','D','E'] / ['F']
925 # If we were passed an array of witnesses instead of an array of
926 # groupings, then "group" the witnesses first.
927 unless( ref( $groups->[0] ) ) {
928 my $mkgrp = [ $groups ];
931 foreach my $g ( @$groups ) {
932 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
934 return join( ' / ', @gst );
938 my( $lista, $listb ) = @_;
941 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
942 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
943 my @set = grep { $union{$_} == 1 } keys %union;
944 return map { $scalars{$_} } @set;
951 This package is free software and is provided "as is" without express
952 or implied warranty. You can redistribute it and/or modify it under
953 the same terms as Perl itself.
957 Tara L Andrews E<lt>aurum@cpan.orgE<gt>