1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
9 use JSON qw/ encode_json decode_json /;
12 use Text::Tradition::Stemma;
14 use vars qw/ @EXPORT_OK /;
15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
37 Text::Tradition is a library for representation and analysis of collated
38 texts, particularly medieval ones. The Collation is the central feature of
39 a Tradition, where the text, its sequence of readings, and its relationships
40 between readings are actually kept.
44 =head2 run_analysis( $tradition, %opts )
46 Runs the analysis described in analyze_variant_location on every location in the
47 collation of the given tradition, with the given options. These include:
51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
52 is 0 (i.e. the first).
54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
56 =item * merge_types - Specify a list of relationship types, where related readings
57 should be treated as identical for the purposes of analysis.
59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
68 my $datafile = 't/data/florilegium_tei_ps.xml';
69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
71 'file' => $datafile );
72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
75 my %expected_genealogical = (
106 my $data = run_analysis( $tradition );
107 my $c = $tradition->collation;
108 foreach my $row ( @{$data->{'variants'}} ) {
109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
113 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
114 is( $gen_bool, $expected_genealogical{$row->{'id'}},
115 "Got correct genealogical flag for row " . $row->{'id'} );
116 # Check that we have the right row with the right groups
117 my $rank = $row->{'id'};
118 foreach my $rdghash ( @{$row->{'readings'}} ) {
119 # Skip 'readings' that aren't really
120 next unless $c->reading( $rdghash->{'readingid'} );
122 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
123 "Got correct reading rank" );
124 # Check the witnesses
125 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
126 my @sgrp = sort @{$rdghash->{'group'}};
127 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
130 is( $data->{'variant_count'}, 58, "Got right total variant number" );
131 # TODO Make something meaningful of conflict count, maybe test other bits
138 my( $tradition, %opts ) = @_;
139 my $c = $tradition->collation;
141 my $stemma_id = $opts{'stemma_id'} || 0;
142 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
143 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
146 my $stemma = $tradition->stemma( $stemma_id );
148 # Figure out which witnesses we are working with - that is, the ones that
149 # appear both in the stemma and in the tradition. All others are 'lacunose'
151 my @lacunose = $stemma->hypotheticals;
152 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
153 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
155 # Find and mark 'common' ranks for exclusion, unless they were
156 # explicitly specified.
159 foreach my $rdg ( $c->common_readings ) {
160 $common_rank{$rdg->rank} = 1;
162 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
165 # Group the variants to send to the solver
170 foreach my $rank ( @ranks ) {
171 my $missing = [ @lacunose ];
172 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
173 # Filter out any empty rankgroups
174 # (e.g. from the later rank for a transposition)
175 next unless keys %$rankgroup;
176 if( $opts{'exclude_type1'} ) {
177 # Check to see whether this is a "useful" group.
178 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
179 $stemma->graph, $c->ac_label );
182 push( @use_ranks, $rank );
183 push( @groups, $rankgroup );
184 $lacunae{$rank} = $missing;
187 my $answer = solve_variants( $stemma, @groups );
189 # Do further analysis on the answer
190 my $conflict_count = 0;
191 my $aclabel = $c->ac_label;
192 foreach my $idx ( 0 .. $#use_ranks ) {
193 my $location = $answer->{'variants'}->[$idx];
194 # Add the rank back in
195 my $rank = $use_ranks[$idx];
196 $location->{'id'} = $rank;
197 # Note what our lacunae are
199 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
200 $location->{'missing'} = [ keys %lmiss ];
202 # Run the extra analysis we need.
203 analyze_location( $tradition, $stemma->graph, $location, \%lmiss );
205 # Do the final post-analysis tidying up of the data.
206 foreach my $rdghash ( @{$location->{'readings'}} ) {
208 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
209 # Add the reading text back in, setting display value as needed
210 my $rdg = $c->reading( $rdghash->{'readingid'} );
212 $rdghash->{'text'} = $rdg->text .
213 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
215 # Remove lacunose witnesses from this reading's list now that the
218 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
219 $rdghash->{'group'} = \@realgroup;
220 # TODO Record hypotheticals used to create group, if we end up
224 $answer->{'conflict_count'} = $conflict_count;
229 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
231 Groups the variants at the given $rank of the collation, treating any
232 relationships in @merge_relationship_types as equivalent. $lacunose should
233 be a reference to an array, to which the sigla of lacunose witnesses at this
234 rank will be appended; $transposed should be a reference to a hash, wherein
235 the identities of transposed readings and their relatives will be stored.
237 Returns a hash $group_readings where $rdg is attested by the witnesses listed
238 in $group_readings->{$rdg}.
242 # Return group_readings, groups, lacunose
244 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
245 my $c = $tradition->collation;
246 my $aclabel = $c->ac_label;
247 # Get the alignment table readings
248 my %readings_at_rank;
249 my %is_lacunose; # lookup table for $lacunose
250 map { $is_lacunose{$_} = 1 } @$lacunose;
253 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
254 my $rdg = $tablewit->{'tokens'}->[$rank-1];
255 my $wit = $tablewit->{'witness'};
256 # Exclude the witness if it is "lacunose" which if we got here
257 # means "not in the stemma".
258 next if $is_lacunose{$wit};
259 # Note if the witness is actually in a lacuna
260 if( $rdg && $rdg->{'t'}->is_lacuna ) {
261 _add_to_witlist( $wit, $lacunose, $aclabel );
262 # Otherwise the witness either has a positive reading...
264 # If the reading has been counted elsewhere as a transposition, ignore it.
265 if( $transposed->{$rdg->{'t'}->id} ) {
266 # TODO This doesn't cope with three-way transpositions
267 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
270 # Otherwise, record it...
271 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
272 # ...and grab any transpositions, and their relations.
273 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
274 foreach my $trdg ( @transp ) {
275 map { $moved_wits{$_} = 1 } $trdg->witnesses;
276 $transposed->{$trdg->id} = [ $rdg->{'t'}->witnesses ];
277 $readings_at_rank{$trdg->id} = $trdg;
279 # ...or it is empty, ergo a gap.
281 push( @check_for_gaps, $wit );
285 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
286 unless $moved_wits{$_} } @check_for_gaps;
287 # TODO check for, and break into a new row, any doubled-up witness readings
288 # after transposition...
289 # Group the readings, collapsing groups by relationship if needed
290 my %grouped_readings;
291 foreach my $rdg ( values %readings_at_rank ) {
292 # Skip readings that have been collapsed into others.
293 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
294 # Get the witness list, including from readings collapsed into this one.
295 my @wits = $rdg->witnesses;
297 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
298 foreach my $other ( $rdg->related_readings( $filter ) ) {
299 my @otherwits = $other->witnesses;
300 push( @wits, @otherwits );
301 $grouped_readings{$other->id} = 0;
304 # Filter the group to those witnesses in the stemma
306 foreach my $wit ( @wits ) {
307 next if $is_lacunose{$wit};
308 push( @use_wits, $wit );
310 $grouped_readings{$rdg->id} = \@use_wits;
312 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
313 # Get rid of our collapsed readings
314 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
315 keys %grouped_readings
319 return \%grouped_readings;
322 # Helper function to ensure that X and X a.c. never appear in the same list.
323 sub _add_to_witlist {
324 my( $wit, $list, $acstr ) = @_;
327 map { $inlist{$_} = $idx++ } @$list;
328 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
330 unless( exists $inlist{$acwit} ) {
331 push( @$list, $acwit.$acstr );
334 if( exists( $inlist{$wit.$acstr} ) ) {
335 # Replace the a.c. version with the main witness
336 my $i = $inlist{$wit.$acstr};
339 push( @$list, $wit );
344 =head2 solve_variants( $graph, @groups )
346 Sends the set of groups to the external graph solver service and returns
347 a cleaned-up answer, adding the rank IDs back where they belong.
349 The JSON has the form
350 { "graph": [ stemmagraph DOT string without newlines ],
351 "groupings": [ array of arrays of groups, one per rank ] }
353 The answer has the form
354 { "variants" => [ array of variant location structures ],
355 "variant_count" => total,
356 "conflict_count" => number of conflicts detected,
357 "genealogical_count" => number of solutions found }
362 my( $stemma, @groups ) = @_;
363 my $aclabel = $stemma->collation->ac_label;
365 # Filter the groups down to distinct groups, and work out what graph
366 # should be used in the calculation of each group. We want to send each
367 # distinct problem to the solver only once.
368 # We need a whole bunch of lookup tables for this.
369 my $index_groupkeys = {}; # Save the order of readings
370 my $group_indices = {}; # Save the indices that have a given grouping
371 my $graph_problems = {}; # Save the groupings for the given graph
373 foreach my $idx ( 0..$#groups ) {
374 my $ghash = $groups[$idx];
376 # Sort the groupings from big to little, and scan for a.c. witnesses
377 # that would need an extended graph.
378 my @acwits; # note which AC witnesses crop up at this rank
379 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
381 foreach my $rdg ( @idxkeys ) {
382 my @sg = sort @{$ghash->{$rdg}};
383 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
384 push( @grouping, \@sg );
386 # Save the reading order
387 $index_groupkeys->{$idx} = \@idxkeys;
389 # Now associate the distinct group with this index
390 my $gstr = wit_stringify( \@grouping );
391 push( @{$group_indices->{$gstr}}, $idx );
393 # Finally, add the group to the list to be calculated for this graph.
394 map { s/\Q$aclabel\E$// } @acwits;
395 my $graph = $stemma->extend_graph( \@acwits );
396 unless( exists $graph_problems->{"$graph"} ) {
397 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
399 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
402 ## For each distinct graph, send its groups to the solver.
403 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
404 my $ua = LWP::UserAgent->new();
405 ## Witness map is a HACK to get around limitations in node names from IDP
406 my $witness_map = {};
407 ## Variables to store answers as they come back
408 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
409 my $genealogical = 0;
410 foreach my $graphkey ( keys %$graph_problems ) {
411 my $graph = $graph_problems->{$graphkey}->{'object'};
412 my $groupings = $graph_problems->{$graphkey}->{'groups'};
413 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
414 $groupings, $witness_map ) );
415 # Send it off and get the result
416 #print STDERR "Sending request: $json\n";
417 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
418 'Content' => $json );
421 if( $resp->is_success ) {
422 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
425 # Fall back to the old method.
426 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
427 . "; falling back to perl method";
428 $answer = perl_solver( $graph, @$groupings );
430 ## The answer is the evaluated groupings, plus a boolean for whether
431 ## they were genealogical. Reconstruct our original groups.
432 foreach my $gidx ( 0 .. $#{$groupings} ) {
433 my( $calc_groups, $result ) = @{$answer->[$gidx]};
436 # Prune the calculated groups, in case the IDP solver failed to.
439 foreach my $cg ( @$calc_groups ) {
440 # This is a little wasteful but the path of least
441 # resistance. Send both the stemma, which knows what
442 # its hypotheticals are, and the actual graph used.
443 my @pg = _prune_group( $cg, $stemma, $graph );
444 push( @pruned_groups, \@pg );
446 $calc_groups = \@pruned_groups;
449 # Retrieve the key for the original group that went to the solver
450 my $input_group = wit_stringify( $groupings->[$gidx] );
451 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
452 my @readings = @{$index_groupkeys->{$oidx}};
454 'genealogical' => $result,
457 foreach my $ridx ( 0 .. $#readings ) {
458 push( @{$vstruct->{'readings'}},
459 { 'readingid' => $readings[$ridx],
460 'group' => $calc_groups->[$ridx] } );
462 $variants->[$oidx] = $vstruct;
467 return { 'variants' => $variants,
468 'variant_count' => scalar @$variants,
469 'genealogical_count' => $genealogical };
472 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
474 sub _safe_wit_strings {
475 my( $graph, $c, $groupings, $witness_map ) = @_;
476 # Parse the graph we were given into a stemma.
477 my $safegraph = Graph->new();
478 # Convert the graph to a safe representation and store the conversion.
479 foreach my $n ( $graph->vertices ) {
480 my $sn = _safe_witstr( $n );
481 if( exists $witness_map->{$sn} ) {
482 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
483 if $witness_map->{$sn} ne $n;
485 $witness_map->{$sn} = $n;
487 $safegraph->add_vertex( $sn );
488 $safegraph->set_vertex_attributes( $sn,
489 $graph->get_vertex_attributes( $n ) );
491 foreach my $e ( $graph->edges ) {
492 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
493 $safegraph->add_edge( @safe_e );
495 my $safe_stemma = Text::Tradition::Stemma->new(
496 'collation' => $c, 'graph' => $safegraph );
498 # Now convert the witness groupings to a safe representation.
499 my $safe_groupings = [];
500 foreach my $grouping ( @$groupings ) {
501 my $safe_grouping = [];
502 foreach my $group ( @$grouping ) {
504 foreach my $n ( @$group ) {
505 my $sn = _safe_witstr( $n );
506 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
507 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
508 $witness_map->{$sn} = $n;
509 push( @$safe_group, $sn );
511 push( @$safe_grouping, $safe_group );
513 push( @$safe_groupings, $safe_grouping );
516 # Return it all in the struct we expect. We have stored the reductions
517 # in the $witness_map that we were passed.
518 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
519 'groupings' => $safe_groupings };
524 $witstr =~ s/\s+/_/g;
525 $witstr =~ s/[^\w\d-]//g;
529 sub _desanitize_names {
530 my( $jsonstruct, $witness_map ) = @_;
532 foreach my $grouping ( @$jsonstruct ) {
533 my $real_grouping = [];
534 foreach my $element ( @$grouping ) {
535 if( ref( $element ) eq 'ARRAY' ) {
537 my $real_groupset = [];
538 foreach my $group ( @$element ) {
540 foreach my $n ( @$group ) {
541 my $rn = $witness_map->{$n};
542 push( @$real_group, $rn );
544 push( @$real_groupset, $real_group );
546 push( @$real_grouping, $real_groupset );
548 # It is the boolean, not actually a group.
549 push( @$real_grouping, $element );
552 push( @$result, $real_grouping );
559 =head2 analyze_location ( $tradition, $graph, $location_hash )
561 Given the tradition, its stemma graph, and the solution from the graph solver,
562 work out the rest of the information we want. For each reading we need missing,
563 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
567 sub analyze_location {
568 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
569 my $c = $tradition->collation;
571 # Make a hash of all known node memberships, and make the subgraphs.
573 my $reading_roots = {};
575 $DB::single = 1 if $variant_row->{id} == 6;
576 # Note which witnesses positively belong to which group
577 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
578 my $rid = $rdghash->{'readingid'};
579 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
582 # Now, armed with that knowledge, make a subgraph for each reading
583 # and note the root(s) of each subgraph.
584 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
585 my $rid = $rdghash->{'readingid'};
588 my $part = $graph->copy;
589 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
591 $part->delete_vertices( @todelete );
592 _prune_subtree( $part, $lacunose );
593 $subgraph->{$rid} = $part;
594 # Record the remaining lacunose nodes as part of this group, if
595 # we are dealing with a non-genealogical reading.
596 unless( $variant_row->{'genealogical'} ) {
597 map { $contig->{$_} = $rid } $part->vertices;
599 # Get the reading roots.
600 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
603 # Now that we have all the node group memberships, calculate followed/
604 # non-followed/unknown values for each reading. Also figure out the
605 # reading's evident parent(s).
606 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
607 my $rid = $rdghash->{'readingid'};
609 my $part = $subgraph->{$rid};
611 # Start figuring things out.
612 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
613 $rdghash->{'independent_occurrence'} = \@roots;
614 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
615 # Find the parent readings, if any, of this reading.
617 foreach my $wit ( @roots ) {
618 # Look in the main stemma to find this witness's extant or known-reading
619 # immediate ancestor(s), and look up the reading that each ancestor olds.
620 my @check = $graph->predecessors( $wit );
623 foreach my $wparent( @check ) {
624 my $preading = $contig->{$wparent};
626 $rdgparents->{$preading} = 1;
628 push( @next, $graph->predecessors( $wparent ) );
634 foreach my $p ( keys %$rdgparents ) {
635 # Resolve the relationship of the parent to the reading, and
636 # save it in our hash.
637 my $pobj = $c->reading( $p );
639 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
641 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
643 $relation = { type => $rel->type };
644 if( $rel->has_annotation ) {
645 $relation->{'annotation'} = $rel->annotation;
649 $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
652 $rdghash->{'reading_parents'} = $rdgparents;
654 # Find the number of times this reading was altered, and the number of
655 # times we're not sure.
656 my( %nofollow, %unknownfollow );
657 foreach my $wit ( $part->vertices ) {
658 foreach my $wchild ( $graph->successors( $wit ) ) {
659 next if $part->has_vertex( $wchild );
660 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
661 # It definitely changed here.
662 $nofollow{$wchild} = 1;
663 } elsif( !($contig->{$wchild}) ) {
664 # The child is a hypothetical node not definitely in
665 # any group. Answer is unknown.
666 $unknownfollow{$wchild} = 1;
667 } # else it's a non-root node in a known group, and therefore
668 # is presumed to have its reading from its group, not this link.
671 $rdghash->{'not_followed'} = keys %nofollow;
672 $rdghash->{'follow_unknown'} = keys %unknownfollow;
674 # Now say whether this reading represents a conflict.
675 unless( $variant_row->{'genealogical'} ) {
676 $rdghash->{'conflict'} = @roots != 1;
682 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
684 ** NOTE ** This method should hopefully not be called - it is not guaranteed
685 to be correct. Serves as a backup for the real solver.
687 Runs an analysis of the given tradition, at the location given in $rank,
688 against the graph of the stemma specified in $stemma_id. The argument
689 @merge_relationship_types is an optional list of relationship types for
690 which readings so related should be treated as equivalent.
692 Returns a nested array data structure as follows:
694 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
696 where the group list is the array of arrays passed in for each element of @groups,
697 possibly with the addition of hypothetical readings.
703 my( $graph, @groups ) = @_;
705 foreach my $g ( @groups ) {
706 push( @answer, _solve_variant_location( $graph, $g ) );
711 sub _solve_variant_location {
712 my( $graph, $groups ) = @_;
719 # Mark each ms as in its own group, first.
720 foreach my $g ( @$groups ) {
721 my $gst = wit_stringify( $g );
722 map { $contig->{$_} = $gst } @$g;
725 # Now for each unmarked node in the graph, initialize an array
726 # for possible group memberships. We will use this later to
727 # resolve potential conflicts.
728 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
729 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
730 my $gst = wit_stringify( $g ); # This is the group name
731 # Copy the graph, and delete all non-members from the new graph.
732 my $part = $graph->copy;
734 $part->delete_vertices(
735 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
737 # Now look to see if our group is connected.
739 # We have to take directionality into account.
740 # How many root nodes do we have?
741 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
742 $part->predecessorless_vertices;
743 # Assuming that @$g > 1, find the first root node that has at
744 # least one successor belonging to our group. If this reading
745 # is genealogical, there should be only one, but we will check
746 # that implicitly later.
747 foreach my $root ( @roots ) {
748 # Prune the tree to get rid of extraneous hypotheticals.
749 $root = _prune_subtree_old( $part, $root, $contig );
751 # Save this root for our group.
752 push( @group_roots, $root );
753 # Get all the successor nodes of our root.
756 # Dispense with the trivial case of one reading.
758 @group_roots = ( $wit );
759 foreach my $v ( $part->vertices ) {
760 $part->delete_vertex( $v ) unless $v eq $wit;
764 if( @group_roots > 1 ) {
765 $conflict->{$gst} = 1;
768 # Paint the 'hypotheticals' with our group.
769 foreach my $wit ( $part->vertices ) {
770 if( ref( $contig->{$wit} ) ) {
771 push( @{$contig->{$wit}}, $gst );
772 } elsif( $contig->{$wit} ne $gst ) {
773 warn "How did we get here?";
778 # Save the relevant subgraph.
779 $subgraph->{$gst} = $part;
782 # For each of our hypothetical readings, flatten its 'contig' array if
783 # the array contains zero or one group. If we have any unflattened arrays,
784 # we may need to run the resolution process. If the reading is already known
785 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
788 foreach my $wit ( keys %$contig ) {
789 next unless ref( $contig->{$wit} );
790 if( @{$contig->{$wit}} > 1 ) {
791 if( $is_conflicted ) {
792 $contig->{$wit} = ''; # We aren't going to decide.
794 push( @resolve, $wit );
797 my $gst = pop @{$contig->{$wit}};
798 $contig->{$wit} = $gst || '';
803 my $still_contig = {};
804 foreach my $h ( @resolve ) {
805 # For each of the hypothetical readings with more than one possibility,
806 # try deleting it from each of its member subgraphs in turn, and see
807 # if that breaks the contiguous grouping.
808 # TODO This can still break in a corner case where group A can use
809 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
810 # Revisit this if necessary; it could get brute-force nasty.
811 foreach my $gst ( @{$contig->{$h}} ) {
812 my $gpart = $subgraph->{$gst}->copy();
813 # If we have come this far, there is only one root and everything
814 # is reachable from it.
815 my( $root ) = $gpart->predecessorless_vertices;
817 map { $reachable->{$_} = 1 } $gpart->vertices;
819 # Try deleting the hypothetical node.
820 $gpart->delete_vertex( $h );
822 # See if we still have a single root.
823 my @roots = $gpart->predecessorless_vertices;
824 warn "This shouldn't have happened" unless @roots;
826 # $h is needed by this group.
827 if( exists( $still_contig->{$h} ) ) {
829 $conflict->{$gst} = 1;
830 $still_contig->{$h} = '';
832 $still_contig->{$h} = $gst;
836 # $h is somewhere in the middle. See if everything
837 # else can still be reached from the root.
838 my %still_reachable = ( $root => 1 );
839 map { $still_reachable{$_} = 1 }
840 $gpart->all_successors( $root );
841 foreach my $v ( keys %$reachable ) {
843 if( !$still_reachable{$v}
844 && ( $contig->{$v} eq $gst
845 || ( exists $still_contig->{$v}
846 && $still_contig->{$v} eq $gst ) ) ) {
848 if( exists $still_contig->{$h} ) {
850 $conflict->{$gst} = 1;
851 $still_contig->{$h} = '';
853 $still_contig->{$h} = $gst;
856 } # else we don't need $h in this group.
858 } # endif $h eq $root
862 # Now we have some hypothetical vertices in $still_contig that are the
863 # "real" group memberships. Replace these in $contig.
864 foreach my $v ( keys %$contig ) {
865 next unless ref $contig->{$v};
866 $contig->{$v} = $still_contig->{$v};
870 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
871 my $variant_row = [ [], $is_genealogical ];
872 # Fill in the groupings from $contig.
873 foreach my $g ( @$groups ) {
874 my $gst = wit_stringify( $g );
875 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
876 push( @{$variant_row->[0]}, \@realgroup );
882 my( $group, $stemma, $graph ) = @_;
884 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
885 map { $lacunose->{$_} = 0 } @$group;
887 my $subgraph = $graph->copy;
888 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
890 # ...and find the root.
891 # Now prune and return the remaining vertices.
892 _prune_subtree( $subgraph, $lacunose );
893 return $subgraph->vertices;
897 my( $tree, $lacunose ) = @_;
899 # Delete lacunose witnesses that have no successors
900 my @orphan_hypotheticals;
903 die "Infinite loop on leaves" if $ctr > 100;
904 @orphan_hypotheticals = grep { $lacunose->{$_} }
905 $tree->successorless_vertices;
906 $tree->delete_vertices( @orphan_hypotheticals );
908 } while( @orphan_hypotheticals );
910 # Delete lacunose roots that have a single successor
914 die "Infinite loop on roots" if $ctr > 100;
915 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
916 $tree->predecessorless_vertices;
917 $tree->delete_vertices( @redundant_root );
919 } while( @redundant_root );
922 sub _prune_subtree_old {
923 my( $tree, $root, $contighash ) = @_;
924 # First, delete hypothetical leaves / orphans until there are none left.
925 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
926 $tree->successorless_vertices;
927 while( @orphan_hypotheticals ) {
928 $tree->delete_vertices( @orphan_hypotheticals );
929 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
930 $tree->successorless_vertices;
932 # Then delete a hypothetical root with only one successor, moving the
933 # root to the first child that has no other predecessors.
934 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
935 my @nextroot = $tree->successors( $root );
936 $tree->delete_vertex( $root );
937 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
939 # The tree has been modified in place, but we need to know the new root.
940 $root = undef unless $root && $tree->has_vertex( $root );
943 # Add the variant, subject to a.c. representation logic.
944 # This assumes that we will see the 'main' version before the a.c. version.
945 sub add_variant_wit {
946 my( $arr, $wit, $acstr ) = @_;
948 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
950 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
952 push( @$arr, $wit ) unless $skip;
955 sub _useful_variant {
956 my( $group_readings, $graph, $acstr ) = @_;
958 # TODO Decide what to do with AC witnesses
960 # Sort by group size and return
962 my( @readings, @groups ); # The sorted groups for our answer.
963 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
964 keys %$group_readings ) {
965 push( @readings, $rdg );
966 push( @groups, $group_readings->{$rdg} );
967 if( @{$group_readings->{$rdg}} > 1 ) {
970 my( $wit ) = @{$group_readings->{$rdg}};
971 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
972 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
975 if( $is_useful > 1 ) {
976 return( \@readings, \@groups );
982 =head2 wit_stringify( $groups )
984 Takes an array of witness groupings and produces a string like
985 ['A','B'] / ['C','D','E'] / ['F']
992 # If we were passed an array of witnesses instead of an array of
993 # groupings, then "group" the witnesses first.
994 unless( ref( $groups->[0] ) ) {
995 my $mkgrp = [ $groups ];
998 foreach my $g ( @$groups ) {
999 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1001 return join( ' / ', @gst );
1005 my( $lista, $listb ) = @_;
1008 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1009 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1010 my @set = grep { $union{$_} == 1 } keys %union;
1011 return map { $scalars{$_} } @set;
1018 This package is free software and is provided "as is" without express
1019 or implied warranty. You can redistribute it and/or modify it under
1020 the same terms as Perl itself.
1024 Tara L Andrews E<lt>aurum@cpan.orgE<gt>