1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
9 use JSON qw/ encode_json decode_json /;
12 use Text::Tradition::Stemma;
14 use vars qw/ @EXPORT_OK /;
15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
37 Text::Tradition is a library for representation and analysis of collated
38 texts, particularly medieval ones. The Collation is the central feature of
39 a Tradition, where the text, its sequence of readings, and its relationships
40 between readings are actually kept.
44 =head2 run_analysis( $tradition, %opts )
46 Runs the analysis described in analyze_variant_location on every location in the
47 collation of the given tradition, with the given options. These include:
51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
52 is 0 (i.e. the first).
54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
56 =item * merge_types - Specify a list of relationship types, where related readings
57 should be treated as identical for the purposes of analysis.
59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
68 my $datafile = 't/data/florilegium_tei_ps.xml';
69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
71 'file' => $datafile );
72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
75 my %expected_genealogical = (
106 my $data = run_analysis( $tradition );
107 my $c = $tradition->collation;
108 foreach my $row ( @{$data->{'variants'}} ) {
109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
113 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
114 is( $gen_bool, $expected_genealogical{$row->{'id'}},
115 "Got correct genealogical flag for row " . $row->{'id'} );
116 # Check that we have the right row with the right groups
117 my $rank = $row->{'id'};
118 foreach my $rdghash ( @{$row->{'readings'}} ) {
119 # Skip 'readings' that aren't really
120 next unless $c->reading( $rdghash->{'readingid'} );
122 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
123 "Got correct reading rank" );
124 # Check the witnesses
125 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
126 my @sgrp = sort @{$rdghash->{'group'}};
127 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
130 is( $data->{'variant_count'}, 58, "Got right total variant number" );
131 # TODO Make something meaningful of conflict count, maybe test other bits
138 my( $tradition, %opts ) = @_;
139 my $c = $tradition->collation;
141 my $stemma_id = $opts{'stemma_id'} || 0;
142 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
143 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
146 my $stemma = $tradition->stemma( $stemma_id );
148 # Figure out which witnesses we are working with - that is, the ones that
149 # appear both in the stemma and in the tradition. All others are 'lacunose'
151 my @lacunose = $stemma->hypotheticals;
152 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
153 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
155 # Find and mark 'common' ranks for exclusion, unless they were
156 # explicitly specified.
159 foreach my $rdg ( $c->common_readings ) {
160 $common_rank{$rdg->rank} = 1;
162 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
165 # Group the variants to send to the solver
170 foreach my $rank ( @ranks ) {
171 my $missing = [ @lacunose ];
172 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
173 # Filter out any empty rankgroups
174 # (e.g. from the later rank for a transposition)
175 next unless keys %$rankgroup;
176 if( $opts{'exclude_type1'} ) {
177 # Check to see whether this is a "useful" group.
178 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
179 $stemma->graph, $c->ac_label );
182 push( @use_ranks, $rank );
183 push( @groups, $rankgroup );
184 $lacunae{$rank} = $missing;
187 my $answer = solve_variants( $stemma, @groups );
189 # Do further analysis on the answer
190 my $conflict_count = 0;
191 my $aclabel = $c->ac_label;
192 foreach my $idx ( 0 .. $#use_ranks ) {
193 my $location = $answer->{'variants'}->[$idx];
194 # Add the rank back in
195 my $rank = $use_ranks[$idx];
196 $location->{'id'} = $rank;
197 # Note what our lacunae are
199 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
200 $location->{'missing'} = [ keys %lmiss ];
202 # Run the extra analysis we need.
203 analyze_location( $tradition, $stemma, $location, \%lmiss );
206 # Do the final post-analysis tidying up of the data.
207 foreach my $rdghash ( @{$location->{'readings'}} ) {
209 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
210 # Add the reading text back in, setting display value as needed
211 my $rdg = $c->reading( $rdghash->{'readingid'} );
213 $rdghash->{'text'} = $rdg->text .
214 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
216 # Remove lacunose witnesses from this reading's list now that the
219 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
220 $rdghash->{'group'} = \@realgroup;
221 # Note any layered witnesses that appear in this group
222 foreach( @realgroup ) {
223 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
224 push( @layerwits, $1 );
228 $location->{'layerwits'} = \@layerwits if @layerwits;
230 $answer->{'conflict_count'} = $conflict_count;
235 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
237 Groups the variants at the given $rank of the collation, treating any
238 relationships in @merge_relationship_types as equivalent. $lacunose should
239 be a reference to an array, to which the sigla of lacunose witnesses at this
240 rank will be appended; $transposed should be a reference to a hash, wherein
241 the identities of transposed readings and their relatives will be stored.
243 Returns a hash $group_readings where $rdg is attested by the witnesses listed
244 in $group_readings->{$rdg}.
248 # Return group_readings, groups, lacunose
250 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
251 my $c = $tradition->collation;
252 my $aclabel = $c->ac_label;
253 # Get the alignment table readings
254 my %readings_at_rank;
255 my %is_lacunose; # lookup table for $lacunose
256 map { $is_lacunose{$_} = 1 } @$lacunose;
259 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
260 my $rdg = $tablewit->{'tokens'}->[$rank-1];
261 my $wit = $tablewit->{'witness'};
262 # Exclude the witness if it is "lacunose" which if we got here
263 # means "not in the stemma".
264 next if $is_lacunose{$wit};
265 # Note if the witness is actually in a lacuna
266 if( $rdg && $rdg->{'t'}->is_lacuna ) {
267 _add_to_witlist( $wit, $lacunose, $aclabel );
268 # Otherwise the witness either has a positive reading...
270 # If the reading has been counted elsewhere as a transposition, ignore it.
271 if( $transposed->{$rdg->{'t'}->id} ) {
272 # TODO This doesn't cope with three-way transpositions
273 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
276 # Otherwise, record it...
277 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
278 # ...and grab any transpositions, and their relations.
279 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
280 foreach my $trdg ( @transp ) {
281 map { $moved_wits{$_} = 1 } $trdg->witnesses;
282 $transposed->{$trdg->id} = [ $rdg->{'t'}->witnesses ];
283 $readings_at_rank{$trdg->id} = $trdg;
285 # ...or it is empty, ergo a gap.
287 push( @check_for_gaps, $wit );
291 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
292 unless $moved_wits{$_} } @check_for_gaps;
293 # TODO check for, and break into a new row, any doubled-up witness readings
294 # after transposition...
295 # Group the readings, collapsing groups by relationship if needed
296 my %grouped_readings;
297 foreach my $rdg ( values %readings_at_rank ) {
298 # Skip readings that have been collapsed into others.
299 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
300 # Get the witness list, including from readings collapsed into this one.
301 my @wits = $rdg->witnesses;
303 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
304 foreach my $other ( $rdg->related_readings( $filter ) ) {
305 my @otherwits = $other->witnesses;
306 push( @wits, @otherwits );
307 $grouped_readings{$other->id} = 0;
310 # Filter the group to those witnesses in the stemma
312 foreach my $wit ( @wits ) {
313 next if $is_lacunose{$wit};
314 push( @use_wits, $wit );
316 $grouped_readings{$rdg->id} = \@use_wits;
318 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
319 # Get rid of our collapsed readings
320 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
321 keys %grouped_readings
325 return \%grouped_readings;
328 # Helper function to ensure that X and X a.c. never appear in the same list.
329 sub _add_to_witlist {
330 my( $wit, $list, $acstr ) = @_;
333 map { $inlist{$_} = $idx++ } @$list;
334 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
336 unless( exists $inlist{$acwit} ) {
337 push( @$list, $acwit.$acstr );
340 if( exists( $inlist{$wit.$acstr} ) ) {
341 # Replace the a.c. version with the main witness
342 my $i = $inlist{$wit.$acstr};
345 push( @$list, $wit );
350 =head2 solve_variants( $graph, @groups )
352 Sends the set of groups to the external graph solver service and returns
353 a cleaned-up answer, adding the rank IDs back where they belong.
355 The JSON has the form
356 { "graph": [ stemmagraph DOT string without newlines ],
357 "groupings": [ array of arrays of groups, one per rank ] }
359 The answer has the form
360 { "variants" => [ array of variant location structures ],
361 "variant_count" => total,
362 "conflict_count" => number of conflicts detected,
363 "genealogical_count" => number of solutions found }
368 my( $stemma, @groups ) = @_;
369 my $aclabel = $stemma->collation->ac_label;
371 # Filter the groups down to distinct groups, and work out what graph
372 # should be used in the calculation of each group. We want to send each
373 # distinct problem to the solver only once.
374 # We need a whole bunch of lookup tables for this.
375 my $index_groupkeys = {}; # Save the order of readings
376 my $group_indices = {}; # Save the indices that have a given grouping
377 my $graph_problems = {}; # Save the groupings for the given graph
379 foreach my $idx ( 0..$#groups ) {
380 my $ghash = $groups[$idx];
382 # Sort the groupings from big to little, and scan for a.c. witnesses
383 # that would need an extended graph.
384 my @acwits; # note which AC witnesses crop up at this rank
385 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
387 foreach my $rdg ( @idxkeys ) {
388 my @sg = sort @{$ghash->{$rdg}};
389 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
390 push( @grouping, \@sg );
392 # Save the reading order
393 $index_groupkeys->{$idx} = \@idxkeys;
395 # Now associate the distinct group with this index
396 my $gstr = wit_stringify( \@grouping );
397 push( @{$group_indices->{$gstr}}, $idx );
399 # Finally, add the group to the list to be calculated for this graph.
400 map { s/\Q$aclabel\E$// } @acwits;
401 my $graph = $stemma->extend_graph( \@acwits );
402 unless( exists $graph_problems->{"$graph"} ) {
403 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
405 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
408 ## For each distinct graph, send its groups to the solver.
409 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
410 my $ua = LWP::UserAgent->new();
411 ## Witness map is a HACK to get around limitations in node names from IDP
412 my $witness_map = {};
413 ## Variables to store answers as they come back
414 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
415 my $genealogical = 0;
416 foreach my $graphkey ( keys %$graph_problems ) {
417 my $graph = $graph_problems->{$graphkey}->{'object'};
418 my $groupings = $graph_problems->{$graphkey}->{'groups'};
419 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
420 $groupings, $witness_map ) );
421 # Send it off and get the result
422 #print STDERR "Sending request: $json\n";
423 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
424 'Content' => $json );
427 if( $resp->is_success ) {
428 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
431 # Fall back to the old method.
432 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
433 . "; falling back to perl method";
434 $answer = perl_solver( $graph, @$groupings );
436 ## The answer is the evaluated groupings, plus a boolean for whether
437 ## they were genealogical. Reconstruct our original groups.
438 foreach my $gidx ( 0 .. $#{$groupings} ) {
439 my( $calc_groups, $result ) = @{$answer->[$gidx]};
442 # Prune the calculated groups, in case the IDP solver failed to.
445 foreach my $cg ( @$calc_groups ) {
446 # This is a little wasteful but the path of least
447 # resistance. Send both the stemma, which knows what
448 # its hypotheticals are, and the actual graph used.
449 my @pg = _prune_group( $cg, $stemma, $graph );
450 push( @pruned_groups, \@pg );
452 $calc_groups = \@pruned_groups;
455 # Retrieve the key for the original group that went to the solver
456 my $input_group = wit_stringify( $groupings->[$gidx] );
457 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
458 my @readings = @{$index_groupkeys->{$oidx}};
460 'genealogical' => $result,
463 foreach my $ridx ( 0 .. $#readings ) {
464 push( @{$vstruct->{'readings'}},
465 { 'readingid' => $readings[$ridx],
466 'group' => $calc_groups->[$ridx] } );
468 $variants->[$oidx] = $vstruct;
473 return { 'variants' => $variants,
474 'variant_count' => scalar @$variants,
475 'genealogical_count' => $genealogical };
478 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
480 sub _safe_wit_strings {
481 my( $graph, $c, $groupings, $witness_map ) = @_;
482 # Parse the graph we were given into a stemma.
483 my $safegraph = Graph->new();
484 # Convert the graph to a safe representation and store the conversion.
485 foreach my $n ( $graph->vertices ) {
486 my $sn = _safe_witstr( $n );
487 if( exists $witness_map->{$sn} ) {
488 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
489 if $witness_map->{$sn} ne $n;
491 $witness_map->{$sn} = $n;
493 $safegraph->add_vertex( $sn );
494 $safegraph->set_vertex_attributes( $sn,
495 $graph->get_vertex_attributes( $n ) );
497 foreach my $e ( $graph->edges ) {
498 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
499 $safegraph->add_edge( @safe_e );
501 my $safe_stemma = Text::Tradition::Stemma->new(
502 'collation' => $c, 'graph' => $safegraph );
504 # Now convert the witness groupings to a safe representation.
505 my $safe_groupings = [];
506 foreach my $grouping ( @$groupings ) {
507 my $safe_grouping = [];
508 foreach my $group ( @$grouping ) {
510 foreach my $n ( @$group ) {
511 my $sn = _safe_witstr( $n );
512 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
513 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
514 $witness_map->{$sn} = $n;
515 push( @$safe_group, $sn );
517 push( @$safe_grouping, $safe_group );
519 push( @$safe_groupings, $safe_grouping );
522 # Return it all in the struct we expect. We have stored the reductions
523 # in the $witness_map that we were passed.
524 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
525 'groupings' => $safe_groupings };
530 $witstr =~ s/\s+/_/g;
531 $witstr =~ s/[^\w\d-]//g;
535 sub _desanitize_names {
536 my( $jsonstruct, $witness_map ) = @_;
538 foreach my $grouping ( @$jsonstruct ) {
539 my $real_grouping = [];
540 foreach my $element ( @$grouping ) {
541 if( ref( $element ) eq 'ARRAY' ) {
543 my $real_groupset = [];
544 foreach my $group ( @$element ) {
546 foreach my $n ( @$group ) {
547 my $rn = $witness_map->{$n};
548 push( @$real_group, $rn );
550 push( @$real_groupset, $real_group );
552 push( @$real_grouping, $real_groupset );
554 # It is the boolean, not actually a group.
555 push( @$real_grouping, $element );
558 push( @$result, $real_grouping );
565 =head2 analyze_location ( $tradition, $graph, $location_hash )
567 Given the tradition, its stemma graph, and the solution from the graph solver,
568 work out the rest of the information we want. For each reading we need missing,
569 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
573 sub analyze_location {
574 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
575 my $c = $tradition->collation;
577 # Make a hash of all known node memberships, and make the subgraphs.
579 my $reading_roots = {};
581 my $acstr = $c->ac_label;
583 $DB::single = 1 if $variant_row->{id} == 87;
584 # Note which witnesses positively belong to which group
585 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
586 my $rid = $rdghash->{'readingid'};
587 foreach my $wit ( @{$rdghash->{'group'}} ) {
588 $contig->{$wit} = $rid;
589 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
595 # Get the actual graph we should work with
596 my $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
598 # Now, armed with that knowledge, make a subgraph for each reading
599 # and note the root(s) of each subgraph.
600 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
601 my $rid = $rdghash->{'readingid'};
604 my $part = $graph->copy;
605 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
607 $part->delete_vertices( @todelete );
608 _prune_subtree( $part, $lacunose );
609 $subgraph->{$rid} = $part;
610 # Record the remaining lacunose nodes as part of this group, if
611 # we are dealing with a non-genealogical reading.
612 unless( $variant_row->{'genealogical'} ) {
613 map { $contig->{$_} = $rid } $part->vertices;
615 # Get the reading roots.
616 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
619 # Now that we have all the node group memberships, calculate followed/
620 # non-followed/unknown values for each reading. Also figure out the
621 # reading's evident parent(s).
622 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
623 my $rid = $rdghash->{'readingid'};
625 my $part = $subgraph->{$rid};
627 # Start figuring things out.
628 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
629 $rdghash->{'independent_occurrence'} = \@roots;
630 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
631 # Find the parent readings, if any, of this reading.
633 foreach my $wit ( @roots ) {
634 # Look in the main stemma to find this witness's extant or known-reading
635 # immediate ancestor(s), and look up the reading that each ancestor olds.
636 my @check = $graph->predecessors( $wit );
639 foreach my $wparent( @check ) {
640 my $preading = $contig->{$wparent};
642 $rdgparents->{$preading} = 1;
644 push( @next, $graph->predecessors( $wparent ) );
650 foreach my $p ( keys %$rdgparents ) {
651 # Resolve the relationship of the parent to the reading, and
652 # save it in our hash.
653 my $pobj = $c->reading( $p );
655 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
657 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
659 $relation = { type => $rel->type };
660 if( $rel->has_annotation ) {
661 $relation->{'annotation'} = $rel->annotation;
665 $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
668 $rdghash->{'reading_parents'} = $rdgparents;
670 # Find the number of times this reading was altered, and the number of
671 # times we're not sure.
672 my( %nofollow, %unknownfollow );
673 foreach my $wit ( $part->vertices ) {
674 foreach my $wchild ( $graph->successors( $wit ) ) {
675 next if $part->has_vertex( $wchild );
676 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
677 # It definitely changed here.
678 $nofollow{$wchild} = 1;
679 } elsif( !($contig->{$wchild}) ) {
680 # The child is a hypothetical node not definitely in
681 # any group. Answer is unknown.
682 $unknownfollow{$wchild} = 1;
683 } # else it's a non-root node in a known group, and therefore
684 # is presumed to have its reading from its group, not this link.
687 $rdghash->{'not_followed'} = keys %nofollow;
688 $rdghash->{'follow_unknown'} = keys %unknownfollow;
690 # Now say whether this reading represents a conflict.
691 unless( $variant_row->{'genealogical'} ) {
692 $rdghash->{'conflict'} = @roots != 1;
698 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
700 ** NOTE ** This method should hopefully not be called - it is not guaranteed
701 to be correct. Serves as a backup for the real solver.
703 Runs an analysis of the given tradition, at the location given in $rank,
704 against the graph of the stemma specified in $stemma_id. The argument
705 @merge_relationship_types is an optional list of relationship types for
706 which readings so related should be treated as equivalent.
708 Returns a nested array data structure as follows:
710 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
712 where the group list is the array of arrays passed in for each element of @groups,
713 possibly with the addition of hypothetical readings.
719 my( $graph, @groups ) = @_;
721 foreach my $g ( @groups ) {
722 push( @answer, _solve_variant_location( $graph, $g ) );
727 sub _solve_variant_location {
728 my( $graph, $groups ) = @_;
735 # Mark each ms as in its own group, first.
736 foreach my $g ( @$groups ) {
737 my $gst = wit_stringify( $g );
738 map { $contig->{$_} = $gst } @$g;
741 # Now for each unmarked node in the graph, initialize an array
742 # for possible group memberships. We will use this later to
743 # resolve potential conflicts.
744 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
745 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
746 my $gst = wit_stringify( $g ); # This is the group name
747 # Copy the graph, and delete all non-members from the new graph.
748 my $part = $graph->copy;
750 $part->delete_vertices(
751 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
753 # Now look to see if our group is connected.
755 # We have to take directionality into account.
756 # How many root nodes do we have?
757 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
758 $part->predecessorless_vertices;
759 # Assuming that @$g > 1, find the first root node that has at
760 # least one successor belonging to our group. If this reading
761 # is genealogical, there should be only one, but we will check
762 # that implicitly later.
763 foreach my $root ( @roots ) {
764 # Prune the tree to get rid of extraneous hypotheticals.
765 $root = _prune_subtree_old( $part, $root, $contig );
767 # Save this root for our group.
768 push( @group_roots, $root );
769 # Get all the successor nodes of our root.
772 # Dispense with the trivial case of one reading.
774 @group_roots = ( $wit );
775 foreach my $v ( $part->vertices ) {
776 $part->delete_vertex( $v ) unless $v eq $wit;
780 if( @group_roots > 1 ) {
781 $conflict->{$gst} = 1;
784 # Paint the 'hypotheticals' with our group.
785 foreach my $wit ( $part->vertices ) {
786 if( ref( $contig->{$wit} ) ) {
787 push( @{$contig->{$wit}}, $gst );
788 } elsif( $contig->{$wit} ne $gst ) {
789 warn "How did we get here?";
794 # Save the relevant subgraph.
795 $subgraph->{$gst} = $part;
798 # For each of our hypothetical readings, flatten its 'contig' array if
799 # the array contains zero or one group. If we have any unflattened arrays,
800 # we may need to run the resolution process. If the reading is already known
801 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
804 foreach my $wit ( keys %$contig ) {
805 next unless ref( $contig->{$wit} );
806 if( @{$contig->{$wit}} > 1 ) {
807 if( $is_conflicted ) {
808 $contig->{$wit} = ''; # We aren't going to decide.
810 push( @resolve, $wit );
813 my $gst = pop @{$contig->{$wit}};
814 $contig->{$wit} = $gst || '';
819 my $still_contig = {};
820 foreach my $h ( @resolve ) {
821 # For each of the hypothetical readings with more than one possibility,
822 # try deleting it from each of its member subgraphs in turn, and see
823 # if that breaks the contiguous grouping.
824 # TODO This can still break in a corner case where group A can use
825 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
826 # Revisit this if necessary; it could get brute-force nasty.
827 foreach my $gst ( @{$contig->{$h}} ) {
828 my $gpart = $subgraph->{$gst}->copy();
829 # If we have come this far, there is only one root and everything
830 # is reachable from it.
831 my( $root ) = $gpart->predecessorless_vertices;
833 map { $reachable->{$_} = 1 } $gpart->vertices;
835 # Try deleting the hypothetical node.
836 $gpart->delete_vertex( $h );
838 # See if we still have a single root.
839 my @roots = $gpart->predecessorless_vertices;
840 warn "This shouldn't have happened" unless @roots;
842 # $h is needed by this group.
843 if( exists( $still_contig->{$h} ) ) {
845 $conflict->{$gst} = 1;
846 $still_contig->{$h} = '';
848 $still_contig->{$h} = $gst;
852 # $h is somewhere in the middle. See if everything
853 # else can still be reached from the root.
854 my %still_reachable = ( $root => 1 );
855 map { $still_reachable{$_} = 1 }
856 $gpart->all_successors( $root );
857 foreach my $v ( keys %$reachable ) {
859 if( !$still_reachable{$v}
860 && ( $contig->{$v} eq $gst
861 || ( exists $still_contig->{$v}
862 && $still_contig->{$v} eq $gst ) ) ) {
864 if( exists $still_contig->{$h} ) {
866 $conflict->{$gst} = 1;
867 $still_contig->{$h} = '';
869 $still_contig->{$h} = $gst;
872 } # else we don't need $h in this group.
874 } # endif $h eq $root
878 # Now we have some hypothetical vertices in $still_contig that are the
879 # "real" group memberships. Replace these in $contig.
880 foreach my $v ( keys %$contig ) {
881 next unless ref $contig->{$v};
882 $contig->{$v} = $still_contig->{$v};
886 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
887 my $variant_row = [ [], $is_genealogical ];
888 # Fill in the groupings from $contig.
889 foreach my $g ( @$groups ) {
890 my $gst = wit_stringify( $g );
891 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
892 push( @{$variant_row->[0]}, \@realgroup );
898 my( $group, $stemma, $graph ) = @_;
900 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
901 map { $lacunose->{$_} = 0 } @$group;
903 my $subgraph = $graph->copy;
904 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
906 # ...and find the root.
907 # Now prune and return the remaining vertices.
908 _prune_subtree( $subgraph, $lacunose );
909 return $subgraph->vertices;
913 my( $tree, $lacunose ) = @_;
915 # Delete lacunose witnesses that have no successors
916 my @orphan_hypotheticals;
919 die "Infinite loop on leaves" if $ctr > 100;
920 @orphan_hypotheticals = grep { $lacunose->{$_} }
921 $tree->successorless_vertices;
922 $tree->delete_vertices( @orphan_hypotheticals );
924 } while( @orphan_hypotheticals );
926 # Delete lacunose roots that have a single successor
930 die "Infinite loop on roots" if $ctr > 100;
931 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
932 $tree->predecessorless_vertices;
933 $tree->delete_vertices( @redundant_root );
935 } while( @redundant_root );
938 sub _prune_subtree_old {
939 my( $tree, $root, $contighash ) = @_;
940 # First, delete hypothetical leaves / orphans until there are none left.
941 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
942 $tree->successorless_vertices;
943 while( @orphan_hypotheticals ) {
944 $tree->delete_vertices( @orphan_hypotheticals );
945 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
946 $tree->successorless_vertices;
948 # Then delete a hypothetical root with only one successor, moving the
949 # root to the first child that has no other predecessors.
950 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
951 my @nextroot = $tree->successors( $root );
952 $tree->delete_vertex( $root );
953 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
955 # The tree has been modified in place, but we need to know the new root.
956 $root = undef unless $root && $tree->has_vertex( $root );
959 # Add the variant, subject to a.c. representation logic.
960 # This assumes that we will see the 'main' version before the a.c. version.
961 sub add_variant_wit {
962 my( $arr, $wit, $acstr ) = @_;
964 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
966 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
968 push( @$arr, $wit ) unless $skip;
971 sub _useful_variant {
972 my( $group_readings, $graph, $acstr ) = @_;
974 # TODO Decide what to do with AC witnesses
976 # Sort by group size and return
978 my( @readings, @groups ); # The sorted groups for our answer.
979 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
980 keys %$group_readings ) {
981 push( @readings, $rdg );
982 push( @groups, $group_readings->{$rdg} );
983 if( @{$group_readings->{$rdg}} > 1 ) {
986 my( $wit ) = @{$group_readings->{$rdg}};
987 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
988 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
991 if( $is_useful > 1 ) {
992 return( \@readings, \@groups );
998 =head2 wit_stringify( $groups )
1000 Takes an array of witness groupings and produces a string like
1001 ['A','B'] / ['C','D','E'] / ['F']
1008 # If we were passed an array of witnesses instead of an array of
1009 # groupings, then "group" the witnesses first.
1010 unless( ref( $groups->[0] ) ) {
1011 my $mkgrp = [ $groups ];
1014 foreach my $g ( @$groups ) {
1015 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1017 return join( ' / ', @gst );
1021 my( $lista, $listb ) = @_;
1024 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1025 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1026 my @set = grep { $union{$_} == 1 } keys %union;
1027 return map { $scalars{$_} } @set;
1034 This package is free software and is provided "as is" without express
1035 or implied warranty. You can redistribute it and/or modify it under
1036 the same terms as Perl itself.
1040 Tara L Andrews E<lt>aurum@cpan.orgE<gt>