lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =back
  60
  61 =begin testing
  62
  63 use Text::Tradition;
  64 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  65
  66 my $datafile = 't/data/florilegium_tei_ps.xml';
  67 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  68                                       'name' => 'test0',
  69                                       'file' => $datafile );
  70 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  71 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  72
  73 my %expected_genealogical = (
  74         1 => 0,
  75         2 => 1,
  76         3 =>  0,
  77         5 =>  0,
  78         7 =>  0,
  79         8 =>  0,
  80         10 => 0,
  81         13 => 1,
  82         33 => 0,
  83         34 => 0,
  84         37 => 0,
  85         60 => 0,
  86         81 => 1,
  87         84 => 0,
  88         87 => 0,
  89         101 => 0,
  90         102 => 0,
  91         122 => 1,
  92         157 => 0,
  93         166 => 1,
  94         169 => 1,
  95         200 => 0,
  96         216 => 1,
  97         217 => 1,
  98         219 => 1,
  99         241 => 1,
 100         242 => 1,
 101         243 => 1,
 102 );
 103
 104 my $data = run_analysis( $tradition );
 105 foreach my $row ( @{$data->{'variants'}} ) {
 106         # Account for rows that used to be "not useful"
 107         unless( exists $expected_genealogical{$row->{'id'}} ) {
 108                 $expected_genealogical{$row->{'id'}} = 1;
 109         }
 110         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 111                 "Got correct genealogical flag for row " . $row->{'id'} );
 112 }
 113 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 114 # TODO Make something meaningful of conflict count, maybe test other bits
 115
 116 =end testing
 117
 118 =cut
 119
 120 sub run_analysis {
 121         my( $tradition, %opts ) = @_;
 122         my $c = $tradition->collation;
 123
 124         my $stemma_id = $opts{'stemma_id'} || 0;
 125         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 126         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 127
 128         # Get the stemma
 129         my $stemma = $tradition->stemma( $stemma_id );
 130
 131         # Figure out which witnesses we are working with
 132         my @lacunose = $stemma->hypotheticals;
 133         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 134         map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
 135                 $tradition->witnesses;
 136         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 137
 138         # Find and mark 'common' ranks for exclusion, unless they were
 139         # explicitly specified.
 140         unless( @ranks ) {
 141                 my %common_rank;
 142                 foreach my $rdg ( $c->common_readings ) {
 143                         $common_rank{$rdg->rank} = 1;
 144                 }
 145                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 146         }
 147
 148         # Group the variants to send to the solver
 149         my @groups;
 150         my %lacunae;
 151         foreach my $rank ( @ranks ) {
 152                 my $missing = [ @lacunose ];
 153                 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
 154                 $lacunae{$rank} = $missing;
 155         }
 156         $DB::single = 1;
 157         # Parse the answer
 158         my $answer = solve_variants( $stemma, @groups );
 159
 160         # Do further analysis on the answer
 161         my $conflict_count = 0;
 162         foreach my $idx ( 0 .. $#ranks ) {
 163                 my $location = $answer->{'variants'}->[$idx];
 164                 # Add the rank back in
 165                 $location->{'id'} = $ranks[$idx];
 166                 # Add the lacunae back in
 167                 $location->{'missing'} = $lacunae{$ranks[$idx]};
 168                 # Run the extra analysis we need.
 169                 analyze_location( $tradition, $stemma->graph, $location );
 170                 # Add the reading text back in
 171                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 172                         $conflict_count++
 173                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 174                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 175                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 176                 }
 177         }
 178         $answer->{'conflict_count'} = $conflict_count;
 179
 180         return $answer;
 181 }
 182
 183 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 184
 185 Groups the variants at the given $rank of the collation, treating any
 186 relationships in @merge_relationship_types as equivalent.  $lacunose should
 187 be a reference to an array, to which the sigla of lacunose witnesses at this
 188 rank will be appended.
 189
 190 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
 191 by the witnesses listed in $groups->[$n].
 192
 193 =cut
 194
 195 # Return group_readings, groups, lacunose
 196 sub group_variants {
 197         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 198         my $c = $tradition->collation;
 199         my $aclabel =  $c->ac_label;
 200         # Get the alignment table readings
 201         my %readings_at_rank;
 202         my @gap_wits;
 203         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 204                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 205                 my $wit = $tablewit->{'witness'};
 206                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 207                         _add_to_witlist( $wit, $lacunose, $aclabel );
 208                 } elsif( $rdg ) {
 209                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 210                 } else {
 211                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 212                 }
 213         }
 214
 215         # Group the readings, collapsing groups by relationship if needed
 216         my %grouped_readings;
 217         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
 218                                                    values %readings_at_rank ) {
 219                 # Skip readings that have been collapsed into others.
 220                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 221                 my @wits = $rdg->witnesses;
 222                 if( $collapse ) {
 223                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 224                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 225                                 my @otherwits = $other->witnesses;
 226                                 push( @wits, @otherwits );
 227                                 $grouped_readings{$other->id} = 0;
 228                         }
 229                 }
 230                 $grouped_readings{$rdg->id} = \@wits;
 231         }
 232         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 233         # Get rid of our collapsed readings
 234         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 235                 keys %grouped_readings
 236                 if $collapse;
 237
 238         return \%grouped_readings;
 239 }
 240
 241 =head2 solve_variants( $graph, @groups )
 242
 243 Sends the set of groups to the external graph solver service and returns
 244 a cleaned-up answer, adding the rank IDs back where they belong.
 245
 246 The JSON has the form
 247   { "graph": [ stemmagraph DOT string without newlines ],
 248     "groupings": [ array of arrays of groups, one per rank ] }
 249
 250 The answer has the form
 251   { "variants" => [ array of variant location structures ],
 252     "variant_count" => total,
 253     "conflict_count" => number of conflicts detected,
 254     "genealogical_count" => number of solutions found }
 255
 256 =cut
 257
 258 sub solve_variants {
 259         my( $stemma, @groups ) = @_;
 260
 261         # Make the json with stemma + groups
 262         my $groupings = [];
 263         foreach my $ghash ( @groups ) {
 264                 my @grouping;
 265                 foreach my $k ( sort keys %$ghash ) {
 266                         push( @grouping, $ghash->{$k} );
 267                 }
 268                 push( @$groupings, \@grouping );
 269         }
 270         ## Witness map is a HACK to get around limitations in node names from IDP
 271         my $witness_map = {};
 272         my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
 273
 274         # Send it off and get the result
 275         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 276         my $ua = LWP::UserAgent->new();
 277         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 278                                                   'Content' => $json );
 279
 280         my $answer;
 281         if( $resp->is_success ) {
 282                 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 283         } else {
 284                 # Fall back to the old method.
 285                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 286                         . "; falling back to perl method";
 287                 $answer = perl_solver( $stemma, @$groupings );
 288         }
 289
 290         # Fold the result back into what we know about the groups.
 291         my $variants = [];
 292         my $genealogical = 0;
 293         foreach my $idx ( 0 .. $#groups ) {
 294                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 295                 $genealogical++ if $result;
 296                 my $input_group = $groups[$idx];
 297                 foreach my $k ( sort keys %$input_group ) {
 298                         my $cg = shift @$calc_groups;
 299                         $input_group->{$k} = $cg;
 300                 }
 301                 my $vstruct = {
 302                         'genealogical' => $result,
 303                         'readings' => [],
 304                 };
 305                 foreach my $k ( keys %$input_group ) {
 306                         push( @{$vstruct->{'readings'}},
 307                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 308                 }
 309                 push( @$variants, $vstruct );
 310         }
 311
 312         return { 'variants' => $variants,
 313                          'variant_count' => scalar @$variants,
 314                          'genealogical_count' => $genealogical };
 315 }
 316
 317 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 318
 319 sub _safe_wit_strings {
 320         my( $stemma, $groupings, $witness_map ) = @_;
 321         my $safegraph = Graph->new();
 322         # Convert the graph to a safe representation and store the conversion.
 323         foreach my $n ( $stemma->graph->vertices ) {
 324                 my $sn = _safe_witstr( $n );
 325                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 326                         if exists $witness_map->{$sn};
 327                 $witness_map->{$sn} = $n;
 328                 $safegraph->add_vertex( $sn );
 329                 $safegraph->set_vertex_attributes( $sn,
 330                         $stemma->graph->get_vertex_attributes( $n ) );
 331         }
 332         foreach my $e ( $stemma->graph->edges ) {
 333                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 334                 $safegraph->add_edge( @safe_e );
 335         }
 336         my $safe_stemma = Text::Tradition::Stemma->new(
 337                 'collation' => $stemma->collation, 'graph' => $safegraph );
 338
 339         # Now convert the witness groupings to a safe representation.
 340         my $safe_groupings = [];
 341         foreach my $grouping ( @$groupings ) {
 342                 my $safe_grouping = [];
 343                 foreach my $group ( @$grouping ) {
 344                         my $safe_group = [];
 345                         foreach my $n ( @$group ) {
 346                                 my $sn = _safe_witstr( $n );
 347                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 348                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 349                                 $witness_map->{$sn} = $n;
 350                                 push( @$safe_group, $sn );
 351                         }
 352                         push( @$safe_grouping, $safe_group );
 353                 }
 354                 push( @$safe_groupings, $safe_grouping );
 355         }
 356
 357         # Return it all in the struct we expect.  We have stored the reductions
 358         # in the $witness_map that we were passed.
 359         return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
 360 }
 361
 362 sub _safe_witstr {
 363         my $witstr = shift;
 364         $witstr =~ s/\s+/_/g;
 365         $witstr =~ s/[^\w\d-]//g;
 366         return $witstr;
 367 }
 368
 369 sub _desanitize_names {
 370         my( $jsonstruct, $witness_map ) = @_;
 371         my $result = [];
 372         foreach my $grouping ( @$jsonstruct ) {
 373                 my $real_grouping = [];
 374                 foreach my $element ( @$grouping ) {
 375                         if( ref( $element ) eq 'ARRAY' ) {
 376                                 # it's the groupset.
 377                                 my $real_groupset = [];
 378                                 foreach my $group ( @$element ) {
 379                                         my $real_group = [];
 380                                         foreach my $n ( @$group ) {
 381                                                 my $rn = $witness_map->{$n};
 382                                                 push( @$real_group, $rn );
 383                                         }
 384                                         push( @$real_groupset, $real_group );
 385                                 }
 386                                 push( @$real_grouping, $real_groupset );
 387                         } else {
 388                                 # It is the boolean, not actually a group.
 389                                 push( @$real_grouping, $element );
 390                         }
 391                 }
 392                 push( @$result, $real_grouping );
 393         }
 394         return $result;
 395 }
 396
 397 ### END HACKERY ###
 398
 399 =head2 analyze_location ( $tradition, $graph, $location_hash )
 400
 401 Given the tradition, its stemma graph, and the solution from the graph solver,
 402 work out the rest of the information we want.  For each reading we need missing,
 403 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 404
 405 =cut
 406
 407 sub analyze_location {
 408         my ( $tradition, $graph, $variant_row ) = @_;
 409
 410         # Make a hash of all known node memberships, and make the subgraphs.
 411         my $contig = {};
 412         my $reading_roots = {};
 413         my $subgraph = {};
 414     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 415         my $rid = $rdghash->{'readingid'};
 416                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 417
 418         # Make the subgraph.
 419         my $part = $graph->copy;
 420                 my %these_vertices;
 421                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 422                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 423                 $subgraph->{$rid} = $part;
 424                 # Get the reading roots.
 425                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 426         }
 427
 428         # Now that we have all the node group memberships, calculate followed/
 429     # non-followed/unknown values for each reading.  Also figure out the
 430     # reading's evident parent(s).
 431     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 432         # Group string key - TODO do we need this?
 433         my $gst = wit_stringify( $rdghash->{'group'} );
 434         my $rid = $rdghash->{'readingid'};
 435         # Get the subgraph
 436         my $part = $subgraph->{$rid};
 437
 438         # Start figuring things out.
 439         my @roots = $part->predecessorless_vertices;
 440         $rdghash->{'independent_occurrence'} = scalar @roots;
 441         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 442         # Find the parent readings, if any, of this reading.
 443         my %rdgparents;
 444         foreach my $wit ( @roots ) {
 445                 # Look in the main stemma to find this witness's extant or known-reading
 446                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 447                         my @check = $graph->predecessors( $wit );
 448                         while( @check ) {
 449                                 my @next;
 450                                 foreach my $wparent( @check ) {
 451                                         my $preading = $contig->{$wparent};
 452                                         if( $preading ) {
 453                                                 $rdgparents{$preading} = 1;
 454                                         } else {
 455                                                 push( @next, $graph->predecessors( $wparent ) );
 456                                         }
 457                                 }
 458                                 @check = @next;
 459                         }
 460                 }
 461                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 462
 463                 # Find the number of times this reading was altered, and the number of
 464                 # times we're not sure.
 465                 my( %nofollow, %unknownfollow );
 466                 foreach my $wit ( $part->vertices ) {
 467                         foreach my $wchild ( $graph->successors( $wit ) ) {
 468                                 next if $part->has_vertex( $wchild );
 469                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 470                                         # It definitely changed here.
 471                                         $nofollow{$wchild} = 1;
 472                                 } elsif( !($contig->{$wchild}) ) {
 473                                         # The child is a hypothetical node not definitely in
 474                                         # any group. Answer is unknown.
 475                                         $unknownfollow{$wchild} = 1;
 476                                 } # else it's a non-root node in a known group, and therefore
 477                                   # is presumed to have its reading from its group, not this link.
 478                         }
 479                 }
 480                 $rdghash->{'not_followed'} = keys %nofollow;
 481                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 482
 483                 # Now say whether this reading represents a conflict.
 484                 unless( $variant_row->{'genealogical'} ) {
 485                         $rdghash->{'conflict'} = @roots != 1;
 486                 }
 487     }
 488 }
 489
 490
 491 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 492
 493 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 494 to be correct.  Serves as a backup for the real solver.
 495
 496 Runs an analysis of the given tradition, at the location given in $rank,
 497 against the graph of the stemma specified in $stemma_id.  The argument
 498 @merge_relationship_types is an optional list of relationship types for
 499 which readings so related should be treated as equivalent.
 500
 501 Returns a nested array data structure as follows:
 502
 503  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 504
 505 where the group list is the array of arrays passed in for each element of @groups,
 506 possibly with the addition of hypothetical readings.
 507
 508
 509 =cut
 510
 511 sub perl_solver {
 512         my( $stemma, @groups ) = @_;
 513         my $graph = $stemma->graph;
 514         my @answer;
 515         foreach my $g ( @groups ) {
 516                 push( @answer, _solve_variant_location( $graph, $g ) );
 517         }
 518         return \@answer;
 519 }
 520
 521 sub _solve_variant_location {
 522         my( $graph, $groups ) = @_;
 523         # Now do the work.
 524     my $contig = {};
 525     my $subgraph = {};
 526     my $is_conflicted;
 527     my $conflict = {};
 528
 529     # Mark each ms as in its own group, first.
 530     foreach my $g ( @$groups ) {
 531         my $gst = wit_stringify( $g );
 532         map { $contig->{$_} = $gst } @$g;
 533     }
 534
 535     # Now for each unmarked node in the graph, initialize an array
 536     # for possible group memberships.  We will use this later to
 537     # resolve potential conflicts.
 538     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 539     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 540         my $gst = wit_stringify( $g );  # This is the group name
 541         # Copy the graph, and delete all non-members from the new graph.
 542         my $part = $graph->copy;
 543         my @group_roots;
 544         $part->delete_vertices(
 545             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 546
 547         # Now look to see if our group is connected.
 548                 if( @$g > 1 ) {
 549                         # We have to take directionality into account.
 550                         # How many root nodes do we have?
 551                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 552                                 $part->predecessorless_vertices;
 553                         # Assuming that @$g > 1, find the first root node that has at
 554                         # least one successor belonging to our group. If this reading
 555                         # is genealogical, there should be only one, but we will check
 556                         # that implicitly later.
 557                         foreach my $root ( @roots ) {
 558                                 # Prune the tree to get rid of extraneous hypotheticals.
 559                                 $root = _prune_subtree( $part, $root, $contig );
 560                                 next unless $root;
 561                                 # Save this root for our group.
 562                                 push( @group_roots, $root );
 563                                 # Get all the successor nodes of our root.
 564                         }
 565                 } else {
 566                         # Dispense with the trivial case of one reading.
 567                         my $wit = $g->[0];
 568                         @group_roots = ( $wit );
 569                         foreach my $v ( $part->vertices ) {
 570                                 $part->delete_vertex( $v ) unless $v eq $wit;
 571                         }
 572         }
 573
 574         if( @group_roots > 1 ) {
 575                 $conflict->{$gst} = 1;
 576                 $is_conflicted = 1;
 577         }
 578         # Paint the 'hypotheticals' with our group.
 579                 foreach my $wit ( $part->vertices ) {
 580                         if( ref( $contig->{$wit} ) ) {
 581                                 push( @{$contig->{$wit}}, $gst );
 582                         } elsif( $contig->{$wit} ne $gst ) {
 583                                 warn "How did we get here?";
 584                         }
 585                 }
 586
 587
 588                 # Save the relevant subgraph.
 589                 $subgraph->{$gst} = $part;
 590     }
 591
 592         # For each of our hypothetical readings, flatten its 'contig' array if
 593         # the array contains zero or one group.  If we have any unflattened arrays,
 594         # we may need to run the resolution process. If the reading is already known
 595         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 596         # it.
 597         my @resolve;
 598         foreach my $wit ( keys %$contig ) {
 599                 next unless ref( $contig->{$wit} );
 600                 if( @{$contig->{$wit}} > 1 ) {
 601                         if( $is_conflicted ) {
 602                                 $contig->{$wit} = '';  # We aren't going to decide.
 603                         } else {
 604                                 push( @resolve, $wit );
 605                         }
 606                 } else {
 607                         my $gst = pop @{$contig->{$wit}};
 608                         $contig->{$wit} = $gst || '';
 609                 }
 610         }
 611
 612     if( @resolve ) {
 613         my $still_contig = {};
 614         foreach my $h ( @resolve ) {
 615             # For each of the hypothetical readings with more than one possibility,
 616             # try deleting it from each of its member subgraphs in turn, and see
 617             # if that breaks the contiguous grouping.
 618             # TODO This can still break in a corner case where group A can use
 619             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 620             # Revisit this if necessary; it could get brute-force nasty.
 621             foreach my $gst ( @{$contig->{$h}} ) {
 622                 my $gpart = $subgraph->{$gst}->copy();
 623                 # If we have come this far, there is only one root and everything
 624                 # is reachable from it.
 625                 my( $root ) = $gpart->predecessorless_vertices;
 626                 my $reachable = {};
 627                 map { $reachable->{$_} = 1 } $gpart->vertices;
 628
 629                 # Try deleting the hypothetical node.
 630                 $gpart->delete_vertex( $h );
 631                 if( $h eq $root ) {
 632                         # See if we still have a single root.
 633                         my @roots = $gpart->predecessorless_vertices;
 634                         warn "This shouldn't have happened" unless @roots;
 635                         if( @roots > 1 ) {
 636                                 # $h is needed by this group.
 637                                 if( exists( $still_contig->{$h} ) ) {
 638                                         # Conflict!
 639                                         $conflict->{$gst} = 1;
 640                                         $still_contig->{$h} = '';
 641                                 } else {
 642                                         $still_contig->{$h} = $gst;
 643                                 }
 644                         }
 645                 } else {
 646                         # $h is somewhere in the middle. See if everything
 647                         # else can still be reached from the root.
 648                                         my %still_reachable = ( $root => 1 );
 649                                         map { $still_reachable{$_} = 1 }
 650                                                 $gpart->all_successors( $root );
 651                                         foreach my $v ( keys %$reachable ) {
 652                                                 next if $v eq $h;
 653                                                 if( !$still_reachable{$v}
 654                                                         && ( $contig->{$v} eq $gst
 655                                                                  || ( exists $still_contig->{$v}
 656                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 657                                                         # We need $h.
 658                                                         if( exists $still_contig->{$h} ) {
 659                                                                 # Conflict!
 660                                                                 $conflict->{$gst} = 1;
 661                                                                 $still_contig->{$h} = '';
 662                                                         } else {
 663                                                                 $still_contig->{$h} = $gst;
 664                                                         }
 665                                                         last;
 666                                                 } # else we don't need $h in this group.
 667                                         } # end foreach $v
 668                                 } # endif $h eq $root
 669             } # end foreach $gst
 670         } # end foreach $h
 671
 672         # Now we have some hypothetical vertices in $still_contig that are the
 673         # "real" group memberships.  Replace these in $contig.
 674                 foreach my $v ( keys %$contig ) {
 675                         next unless ref $contig->{$v};
 676                         $contig->{$v} = $still_contig->{$v};
 677                 }
 678     } # end if @resolve
 679
 680     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 681         my $variant_row = [ [], $is_genealogical ];
 682         # Fill in the groupings from $contig.
 683         foreach my $g ( @$groups ) {
 684         my $gst = wit_stringify( $g );
 685         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 686         push( @{$variant_row->[0]}, \@realgroup );
 687     }
 688     return $variant_row;
 689 }
 690
 691 sub _prune_subtree {
 692     my( $tree, $root, $contighash ) = @_;
 693     # First, delete hypothetical leaves / orphans until there are none left.
 694     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 695         $tree->successorless_vertices;
 696     while( @orphan_hypotheticals ) {
 697         $tree->delete_vertices( @orphan_hypotheticals );
 698         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 699             $tree->successorless_vertices;
 700     }
 701     # Then delete a hypothetical root with only one successor, moving the
 702     # root to the first child that has no other predecessors.
 703     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 704         my @nextroot = $tree->successors( $root );
 705         $tree->delete_vertex( $root );
 706         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 707     }
 708     # The tree has been modified in place, but we need to know the new root.
 709     $root = undef unless $root && $tree->has_vertex( $root );
 710     return $root;
 711 }
 712 # Add the variant, subject to a.c. representation logic.
 713 # This assumes that we will see the 'main' version before the a.c. version.
 714 sub add_variant_wit {
 715     my( $arr, $wit, $acstr ) = @_;
 716     my $skip;
 717     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 718         my $real = $1;
 719         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 720     }
 721     push( @$arr, $wit ) unless $skip;
 722 }
 723
 724 sub _useful_variant {
 725         my( $group_readings, $graph, $acstr ) = @_;
 726
 727         # TODO Decide what to do with AC witnesses
 728
 729         # Sort by group size and return
 730         my $is_useful = 0;
 731         my( @readings, @groups );   # The sorted groups for our answer.
 732         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 733                 keys %$group_readings ) {
 734                 push( @readings, $rdg );
 735                 push( @groups, $group_readings->{$rdg} );
 736                 if( @{$group_readings->{$rdg}} > 1 ) {
 737                         $is_useful++;
 738                 } else {
 739                         my( $wit ) = @{$group_readings->{$rdg}};
 740                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 741                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 742                 }
 743         }
 744         if( $is_useful > 1 ) {
 745                 return( \@readings, \@groups );
 746         } else {
 747                 return( [], [] );
 748         }
 749 }
 750
 751 =head2 wit_stringify( $groups )
 752
 753 Takes an array of witness groupings and produces a string like
 754 ['A','B'] / ['C','D','E'] / ['F']
 755
 756 =cut
 757
 758 sub wit_stringify {
 759     my $groups = shift;
 760     my @gst;
 761     # If we were passed an array of witnesses instead of an array of
 762     # groupings, then "group" the witnesses first.
 763     unless( ref( $groups->[0] ) ) {
 764         my $mkgrp = [ $groups ];
 765         $groups = $mkgrp;
 766     }
 767     foreach my $g ( @$groups ) {
 768         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 769     }
 770     return join( ' / ', @gst );
 771 }
 772
 773 # Helper function to ensure that X and X a.c. never appear in the same list.
 774 sub _add_to_witlist {
 775         my( $wit, $list, $acstr ) = @_;
 776         my %inlist;
 777         my $idx = 0;
 778         map { $inlist{$_} = $idx++ } @$list;
 779         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 780                 my $acwit = $1;
 781                 unless( exists $inlist{$acwit} ) {
 782                         push( @$list, $acwit.$acstr );
 783                 }
 784         } else {
 785                 if( exists( $inlist{$wit.$acstr} ) ) {
 786                         # Replace the a.c. version with the main witness
 787                         my $i = $inlist{$wit.$acstr};
 788                         $list->[$i] = $wit;
 789                 } else {
 790                         push( @$list, $wit );
 791                 }
 792         }
 793 }
 794
 795 sub _symmdiff {
 796         my( $lista, $listb ) = @_;
 797         my %union;
 798         my %scalars;
 799         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 800         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 801         my @set = grep { $union{$_} == 1 } keys %union;
 802         return map { $scalars{$_} } @set;
 803 }
 804
 805 1;
 806
 807 =head1 LICENSE
 808
 809 This package is free software and is provided "as is" without express
 810 or implied warranty.  You can redistribute it and/or modify it under
 811 the same terms as Perl itself.
 812
 813 =head1 AUTHOR
 814
 815 Tara L Andrews E<lt>aurum@cpan.orgE<gt>