lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use JSON qw/ encode_json decode_json /;
   9 use LWP::UserAgent;
  10 use Text::Tradition;
  11 use Text::Tradition::Stemma;
  12
  13 use vars qw/ @EXPORT_OK /;
  14 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  15
  16 =head1 NAME
  17
  18 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  19
  20 =head1 SYNOPSIS
  21
  22   use Text::Tradition;
  23   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  24   my $t = Text::Tradition->new(
  25     'name' => 'this is a text',
  26     'input' => 'TEI',
  27     'file' => '/path/to/tei_parallel_seg_file.xml' );
  28   $t->add_stemma( 'dotfile' => $stemmafile );
  29
  30   my $variant_data = run_analysis( $tradition );
  31   # Recalculate rank $n treating all orthographic variants as equivalent
  32   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  33
  34 =head1 DESCRIPTION
  35
  36 Text::Tradition is a library for representation and analysis of collated
  37 texts, particularly medieval ones.  The Collation is the central feature of
  38 a Tradition, where the text, its sequence of readings, and its relationships
  39 between readings are actually kept.
  40
  41 =head1 SUBROUTINES
  42
  43 =head2 run_analysis( $tradition, %opts )
  44
  45 Runs the analysis described in analyze_variant_location on every location in the
  46 collation of the given tradition, with the given options. These include:
  47
  48 =over 4
  49
  50 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  51 is 0 (i.e. the first).
  52
  53 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  54
  55 =item * merge_types - Specify a list of relationship types, where related readings
  56 should be treated as identical for the purposes of analysis.
  57
  58 =back
  59
  60 =begin testing
  61
  62 use Text::Tradition;
  63 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  64
  65 my $datafile = 't/data/florilegium_tei_ps.xml';
  66 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  67                                       'name' => 'test0',
  68                                       'file' => $datafile );
  69 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  70 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  71
  72 my %expected_genealogical = (
  73         1 => 0,
  74         2 => 1,
  75         3 =>  0,
  76         5 =>  0,
  77         7 =>  0,
  78         8 =>  0,
  79         10 => 0,
  80         13 => 1,
  81         33 => 0,
  82         34 => 0,
  83         37 => 0,
  84         60 => 0,
  85         81 => 1,
  86         84 => 0,
  87         87 => 0,
  88         101 => 0,
  89         102 => 0,
  90         122 => 1,
  91         157 => 0,
  92         166 => 1,
  93         169 => 1,
  94         200 => 0,
  95         216 => 1,
  96         217 => 1,
  97         219 => 1,
  98         241 => 1,
  99         242 => 1,
 100         243 => 1,
 101 );
 102
 103 my $data = run_analysis( $tradition );
 104 foreach my $row ( @{$data->{'variants'}} ) {
 105         # Account for rows that used to be "not useful"
 106         unless( exists $expected_genealogical{$row->{'id'}} ) {
 107                 $expected_genealogical{$row->{'id'}} = 1;
 108         }
 109         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 110                 "Got correct genealogical flag for row " . $row->{'id'} );
 111 }
 112 is( $data->{'conflict_count'}, 34, "Got right conflict count" );
 113 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 114
 115 =end testing
 116
 117 =cut
 118
 119 sub run_analysis {
 120         my( $tradition, %opts ) = @_;
 121         my $c = $tradition->collation;
 122
 123         my $stemma_id = $opts{'stemma_id'} || 0;
 124         my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
 125         my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
 126
 127         # Get the stemma
 128         my $stemma = $tradition->stemma( $stemma_id );
 129         # Figure out which witnesses we are working with
 130         my @lacunose = $stemma->hypotheticals;
 131         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 132         map { push( @tradition_wits, $_->sigil."_ac" ) if $_->is_layered }
 133                 $tradition->witnesses;
 134         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 135
 136         # Find and mark 'common' ranks for exclusion, unless they were
 137         # explicitly specified.
 138         unless( @ranks ) {
 139                 my %common_rank;
 140                 foreach my $rdg ( $c->common_readings ) {
 141                         $common_rank{$rdg->rank} = 1;
 142                 }
 143                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 144         }
 145
 146         # Group the variants to send to the solver
 147         my @groups;
 148         my %lacunae;
 149         foreach my $rank ( @ranks ) {
 150                 my $missing = [ @lacunose ];
 151                 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
 152                 $lacunae{$rank} = $missing;
 153         }
 154
 155         # Parse the answer
 156         my $answer = solve_variants( $stemma->editable( ' ' ), @groups );
 157
 158         # Do further analysis on the answer
 159         my $conflict_count = 0;
 160         foreach my $idx ( 0 .. $#ranks ) {
 161                 my $location = $answer->{'variants'}->[$idx];
 162                 # Add the rank back in
 163                 $location->{'id'} = $ranks[$idx];
 164                 # Add the lacunae back in
 165                 $location->{'missing'} = $lacunae{$ranks[$idx]};
 166                 # Run the extra analysis we need.
 167                 analyze_location( $tradition, $stemma->graph, $location );
 168                 # Add the reading text back in
 169                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 170                         $conflict_count++
 171                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 172                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 173                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 174                 }
 175         }
 176         $answer->{'conflict_count'} = $conflict_count;
 177
 178         return $answer;
 179 }
 180
 181 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 182
 183 Groups the variants at the given $rank of the collation, treating any
 184 relationships in @merge_relationship_types as equivalent.  $lacunose should
 185 be a reference to an array, to which the sigla of lacunose witnesses at this
 186 rank will be appended.
 187
 188 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
 189 by the witnesses listed in $groups->[$n].
 190
 191 =cut
 192
 193 # Return group_readings, groups, lacunose
 194 sub group_variants {
 195         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 196         my $c = $tradition->collation;
 197         my $aclabel = $c->ac_label;
 198         # Get the alignment table readings
 199         my %readings_at_rank;
 200         my @gap_wits;
 201         foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
 202                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 203                 my $wit = $tablewit->{'witness'};
 204                 $wit =~ s/^(.*)\Q$aclabel\E$/${1}_ac/;
 205                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 206                         _add_to_witlist( $wit, $lacunose, '_ac' );
 207                 } elsif( $rdg ) {
 208                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 209                 } else {
 210                         _add_to_witlist( $wit, \@gap_wits, '_ac' );
 211                 }
 212         }
 213
 214         # Group the readings, collapsing groups by relationship if needed
 215         my %grouped_readings;
 216         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
 217                 # Skip readings that have been collapsed into others.
 218                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 219                 my @wits = $rdg->witnesses;
 220                 map { s/\Q$aclabel\E$/_ac/ } @wits;
 221                 if( $collapse ) {
 222                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 223                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 224                                 my @otherwits = $other->witnesses;
 225                                 map { s/\Q$aclabel\E$/_ac/ } @otherwits;
 226                                 push( @wits, @otherwits );
 227                                 $grouped_readings{$other->id} = 0;
 228                         }
 229                 }
 230                 $grouped_readings{$rdg->id} = \@wits;
 231         }
 232         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 233         # Get rid of our collapsed readings
 234         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 235                 keys %grouped_readings
 236                 if $collapse;
 237
 238         return \%grouped_readings;
 239 }
 240
 241 =head2 solve_variants( $graph, @groups )
 242
 243 Sends the set of groups to the external graph solver service and returns
 244 a cleaned-up answer, adding the rank IDs back where they belong.
 245
 246 The JSON has the form
 247   { "graph": [ stemmagraph DOT string without newlines ],
 248     "groupings": [ array of arrays of groups, one per rank ] }
 249
 250 The answer has the form
 251   { "variants" => [ array of variant location structures ],
 252     "variant_count" => total,
 253     "conflict_count" => number of conflicts detected,
 254     "genealogical_count" => number of solutions found }
 255
 256 =cut
 257
 258 sub solve_variants {
 259         my( $graph, @groups ) = @_;
 260
 261         # Make the json with stemma + groups
 262         my $jsonstruct = { 'graph' => $graph, 'groupings' => [] };
 263         foreach my $ghash ( @groups ) {
 264                 my @grouping;
 265                 foreach my $k ( sort keys %$ghash ) {
 266                         push( @grouping, $ghash->{$k} );
 267                 }
 268                 push( @{$jsonstruct->{'groupings'}}, \@grouping );
 269         }
 270         my $json = encode_json( $jsonstruct );
 271
 272         # Send it off and get the result
 273         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 274         my $ua = LWP::UserAgent->new();
 275         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 276                                                   'Content' => $json );
 277
 278         my $answer;
 279         if( $resp->is_success ) {
 280                 $answer = decode_json( $resp->content );
 281         } else {
 282                 # Fall back to the old method.
 283                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 284                         . "; falling back to perl method";
 285                 $answer = perl_solver( $graph, @groups );
 286         }
 287
 288         # Fold the result back into what we know about the groups.
 289         my $variants = [];
 290         my $genealogical = 0;
 291         foreach my $idx ( 0 .. $#groups ) {
 292                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 293                 $genealogical++ if $result;
 294                 my $input_group = $groups[$idx];
 295                 foreach my $k ( sort keys %$input_group ) {
 296                         my $cg = shift @$calc_groups;
 297                         $input_group->{$k} = $cg;
 298                 }
 299                 my $vstruct = {
 300                         'genealogical' => $result,
 301                         'readings' => [],
 302                 };
 303                 foreach my $k ( keys %$input_group ) {
 304                         push( @{$vstruct->{'readings'}},
 305                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 306                 }
 307                 push( @$variants, $vstruct );
 308         }
 309
 310         return { 'variants' => $variants,
 311                          'variant_count' => scalar @$variants,
 312                          'genealogical_count' => $genealogical };
 313 }
 314
 315 =head2 analyze_location ( $tradition, $graph, $location_hash )
 316
 317 Given the tradition, its stemma graph, and the solution from the graph solver,
 318 work out the rest of the information we want.  For each reading we need missing,
 319 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 320
 321 =cut
 322
 323 sub analyze_location {
 324         my ( $tradition, $graph, $variant_row ) = @_;
 325
 326         # Make a hash of all known node memberships, and make the subgraphs.
 327         my $contig = {};
 328         my $reading_roots = {};
 329         my $subgraph = {};
 330     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 331         my $rid = $rdghash->{'readingid'};
 332                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 333
 334         # Make the subgraph.
 335         my $part = $graph->copy;
 336                 my %these_vertices;
 337                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 338                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 339                 $subgraph->{$rid} = $part;
 340                 # Get the reading roots.
 341                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 342         }
 343
 344         # Now that we have all the node group memberships, calculate followed/
 345     # non-followed/unknown values for each reading.  Also figure out the
 346     # reading's evident parent(s).
 347     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 348         # Group string key - TODO do we need this?
 349         my $gst = wit_stringify( $rdghash->{'group'} );
 350         my $rid = $rdghash->{'readingid'};
 351         # Get the subgraph
 352         my $part = $subgraph->{$rid};
 353
 354         # Start figuring things out.
 355         my @roots = $part->predecessorless_vertices;
 356         $rdghash->{'independent_occurrence'} = scalar @roots;
 357         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 358         # Find the parent readings, if any, of this reading.
 359         my %rdgparents;
 360         foreach my $wit ( @roots ) {
 361                 # Look in the main stemma to find this witness's extant or known-reading
 362                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 363                         my @check = $graph->predecessors( $wit );
 364                         while( @check ) {
 365                                 my @next;
 366                                 foreach my $wparent( @check ) {
 367                                         my $preading = $contig->{$wparent};
 368                                         if( $preading ) {
 369                                                 $rdgparents{$preading} = 1;
 370                                         } else {
 371                                                 push( @next, $graph->predecessors( $wparent ) );
 372                                         }
 373                                 }
 374                                 @check = @next;
 375                         }
 376                 }
 377                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 378
 379                 # Find the number of times this reading was altered, and the number of
 380                 # times we're not sure.
 381                 my( %nofollow, %unknownfollow );
 382                 foreach my $wit ( $part->vertices ) {
 383                         foreach my $wchild ( $graph->successors( $wit ) ) {
 384                                 next if $part->has_vertex( $wchild );
 385                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 386                                         # It definitely changed here.
 387                                         $nofollow{$wchild} = 1;
 388                                 } elsif( !($contig->{$wchild}) ) {
 389                                         # The child is a hypothetical node not definitely in
 390                                         # any group. Answer is unknown.
 391                                         $unknownfollow{$wchild} = 1;
 392                                 } # else it's a non-root node in a known group, and therefore
 393                                   # is presumed to have its reading from its group, not this link.
 394                         }
 395                 }
 396                 $rdghash->{'not_followed'} = keys %nofollow;
 397                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 398
 399                 # Now say whether this reading represents a conflict.
 400                 unless( $variant_row->{'genealogical'} ) {
 401                         $rdghash->{'conflict'} = @roots != 1;
 402                 }
 403     }
 404 }
 405
 406
 407 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 408
 409 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 410 to be correct.  Serves as a backup for the real solver.
 411
 412 Runs an analysis of the given tradition, at the location given in $rank,
 413 against the graph of the stemma specified in $stemma_id.  The argument
 414 @merge_relationship_types is an optional list of relationship types for
 415 which readings so related should be treated as equivalent.
 416
 417 Returns a nested array data structure as follows:
 418
 419  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 420
 421 where the group list is the array of arrays passed in for each element of @groups,
 422 possibly with the addition of hypothetical readings.
 423
 424
 425 =cut
 426
 427 sub perl_solver {
 428         my( $graph, @groups ) = @_;
 429
 430         warn "Not implemented yet";
 431         return [];
 432 }
 433
 434         # Now do the work.
 435 #     my $contig = {};
 436 #     my $subgraph = {};
 437 #     my $is_conflicted;
 438 #     my $conflict = {};
 439 #     my %reading_roots;
 440 #     my $variant_row = { 'id' => $rank, 'readings' => [] };
 441 #     # Mark each ms as in its own group, first.
 442 #     foreach my $g ( @$groups ) {
 443 #         my $gst = wit_stringify( $g );
 444 #         map { $contig->{$_} = $gst } @$g;
 445 #     }
 446 #     # Now for each unmarked node in the graph, initialize an array
 447 #     # for possible group memberships.  We will use this later to
 448 #     # resolve potential conflicts.
 449 #     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 450 #     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 451 #         my $gst = wit_stringify( $g );  # This is the group name
 452 #         # Copy the graph, and delete all non-members from the new graph.
 453 #         my $part = $graph->copy;
 454 #         my @group_roots;
 455 #         $part->delete_vertices(
 456 #             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 457 #
 458 #         # Now look to see if our group is connected.
 459 #         if( $undirected ) { # For use with distance trees etc.
 460 #             # Find all vertices reachable from the first (arbitrary) group
 461 #             # member.  If we are genealogical this should include them all.
 462 #             my $reachable = {};
 463 #             map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
 464 #             # TODO This is a terrible way to do distance trees, since all
 465 #             # non-leaf nodes are included in every graph part now. We may
 466 #             # have to go back to SPDP.
 467 #         } else {
 468 #             if( @$g > 1 ) {
 469 #                 # We have to take directionality into account.
 470 #                 # How many root nodes do we have?
 471 #                 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 472 #                     $part->predecessorless_vertices;
 473 #                 # Assuming that @$g > 1, find the first root node that has at
 474 #                 # least one successor belonging to our group. If this reading
 475 #                 # is genealogical, there should be only one, but we will check
 476 #                 # that implicitly later.
 477 #                 foreach my $root ( @roots ) {
 478 #                     # Prune the tree to get rid of extraneous hypotheticals.
 479 #                     $root = _prune_subtree( $part, $root, $contig );
 480 #                     next unless $root;
 481 #                     # Save this root for our group.
 482 #                     push( @group_roots, $root );
 483 #                     # Get all the successor nodes of our root.
 484 #                 }
 485 #             } else {
 486 #               # Dispense with the trivial case of one reading.
 487 #               my $wit = pop @$g;
 488 #                 @group_roots = ( $wit );
 489 #                 foreach my $v ( $part->vertices ) {
 490 #                       $part->delete_vertex( $v ) unless $v eq $wit;
 491 #                 }
 492 #             }
 493 #         }
 494 #
 495 #         map { $reading_roots{$_} = 1 } @group_roots;
 496 #         if( @group_roots > 1 ) {
 497 #               $conflict->{$group_readings->{$gst}} = 1;
 498 #               $is_conflicted = 1;
 499 #         }
 500 #         # Paint the 'hypotheticals' with our group.
 501 #               foreach my $wit ( $part->vertices ) {
 502 #                       if( ref( $contig->{$wit} ) ) {
 503 #                               push( @{$contig->{$wit}}, $gst );
 504 #                       } elsif( $contig->{$wit} ne $gst ) {
 505 #                               warn "How did we get here?";
 506 #                       }
 507 #               }
 508 #
 509 #
 510 #         # Start to write the reading, and save the group subgraph.
 511 #         my $reading = { 'readingid' => $group_readings->{$gst},
 512 #                         'missing' => wit_stringify( \@lacunose ),
 513 #                         'group' => $gst };  # This will change if we find no conflict
 514 #               # Save the relevant subgraph.
 515 #               $subgraph->{$gst} = $part;
 516 #         push( @{$variant_row->{'readings'}}, $reading );
 517 #     }
 518 #
 519 #       # For each of our hypothetical readings, flatten its 'contig' array if
 520 #       # the array contains zero or one group.  If we have any unflattened arrays,
 521 #       # we may need to run the resolution process. If the reading is already known
 522 #       # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 523 #       # it.
 524 #       my @resolve;
 525 #       foreach my $wit ( keys %$contig ) {
 526 #               next unless ref( $contig->{$wit} );
 527 #               if( @{$contig->{$wit}} > 1 ) {
 528 #                       if( $is_conflicted ) {
 529 #                               $contig->{$wit} = '';  # We aren't going to decide.
 530 #                       } else {
 531 #                               push( @resolve, $wit );
 532 #                       }
 533 #               } else {
 534 #                       my $gst = pop @{$contig->{$wit}};
 535 #                       $contig->{$wit} = $gst || '';
 536 #               }
 537 #       }
 538 #
 539 #     if( @resolve ) {
 540 #         my $still_contig = {};
 541 #         foreach my $h ( @resolve ) {
 542 #             # For each of the hypothetical readings with more than one possibility,
 543 #             # try deleting it from each of its member subgraphs in turn, and see
 544 #             # if that breaks the contiguous grouping.
 545 #             # TODO This can still break in a corner case where group A can use
 546 #             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 547 #             # Revisit this if necessary; it could get brute-force nasty.
 548 #             foreach my $gst ( @{$contig->{$h}} ) {
 549 #                 my $gpart = $subgraph->{$gst}->copy();
 550 #                 # If we have come this far, there is only one root and everything
 551 #                 # is reachable from it.
 552 #                 my( $root ) = $gpart->predecessorless_vertices;
 553 #                 my $reachable = {};
 554 #                 map { $reachable->{$_} = 1 } $gpart->vertices;
 555 #
 556 #                 # Try deleting the hypothetical node.
 557 #                 $gpart->delete_vertex( $h );
 558 #                 if( $h eq $root ) {
 559 #                       # See if we still have a single root.
 560 #                       my @roots = $gpart->predecessorless_vertices;
 561 #                       warn "This shouldn't have happened" unless @roots;
 562 #                       if( @roots > 1 ) {
 563 #                               # $h is needed by this group.
 564 #                               if( exists( $still_contig->{$h} ) ) {
 565 #                                       # Conflict!
 566 #                                       $conflict->{$group_readings->{$gst}} = 1;
 567 #                                       $still_contig->{$h} = '';
 568 #                               } else {
 569 #                                       $still_contig->{$h} = $gst;
 570 #                               }
 571 #                       }
 572 #                 } else {
 573 #                       # $h is somewhere in the middle. See if everything
 574 #                       # else can still be reached from the root.
 575 #                                       my %still_reachable = ( $root => 1 );
 576 #                                       map { $still_reachable{$_} = 1 }
 577 #                                               $gpart->all_successors( $root );
 578 #                                       foreach my $v ( keys %$reachable ) {
 579 #                                               next if $v eq $h;
 580 #                                               if( !$still_reachable{$v}
 581 #                                                       && ( $contig->{$v} eq $gst
 582 #                                                                || ( exists $still_contig->{$v}
 583 #                                                                         && $still_contig->{$v} eq $gst ) ) ) {
 584 #                                                       # We need $h.
 585 #                                                       if( exists $still_contig->{$h} ) {
 586 #                                                               # Conflict!
 587 #                                                               $conflict->{$group_readings->{$gst}} = 1;
 588 #                                                               $still_contig->{$h} = '';
 589 #                                                       } else {
 590 #                                                               $still_contig->{$h} = $gst;
 591 #                                                       }
 592 #                                                       last;
 593 #                                               } # else we don't need $h in this group.
 594 #                                       } # end foreach $v
 595 #                               } # endif $h eq $root
 596 #             } # end foreach $gst
 597 #         } # end foreach $h
 598 #
 599 #         # Now we have some hypothetical vertices in $still_contig that are the
 600 #         # "real" group memberships.  Replace these in $contig.
 601 #               foreach my $v ( keys %$contig ) {
 602 #                       next unless ref $contig->{$v};
 603 #                       $contig->{$v} = $still_contig->{$v};
 604 #               }
 605 #     } # end if @resolve
 606 #
 607 #
 608 #     $variant_row->{'genealogical'} = !( keys %$conflict );
 609 #     return $variant_row;
 610 # }
 611
 612 sub _prune_subtree {
 613     my( $tree, $root, $contighash ) = @_;
 614     # First, delete hypothetical leaves / orphans until there are none left.
 615     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 616         $tree->successorless_vertices;
 617     while( @orphan_hypotheticals ) {
 618         $tree->delete_vertices( @orphan_hypotheticals );
 619         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 620             $tree->successorless_vertices;
 621     }
 622     # Then delete a hypothetical root with only one successor, moving the
 623     # root to the first child that has no other predecessors.
 624     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 625         my @nextroot = $tree->successors( $root );
 626         $tree->delete_vertex( $root );
 627         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 628     }
 629     # The tree has been modified in place, but we need to know the new root.
 630     $root = undef unless $root && $tree->has_vertex( $root );
 631     return $root;
 632 }
 633 # Add the variant, subject to a.c. representation logic.
 634 # This assumes that we will see the 'main' version before the a.c. version.
 635 sub add_variant_wit {
 636     my( $arr, $wit, $acstr ) = @_;
 637     my $skip;
 638     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 639         my $real = $1;
 640         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 641     }
 642     push( @$arr, $wit ) unless $skip;
 643 }
 644
 645 sub _useful_variant {
 646         my( $group_readings, $graph, $acstr ) = @_;
 647
 648         # TODO Decide what to do with AC witnesses
 649
 650         # Sort by group size and return
 651         my $is_useful = 0;
 652         my( @readings, @groups );   # The sorted groups for our answer.
 653         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 654                 keys %$group_readings ) {
 655                 push( @readings, $rdg );
 656                 push( @groups, $group_readings->{$rdg} );
 657                 if( @{$group_readings->{$rdg}} > 1 ) {
 658                         $is_useful++;
 659                 } else {
 660                         my( $wit ) = @{$group_readings->{$rdg}};
 661                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 662                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 663                 }
 664         }
 665         if( $is_useful > 1 ) {
 666                 return( \@readings, \@groups );
 667         } else {
 668                 return( [], [] );
 669         }
 670 }
 671
 672 =head2 wit_stringify( $groups )
 673
 674 Takes an array of witness groupings and produces a string like
 675 ['A','B'] / ['C','D','E'] / ['F']
 676
 677 =cut
 678
 679 sub wit_stringify {
 680     my $groups = shift;
 681     my @gst;
 682     # If we were passed an array of witnesses instead of an array of
 683     # groupings, then "group" the witnesses first.
 684     unless( ref( $groups->[0] ) ) {
 685         my $mkgrp = [ $groups ];
 686         $groups = $mkgrp;
 687     }
 688     foreach my $g ( @$groups ) {
 689         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 690     }
 691     return join( ' / ', @gst );
 692 }
 693
 694 # Helper function to ensure that X and X a.c. never appear in the same list.
 695 sub _add_to_witlist {
 696         my( $wit, $list, $acstr ) = @_;
 697         my %inlist;
 698         my $idx = 0;
 699         map { $inlist{$_} = $idx++ } @$list;
 700         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 701                 my $acwit = $1;
 702                 unless( exists $inlist{$acwit} ) {
 703                         push( @$list, $acwit.$acstr );
 704                 }
 705         } else {
 706                 if( exists( $inlist{$wit.$acstr} ) ) {
 707                         # Replace the a.c. version with the main witness
 708                         my $i = $inlist{$wit.$acstr};
 709                         $list->[$i] = $wit;
 710                 } else {
 711                         push( @$list, $wit );
 712                 }
 713         }
 714 }
 715
 716 sub _symmdiff {
 717         my( $lista, $listb ) = @_;
 718         my %union;
 719         my %scalars;
 720         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 721         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 722         my @set = grep { $union{$_} == 1 } keys %union;
 723         return map { $scalars{$_} } @set;
 724 }
 725
 726 1;
 727
 728 =head1 LICENSE
 729
 730 This package is free software and is provided "as is" without express
 731 or implied warranty.  You can redistribute it and/or modify it under
 732 the same terms as Perl itself.
 733
 734 =head1 AUTHOR
 735
 736 Tara L Andrews E<lt>aurum@cpan.orgE<gt>