lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use JSON qw/ encode_json decode_json /;
   9 use LWP::UserAgent;
  10 use Text::Tradition;
  11 use Text::Tradition::Stemma;
  12
  13 use vars qw/ @EXPORT_OK /;
  14 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  15
  16 =head1 NAME
  17
  18 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  19
  20 =head1 SYNOPSIS
  21
  22   use Text::Tradition;
  23   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  24   my $t = Text::Tradition->new(
  25     'name' => 'this is a text',
  26     'input' => 'TEI',
  27     'file' => '/path/to/tei_parallel_seg_file.xml' );
  28   $t->add_stemma( 'dotfile' => $stemmafile );
  29
  30   my $variant_data = run_analysis( $tradition );
  31   # Recalculate rank $n treating all orthographic variants as equivalent
  32   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  33
  34 =head1 DESCRIPTION
  35
  36 Text::Tradition is a library for representation and analysis of collated
  37 texts, particularly medieval ones.  The Collation is the central feature of
  38 a Tradition, where the text, its sequence of readings, and its relationships
  39 between readings are actually kept.
  40
  41 =head1 SUBROUTINES
  42
  43 =head2 run_analysis( $tradition, %opts )
  44
  45 Runs the analysis described in analyze_variant_location on every location in the
  46 collation of the given tradition, with the given options. These include:
  47
  48 =over 4
  49
  50 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  51 is 0 (i.e. the first).
  52
  53 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  54
  55 =item * merge_types - Specify a list of relationship types, where related readings
  56 should be treated as identical for the purposes of analysis.
  57
  58 =back
  59
  60 =begin testing
  61
  62 use Text::Tradition;
  63 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  64
  65 my $datafile = 't/data/florilegium_tei_ps.xml';
  66 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  67                                       'name' => 'test0',
  68                                       'file' => $datafile );
  69 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  70 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  71
  72 my %expected_genealogical = (
  73         1 => '',
  74         2 => 1,
  75         3 =>  '',
  76         5 =>  '',
  77         7 =>  '',
  78         8 =>  '',
  79         10 => '',
  80         13 => 1,
  81         33 => '',
  82         34 => '',
  83         37 => '',
  84         60 => '',
  85         81 => 1,
  86         84 => '',
  87         87 => '',
  88         101 => '',
  89         102 => '',
  90         122 => 1,
  91         157 => '',
  92         166 => 1,
  93         169 => 1,
  94         200 => 1,
  95         216 => 1,
  96         217 => 1,
  97         219 => 1,
  98         241 => 1,
  99         242 => 1,
 100         243 => 1,
 101 );
 102
 103 my $data = run_analysis( $tradition );
 104 foreach my $row ( @{$data->{'variants'}} ) {
 105         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 106                 "Got correct genealogical flag for row " . $row->{'id'} );
 107 }
 108 is( $data->{'conflict_count'}, 16, "Got right conflict count" );
 109 is( $data->{'variant_count'}, 28, "Got right total variant number" );
 110
 111 =end testing
 112
 113 =cut
 114
 115 sub run_analysis {
 116         my( $tradition, %opts ) = @_;
 117         my $c = $tradition->collation;
 118
 119         my $stemma_id = $opts{'stemma_id'} || 0;
 120         my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
 121         my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
 122
 123         # Get the stemma
 124         my $stemma = $tradition->stemma( $stemma_id );
 125         # Figure out which witnesses we are working with
 126         my @lacunose = $stemma->hypotheticals;
 127         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 128         map { push( @tradition_wits, $_->sigil."_ac" ) if $_->is_layered }
 129                 $tradition->witnesses;
 130         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 131
 132         # Find and mark 'common' ranks for exclusion, unless they were
 133         # explicitly specified.
 134         unless( @ranks ) {
 135                 my %common_rank;
 136                 foreach my $rdg ( $tradition->collation->common_readings ) {
 137                         $common_rank{$rdg->rank} = 1;
 138                 }
 139                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 140         }
 141
 142         # Group the variants to send to the solver
 143         my @groups;
 144         foreach my $rank ( @ranks ) {
 145                 push( @groups, group_variants( $tradition, $rank, \@lacunose, \@collapse ) );
 146         }
 147
 148         # Parse the answer
 149         my $answer = solve_variants( $stemma->editable( ' ' ), @groups );
 150         $DB::single = 1;
 151
 152         # Do further analysis on the answer
 153         foreach my $idx ( 0 .. $#ranks ) {
 154                 my $location = $answer->{'variants'}->[$idx];
 155                 # Add the rank back in
 156                 $location->{'id'} = $ranks[$idx];
 157                 # Run the extra analysis we need.
 158                 analyze_location( $tradition, $stemma->graph, $location );
 159         }
 160
 161         return $answer;
 162 }
 163
 164 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 165
 166 Groups the variants at the given $rank of the collation, treating any
 167 relationships in @merge_relationship_types as equivalent.  $lacunose should
 168 be a reference to an array, to which the sigla of lacunose witnesses at this
 169 rank will be appended.
 170
 171 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
 172 by the witnesses listed in $groups->[$n].
 173
 174 =cut
 175
 176 # Return group_readings, groups, lacunose
 177 sub group_variants {
 178         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 179         my $c = $tradition->collation;
 180         my $aclabel = $c->ac_label;
 181         # Get the alignment table readings
 182         my %readings_at_rank;
 183         my @gap_wits;
 184         foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
 185                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 186                 my $wit = $tablewit->{'witness'};
 187                 $wit =~ s/^(.*)\Q$aclabel\E$/${1}_ac/;
 188                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 189                         _add_to_witlist( $wit, $lacunose, '_ac' );
 190                 } elsif( $rdg ) {
 191                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 192                 } else {
 193                         _add_to_witlist( $wit, \@gap_wits, '_ac' );
 194                 }
 195         }
 196
 197         # Group the readings, collapsing groups by relationship if needed
 198         my %grouped_readings;
 199         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
 200                 # Skip readings that have been collapsed into others.
 201                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 202                 my @wits = $rdg->witnesses;
 203                 map { s/\Q$aclabel\E$/_ac/ } @wits;
 204                 if( $collapse ) {
 205                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 206                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 207                                 my @otherwits = $other->witnesses;
 208                                 map { s/\Q$aclabel\E$/_ac/ } @otherwits;
 209                                 push( @wits, @otherwits );
 210                                 $grouped_readings{$other->id} = 0;
 211                         }
 212                 }
 213                 $grouped_readings{$rdg->id} = \@wits;
 214         }
 215         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 216         # Get rid of our collapsed readings
 217         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 218                 keys %grouped_readings
 219                 if $collapse;
 220
 221         return \%grouped_readings;
 222 }
 223
 224 =head2 solve_variants( $graph, @groups )
 225
 226 Sends the set of groups to the external graph solver service and returns
 227 a cleaned-up answer, adding the rank IDs back where they belong.
 228
 229 The JSON has the form
 230   { "graph": [ stemmagraph DOT string without newlines ],
 231     "groupings": [ array of arrays of groups, one per rank ] }
 232
 233 The answer has the form
 234   { "variants" => [ array of variant location structures ],
 235     "variant_count" => total,
 236     "conflict_count" => number of conflicts detected,
 237     "genealogical_count" => number of solutions found }
 238
 239 =cut
 240
 241 sub solve_variants {
 242         my( $graph, @groups ) = @_;
 243
 244         # Make the json with stemma + groups
 245         my $jsonstruct = { 'graph' => $graph, 'groupings' => [] };
 246         foreach my $ghash ( @groups ) {
 247                 my @grouping;
 248                 foreach my $k ( sort keys %$ghash ) {
 249                         push( @grouping, $ghash->{$k} );
 250                 }
 251                 push( @{$jsonstruct->{'groupings'}}, \@grouping );
 252         }
 253         my $json = encode_json( $jsonstruct );
 254
 255         # Send it off and get the result
 256         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 257         my $ua = LWP::UserAgent->new();
 258         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 259                                                   'Content' => $json );
 260
 261         my $answer;
 262         if( $resp->is_success ) {
 263                 $answer = decode_json( $resp->content );
 264         } else {
 265                 # Fall back to the old method.
 266                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 267                         . "; falling back to perl method";
 268                 $answer = perl_solver( $graph, @groups );
 269         }
 270
 271         # Fold the result back into what we know about the groups.
 272         my $variants = [];
 273         my $genealogical = 0;
 274         foreach my $idx ( 0 .. $#groups ) {
 275                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 276                 $genealogical++ if $result;
 277                 my $input_group = $groups[$idx];
 278                 foreach my $k ( sort keys %$input_group ) {
 279                         my $cg = shift @$calc_groups;
 280                         $input_group->{$k} = $cg;
 281                 }
 282                 my $vstruct = {
 283                         'genealogical' => $result,
 284                         'readings' => [],
 285                 };
 286                 foreach my $k ( keys %$input_group ) {
 287                         push( @{$vstruct->{'readings'}},
 288                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 289                 }
 290                 push( @$variants, $vstruct );
 291         }
 292
 293         return { 'variants' => $variants,
 294                          'variant_count' => scalar @$variants,
 295                          'genealogical_count' => $genealogical };
 296 }
 297
 298 =head2 analyze_location ( $tradition, $graph, $location_hash )
 299
 300 Given the tradition, its stemma graph, and the solution from the graph solver,
 301 work out the rest of the information we want.  For each reading we need missing,
 302 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 303
 304 =cut
 305
 306 sub analyze_location {
 307         my ( $tradition, $graph, $variant_row ) = @_;
 308
 309         # Make a hash of all known node memberships, and make the subgraphs.
 310         my $contig = {};
 311         my $reading_roots = {};
 312         my $subgraph = {};
 313     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 314         my $rid = $rdghash->{'readingid'};
 315                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 316
 317         # Make the subgraph.
 318         my $part = $graph->copy;
 319                 my %these_vertices;
 320                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 321                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 322                 $subgraph->{$rid} = $part;
 323                 # Get the reading roots.
 324                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 325         }
 326
 327         # Now that we have all the node group memberships, calculate followed/
 328     # non-followed/unknown values for each reading.  Also figure out the
 329     # reading's evident parent(s).
 330     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 331         # Group string key - TODO do we need this?
 332         my $gst = wit_stringify( $rdghash->{'group'} );
 333         my $rid = $rdghash->{'readingid'};
 334         # Get the subgraph
 335         my $part = $subgraph->{$rid};
 336
 337         # Start figuring things out.
 338         my @roots = $part->predecessorless_vertices;
 339         $rdghash->{'independent_occurrence'} = scalar @roots;
 340         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 341         # Find the parent readings, if any, of this reading.
 342         my %rdgparents;
 343         $DB::single = 1;
 344         foreach my $wit ( @roots ) {
 345                 # Look in the main stemma to find this witness's extant or known-reading
 346                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 347                         my @check = $graph->predecessors( $wit );
 348                         while( @check ) {
 349                                 my @next;
 350                                 foreach my $wparent( @check ) {
 351                                         my $preading = $contig->{$wparent};
 352                                         if( $preading ) {
 353                                                 $rdgparents{$preading} = 1;
 354                                         } else {
 355                                                 push( @next, $graph->predecessors( $wparent ) );
 356                                         }
 357                                 }
 358                                 @check = @next;
 359                         }
 360                 }
 361                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 362
 363                 # Find the number of times this reading was altered, and the number of
 364                 # times we're not sure.
 365                 my( %nofollow, %unknownfollow );
 366                 foreach my $wit ( $part->vertices ) {
 367                         foreach my $wchild ( $graph->successors( $wit ) ) {
 368                                 next if $part->has_vertex( $wchild );
 369                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 370                                         # It definitely changed here.
 371                                         $nofollow{$wchild} = 1;
 372                                 } elsif( !($contig->{$wchild}) ) {
 373                                         # The child is a hypothetical node not definitely in
 374                                         # any group. Answer is unknown.
 375                                         $unknownfollow{$wchild} = 1;
 376                                 } # else it's a non-root node in a known group, and therefore
 377                                   # is presumed to have its reading from its group, not this link.
 378                         }
 379                 }
 380                 $rdghash->{'not_followed'} = keys %nofollow;
 381                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 382
 383                 # Now say whether this reading represents a conflict.
 384                 unless( $variant_row->{'genealogical'} ) {
 385                         $rdghash->{'conflict'} = @roots != 1;
 386                 }
 387     }
 388 }
 389
 390
 391 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 392
 393 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 394 to be correct.  Serves as a backup for the real solver.
 395
 396 Runs an analysis of the given tradition, at the location given in $rank,
 397 against the graph of the stemma specified in $stemma_id.  The argument
 398 @merge_relationship_types is an optional list of relationship types for
 399 which readings so related should be treated as equivalent.
 400
 401 Returns a nested array data structure as follows:
 402
 403  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 404
 405 where the group list is the array of arrays passed in for each element of @groups,
 406 possibly with the addition of hypothetical readings.
 407
 408
 409 =cut
 410
 411 sub perl_solver {
 412         my( $graph, @groups ) = @_;
 413
 414         warn "Not implemented yet";
 415         return [];
 416 }
 417
 418         # Now do the work.
 419 #     my $contig = {};
 420 #     my $subgraph = {};
 421 #     my $is_conflicted;
 422 #     my $conflict = {};
 423 #     my %reading_roots;
 424 #     my $variant_row = { 'id' => $rank, 'readings' => [] };
 425 #     # Mark each ms as in its own group, first.
 426 #     foreach my $g ( @$groups ) {
 427 #         my $gst = wit_stringify( $g );
 428 #         map { $contig->{$_} = $gst } @$g;
 429 #     }
 430 #     # Now for each unmarked node in the graph, initialize an array
 431 #     # for possible group memberships.  We will use this later to
 432 #     # resolve potential conflicts.
 433 #     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 434 #     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 435 #         my $gst = wit_stringify( $g );  # This is the group name
 436 #         # Copy the graph, and delete all non-members from the new graph.
 437 #         my $part = $graph->copy;
 438 #         my @group_roots;
 439 #         $part->delete_vertices(
 440 #             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 441 #
 442 #         # Now look to see if our group is connected.
 443 #         if( $undirected ) { # For use with distance trees etc.
 444 #             # Find all vertices reachable from the first (arbitrary) group
 445 #             # member.  If we are genealogical this should include them all.
 446 #             my $reachable = {};
 447 #             map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
 448 #             # TODO This is a terrible way to do distance trees, since all
 449 #             # non-leaf nodes are included in every graph part now. We may
 450 #             # have to go back to SPDP.
 451 #         } else {
 452 #             if( @$g > 1 ) {
 453 #                 # We have to take directionality into account.
 454 #                 # How many root nodes do we have?
 455 #                 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 456 #                     $part->predecessorless_vertices;
 457 #                 # Assuming that @$g > 1, find the first root node that has at
 458 #                 # least one successor belonging to our group. If this reading
 459 #                 # is genealogical, there should be only one, but we will check
 460 #                 # that implicitly later.
 461 #                 foreach my $root ( @roots ) {
 462 #                     # Prune the tree to get rid of extraneous hypotheticals.
 463 #                     $root = _prune_subtree( $part, $root, $contig );
 464 #                     next unless $root;
 465 #                     # Save this root for our group.
 466 #                     push( @group_roots, $root );
 467 #                     # Get all the successor nodes of our root.
 468 #                 }
 469 #             } else {
 470 #               # Dispense with the trivial case of one reading.
 471 #               my $wit = pop @$g;
 472 #                 @group_roots = ( $wit );
 473 #                 foreach my $v ( $part->vertices ) {
 474 #                       $part->delete_vertex( $v ) unless $v eq $wit;
 475 #                 }
 476 #             }
 477 #         }
 478 #
 479 #         map { $reading_roots{$_} = 1 } @group_roots;
 480 #         if( @group_roots > 1 ) {
 481 #               $conflict->{$group_readings->{$gst}} = 1;
 482 #               $is_conflicted = 1;
 483 #         }
 484 #         # Paint the 'hypotheticals' with our group.
 485 #               foreach my $wit ( $part->vertices ) {
 486 #                       if( ref( $contig->{$wit} ) ) {
 487 #                               push( @{$contig->{$wit}}, $gst );
 488 #                       } elsif( $contig->{$wit} ne $gst ) {
 489 #                               warn "How did we get here?";
 490 #                       }
 491 #               }
 492 #
 493 #
 494 #         # Start to write the reading, and save the group subgraph.
 495 #         my $reading = { 'readingid' => $group_readings->{$gst},
 496 #                         'missing' => wit_stringify( \@lacunose ),
 497 #                         'group' => $gst };  # This will change if we find no conflict
 498 #               # Save the relevant subgraph.
 499 #               $subgraph->{$gst} = $part;
 500 #         push( @{$variant_row->{'readings'}}, $reading );
 501 #     }
 502 #
 503 #       # For each of our hypothetical readings, flatten its 'contig' array if
 504 #       # the array contains zero or one group.  If we have any unflattened arrays,
 505 #       # we may need to run the resolution process. If the reading is already known
 506 #       # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 507 #       # it.
 508 #       my @resolve;
 509 #       foreach my $wit ( keys %$contig ) {
 510 #               next unless ref( $contig->{$wit} );
 511 #               if( @{$contig->{$wit}} > 1 ) {
 512 #                       if( $is_conflicted ) {
 513 #                               $contig->{$wit} = '';  # We aren't going to decide.
 514 #                       } else {
 515 #                               push( @resolve, $wit );
 516 #                       }
 517 #               } else {
 518 #                       my $gst = pop @{$contig->{$wit}};
 519 #                       $contig->{$wit} = $gst || '';
 520 #               }
 521 #       }
 522 #
 523 #     if( @resolve ) {
 524 #         my $still_contig = {};
 525 #         foreach my $h ( @resolve ) {
 526 #             # For each of the hypothetical readings with more than one possibility,
 527 #             # try deleting it from each of its member subgraphs in turn, and see
 528 #             # if that breaks the contiguous grouping.
 529 #             # TODO This can still break in a corner case where group A can use
 530 #             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 531 #             # Revisit this if necessary; it could get brute-force nasty.
 532 #             foreach my $gst ( @{$contig->{$h}} ) {
 533 #                 my $gpart = $subgraph->{$gst}->copy();
 534 #                 # If we have come this far, there is only one root and everything
 535 #                 # is reachable from it.
 536 #                 my( $root ) = $gpart->predecessorless_vertices;
 537 #                 my $reachable = {};
 538 #                 map { $reachable->{$_} = 1 } $gpart->vertices;
 539 #
 540 #                 # Try deleting the hypothetical node.
 541 #                 $gpart->delete_vertex( $h );
 542 #                 if( $h eq $root ) {
 543 #                       # See if we still have a single root.
 544 #                       my @roots = $gpart->predecessorless_vertices;
 545 #                       warn "This shouldn't have happened" unless @roots;
 546 #                       if( @roots > 1 ) {
 547 #                               # $h is needed by this group.
 548 #                               if( exists( $still_contig->{$h} ) ) {
 549 #                                       # Conflict!
 550 #                                       $conflict->{$group_readings->{$gst}} = 1;
 551 #                                       $still_contig->{$h} = '';
 552 #                               } else {
 553 #                                       $still_contig->{$h} = $gst;
 554 #                               }
 555 #                       }
 556 #                 } else {
 557 #                       # $h is somewhere in the middle. See if everything
 558 #                       # else can still be reached from the root.
 559 #                                       my %still_reachable = ( $root => 1 );
 560 #                                       map { $still_reachable{$_} = 1 }
 561 #                                               $gpart->all_successors( $root );
 562 #                                       foreach my $v ( keys %$reachable ) {
 563 #                                               next if $v eq $h;
 564 #                                               if( !$still_reachable{$v}
 565 #                                                       && ( $contig->{$v} eq $gst
 566 #                                                                || ( exists $still_contig->{$v}
 567 #                                                                         && $still_contig->{$v} eq $gst ) ) ) {
 568 #                                                       # We need $h.
 569 #                                                       if( exists $still_contig->{$h} ) {
 570 #                                                               # Conflict!
 571 #                                                               $conflict->{$group_readings->{$gst}} = 1;
 572 #                                                               $still_contig->{$h} = '';
 573 #                                                       } else {
 574 #                                                               $still_contig->{$h} = $gst;
 575 #                                                       }
 576 #                                                       last;
 577 #                                               } # else we don't need $h in this group.
 578 #                                       } # end foreach $v
 579 #                               } # endif $h eq $root
 580 #             } # end foreach $gst
 581 #         } # end foreach $h
 582 #
 583 #         # Now we have some hypothetical vertices in $still_contig that are the
 584 #         # "real" group memberships.  Replace these in $contig.
 585 #               foreach my $v ( keys %$contig ) {
 586 #                       next unless ref $contig->{$v};
 587 #                       $contig->{$v} = $still_contig->{$v};
 588 #               }
 589 #     } # end if @resolve
 590 #
 591 #
 592 #     $variant_row->{'genealogical'} = !( keys %$conflict );
 593 #     return $variant_row;
 594 # }
 595
 596 sub _prune_subtree {
 597     my( $tree, $root, $contighash ) = @_;
 598     # First, delete hypothetical leaves / orphans until there are none left.
 599     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 600         $tree->successorless_vertices;
 601     while( @orphan_hypotheticals ) {
 602         $tree->delete_vertices( @orphan_hypotheticals );
 603         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 604             $tree->successorless_vertices;
 605     }
 606     # Then delete a hypothetical root with only one successor, moving the
 607     # root to the first child that has no other predecessors.
 608     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 609         my @nextroot = $tree->successors( $root );
 610         $tree->delete_vertex( $root );
 611         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 612     }
 613     # The tree has been modified in place, but we need to know the new root.
 614     $root = undef unless $root && $tree->has_vertex( $root );
 615     return $root;
 616 }
 617 # Add the variant, subject to a.c. representation logic.
 618 # This assumes that we will see the 'main' version before the a.c. version.
 619 sub add_variant_wit {
 620     my( $arr, $wit, $acstr ) = @_;
 621     my $skip;
 622     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 623         my $real = $1;
 624         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 625     }
 626     push( @$arr, $wit ) unless $skip;
 627 }
 628
 629 sub _useful_variant {
 630         my( $group_readings, $graph, $acstr ) = @_;
 631
 632         # TODO Decide what to do with AC witnesses
 633
 634         # Sort by group size and return
 635         my $is_useful = 0;
 636         my( @readings, @groups );   # The sorted groups for our answer.
 637         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 638                 keys %$group_readings ) {
 639                 push( @readings, $rdg );
 640                 push( @groups, $group_readings->{$rdg} );
 641                 if( @{$group_readings->{$rdg}} > 1 ) {
 642                         $is_useful++;
 643                 } else {
 644                         my( $wit ) = @{$group_readings->{$rdg}};
 645                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 646                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 647                 }
 648         }
 649         if( $is_useful > 1 ) {
 650                 return( \@readings, \@groups );
 651         } else {
 652                 return( [], [] );
 653         }
 654 }
 655
 656 =head2 wit_stringify( $groups )
 657
 658 Takes an array of witness groupings and produces a string like
 659 ['A','B'] / ['C','D','E'] / ['F']
 660
 661 =cut
 662
 663 sub wit_stringify {
 664     my $groups = shift;
 665     my @gst;
 666     # If we were passed an array of witnesses instead of an array of
 667     # groupings, then "group" the witnesses first.
 668     unless( ref( $groups->[0] ) ) {
 669         my $mkgrp = [ $groups ];
 670         $groups = $mkgrp;
 671     }
 672     foreach my $g ( @$groups ) {
 673         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 674     }
 675     return join( ' / ', @gst );
 676 }
 677
 678 # Helper function to ensure that X and X a.c. never appear in the same list.
 679 sub _add_to_witlist {
 680         my( $wit, $list, $acstr ) = @_;
 681         my %inlist;
 682         my $idx = 0;
 683         map { $inlist{$_} = $idx++ } @$list;
 684         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 685                 my $acwit = $1;
 686                 unless( exists $inlist{$acwit} ) {
 687                         push( @$list, $acwit.$acstr );
 688                 }
 689         } else {
 690                 if( exists( $inlist{$wit.$acstr} ) ) {
 691                         # Replace the a.c. version with the main witness
 692                         my $i = $inlist{$wit.$acstr};
 693                         $list->[$i] = $wit;
 694                 } else {
 695                         push( @$list, $wit );
 696                 }
 697         }
 698 }
 699
 700 sub _symmdiff {
 701         my( $lista, $listb ) = @_;
 702         my %union;
 703         my %scalars;
 704         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 705         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 706         my @set = grep { $union{$_} == 1 } keys %union;
 707         return map { $scalars{$_} } @set;
 708 }
 709
 710 1;
 711
 712 =head1 LICENSE
 713
 714 This package is free software and is provided "as is" without express
 715 or implied warranty.  You can redistribute it and/or modify it under
 716 the same terms as Perl itself.
 717
 718 =head1 AUTHOR
 719
 720 Tara L Andrews E<lt>aurum@cpan.orgE<gt>