lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =back
  60
  61 =begin testing
  62
  63 use Text::Tradition;
  64 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  65
  66 my $datafile = 't/data/florilegium_tei_ps.xml';
  67 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  68                                       'name' => 'test0',
  69                                       'file' => $datafile );
  70 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  71 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  72
  73 my %expected_genealogical = (
  74         1 => 0,
  75         2 => 1,
  76         3 =>  0,
  77         5 =>  0,
  78         7 =>  0,
  79         8 =>  0,
  80         10 => 0,
  81         13 => 1,
  82         33 => 0,
  83         34 => 0,
  84         37 => 0,
  85         60 => 0,
  86         81 => 1,
  87         84 => 0,
  88         87 => 0,
  89         101 => 0,
  90         102 => 0,
  91         122 => 1,
  92         157 => 0,
  93         166 => 1,
  94         169 => 1,
  95         200 => 0,
  96         216 => 1,
  97         217 => 1,
  98         219 => 1,
  99         241 => 1,
 100         242 => 1,
 101         243 => 1,
 102 );
 103
 104 my $data = run_analysis( $tradition );
 105 foreach my $row ( @{$data->{'variants'}} ) {
 106         # Account for rows that used to be "not useful"
 107         unless( exists $expected_genealogical{$row->{'id'}} ) {
 108                 $expected_genealogical{$row->{'id'}} = 1;
 109         }
 110         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 111         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 112                 "Got correct genealogical flag for row " . $row->{'id'} );
 113 }
 114 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 115 # TODO Make something meaningful of conflict count, maybe test other bits
 116
 117 =end testing
 118
 119 =cut
 120
 121 sub run_analysis {
 122         my( $tradition, %opts ) = @_;
 123         my $c = $tradition->collation;
 124
 125         my $stemma_id = $opts{'stemma_id'} || 0;
 126         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 127         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 128
 129         # Get the stemma
 130         my $stemma = $tradition->stemma( $stemma_id );
 131
 132         # Figure out which witnesses we are working with
 133         my @lacunose = $stemma->hypotheticals;
 134         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 135         map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
 136                 $tradition->witnesses;
 137         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 138
 139         # Find and mark 'common' ranks for exclusion, unless they were
 140         # explicitly specified.
 141         unless( @ranks ) {
 142                 my %common_rank;
 143                 foreach my $rdg ( $c->common_readings ) {
 144                         $common_rank{$rdg->rank} = 1;
 145                 }
 146                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 147         }
 148
 149         # Group the variants to send to the solver
 150         my @groups;
 151         my %lacunae;
 152         foreach my $rank ( @ranks ) {
 153                 my $missing = [ @lacunose ];
 154                 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
 155                 $lacunae{$rank} = $missing;
 156         }
 157         $DB::single = 1;
 158         # Parse the answer
 159         my $answer = solve_variants( $stemma, @groups );
 160
 161         # Do further analysis on the answer
 162         my $conflict_count = 0;
 163         foreach my $idx ( 0 .. $#ranks ) {
 164                 my $location = $answer->{'variants'}->[$idx];
 165                 # Add the rank back in
 166                 $location->{'id'} = $ranks[$idx];
 167                 # Add the lacunae back in
 168                 $location->{'missing'} = $lacunae{$ranks[$idx]};
 169                 # Run the extra analysis we need.
 170                 analyze_location( $tradition, $stemma->graph, $location );
 171                 # Add the reading text back in
 172                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 173                         $conflict_count++
 174                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 175                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 176                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 177                 }
 178         }
 179         $answer->{'conflict_count'} = $conflict_count;
 180
 181         return $answer;
 182 }
 183
 184 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 185
 186 Groups the variants at the given $rank of the collation, treating any
 187 relationships in @merge_relationship_types as equivalent.  $lacunose should
 188 be a reference to an array, to which the sigla of lacunose witnesses at this
 189 rank will be appended.
 190
 191 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
 192 by the witnesses listed in $groups->[$n].
 193
 194 =cut
 195
 196 # Return group_readings, groups, lacunose
 197 sub group_variants {
 198         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 199         my $c = $tradition->collation;
 200         my $aclabel =  $c->ac_label;
 201         # Get the alignment table readings
 202         my %readings_at_rank;
 203         my @gap_wits;
 204         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 205                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 206                 my $wit = $tablewit->{'witness'};
 207                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 208                         _add_to_witlist( $wit, $lacunose, $aclabel );
 209                 } elsif( $rdg ) {
 210                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 211                 } else {
 212                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 213                 }
 214         }
 215
 216         # Group the readings, collapsing groups by relationship if needed
 217         my %grouped_readings;
 218         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
 219                                                    values %readings_at_rank ) {
 220                 # Skip readings that have been collapsed into others.
 221                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 222                 my @wits = $rdg->witnesses;
 223                 if( $collapse ) {
 224                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 225                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 226                                 my @otherwits = $other->witnesses;
 227                                 push( @wits, @otherwits );
 228                                 $grouped_readings{$other->id} = 0;
 229                         }
 230                 }
 231                 $grouped_readings{$rdg->id} = \@wits;
 232         }
 233         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 234         # Get rid of our collapsed readings
 235         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 236                 keys %grouped_readings
 237                 if $collapse;
 238
 239         return \%grouped_readings;
 240 }
 241
 242 =head2 solve_variants( $graph, @groups )
 243
 244 Sends the set of groups to the external graph solver service and returns
 245 a cleaned-up answer, adding the rank IDs back where they belong.
 246
 247 The JSON has the form
 248   { "graph": [ stemmagraph DOT string without newlines ],
 249     "groupings": [ array of arrays of groups, one per rank ] }
 250
 251 The answer has the form
 252   { "variants" => [ array of variant location structures ],
 253     "variant_count" => total,
 254     "conflict_count" => number of conflicts detected,
 255     "genealogical_count" => number of solutions found }
 256
 257 =cut
 258
 259 sub solve_variants {
 260         my( $stemma, @groups ) = @_;
 261
 262         # Make the json with stemma + groups
 263         my $groupings = [];
 264         foreach my $ghash ( @groups ) {
 265                 my @grouping;
 266                 foreach my $k ( sort keys %$ghash ) {
 267                         push( @grouping, $ghash->{$k} );
 268                 }
 269                 push( @$groupings, \@grouping );
 270         }
 271         ## Witness map is a HACK to get around limitations in node names from IDP
 272         my $witness_map = {};
 273         my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
 274
 275         # Send it off and get the result
 276         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 277         my $ua = LWP::UserAgent->new();
 278         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 279                                                   'Content' => $json );
 280
 281         my $answer;
 282         if( $resp->is_success ) {
 283                 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 284         } else {
 285                 # Fall back to the old method.
 286                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 287                         . "; falling back to perl method";
 288                 $answer = perl_solver( $stemma, @$groupings );
 289         }
 290
 291         # Fold the result back into what we know about the groups.
 292         my $variants = [];
 293         my $genealogical = 0;
 294         foreach my $idx ( 0 .. $#groups ) {
 295                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 296                 $genealogical++ if $result;
 297                 my $input_group = $groups[$idx];
 298                 foreach my $k ( sort keys %$input_group ) {
 299                         my $cg = shift @$calc_groups;
 300                         $input_group->{$k} = $cg;
 301                 }
 302                 my $vstruct = {
 303                         'genealogical' => $result,
 304                         'readings' => [],
 305                 };
 306                 foreach my $k ( keys %$input_group ) {
 307                         push( @{$vstruct->{'readings'}},
 308                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 309                 }
 310                 push( @$variants, $vstruct );
 311         }
 312
 313         return { 'variants' => $variants,
 314                          'variant_count' => scalar @$variants,
 315                          'genealogical_count' => $genealogical };
 316 }
 317
 318 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 319
 320 sub _safe_wit_strings {
 321         my( $stemma, $groupings, $witness_map ) = @_;
 322         my $safegraph = Graph->new();
 323         # Convert the graph to a safe representation and store the conversion.
 324         foreach my $n ( $stemma->graph->vertices ) {
 325                 my $sn = _safe_witstr( $n );
 326                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 327                         if exists $witness_map->{$sn};
 328                 $witness_map->{$sn} = $n;
 329                 $safegraph->add_vertex( $sn );
 330                 $safegraph->set_vertex_attributes( $sn,
 331                         $stemma->graph->get_vertex_attributes( $n ) );
 332         }
 333         foreach my $e ( $stemma->graph->edges ) {
 334                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 335                 $safegraph->add_edge( @safe_e );
 336         }
 337         my $safe_stemma = Text::Tradition::Stemma->new(
 338                 'collation' => $stemma->collation, 'graph' => $safegraph );
 339
 340         # Now convert the witness groupings to a safe representation.
 341         my $safe_groupings = [];
 342         foreach my $grouping ( @$groupings ) {
 343                 my $safe_grouping = [];
 344                 foreach my $group ( @$grouping ) {
 345                         my $safe_group = [];
 346                         foreach my $n ( @$group ) {
 347                                 my $sn = _safe_witstr( $n );
 348                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 349                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 350                                 $witness_map->{$sn} = $n;
 351                                 push( @$safe_group, $sn );
 352                         }
 353                         push( @$safe_grouping, $safe_group );
 354                 }
 355                 push( @$safe_groupings, $safe_grouping );
 356         }
 357
 358         # Return it all in the struct we expect.  We have stored the reductions
 359         # in the $witness_map that we were passed.
 360         return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
 361 }
 362
 363 sub _safe_witstr {
 364         my $witstr = shift;
 365         $witstr =~ s/\s+/_/g;
 366         $witstr =~ s/[^\w\d-]//g;
 367         return $witstr;
 368 }
 369
 370 sub _desanitize_names {
 371         my( $jsonstruct, $witness_map ) = @_;
 372         my $result = [];
 373         foreach my $grouping ( @$jsonstruct ) {
 374                 my $real_grouping = [];
 375                 foreach my $element ( @$grouping ) {
 376                         if( ref( $element ) eq 'ARRAY' ) {
 377                                 # it's the groupset.
 378                                 my $real_groupset = [];
 379                                 foreach my $group ( @$element ) {
 380                                         my $real_group = [];
 381                                         foreach my $n ( @$group ) {
 382                                                 my $rn = $witness_map->{$n};
 383                                                 push( @$real_group, $rn );
 384                                         }
 385                                         push( @$real_groupset, $real_group );
 386                                 }
 387                                 push( @$real_grouping, $real_groupset );
 388                         } else {
 389                                 # It is the boolean, not actually a group.
 390                                 push( @$real_grouping, $element );
 391                         }
 392                 }
 393                 push( @$result, $real_grouping );
 394         }
 395         return $result;
 396 }
 397
 398 ### END HACKERY ###
 399
 400 =head2 analyze_location ( $tradition, $graph, $location_hash )
 401
 402 Given the tradition, its stemma graph, and the solution from the graph solver,
 403 work out the rest of the information we want.  For each reading we need missing,
 404 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 405
 406 =cut
 407
 408 sub analyze_location {
 409         my ( $tradition, $graph, $variant_row ) = @_;
 410
 411         # Make a hash of all known node memberships, and make the subgraphs.
 412         my $contig = {};
 413         my $reading_roots = {};
 414         my $subgraph = {};
 415     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 416         my $rid = $rdghash->{'readingid'};
 417                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 418
 419         # Make the subgraph.
 420         my $part = $graph->copy;
 421                 my %these_vertices;
 422                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 423                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 424                 $subgraph->{$rid} = $part;
 425                 # Get the reading roots.
 426                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 427         }
 428
 429         # Now that we have all the node group memberships, calculate followed/
 430     # non-followed/unknown values for each reading.  Also figure out the
 431     # reading's evident parent(s).
 432     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 433         # Group string key - TODO do we need this?
 434         my $gst = wit_stringify( $rdghash->{'group'} );
 435         my $rid = $rdghash->{'readingid'};
 436         # Get the subgraph
 437         my $part = $subgraph->{$rid};
 438
 439         # Start figuring things out.
 440         my @roots = $part->predecessorless_vertices;
 441         $rdghash->{'independent_occurrence'} = scalar @roots;
 442         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 443         # Find the parent readings, if any, of this reading.
 444         my %rdgparents;
 445         foreach my $wit ( @roots ) {
 446                 # Look in the main stemma to find this witness's extant or known-reading
 447                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 448                         my @check = $graph->predecessors( $wit );
 449                         while( @check ) {
 450                                 my @next;
 451                                 foreach my $wparent( @check ) {
 452                                         my $preading = $contig->{$wparent};
 453                                         if( $preading ) {
 454                                                 $rdgparents{$preading} = 1;
 455                                         } else {
 456                                                 push( @next, $graph->predecessors( $wparent ) );
 457                                         }
 458                                 }
 459                                 @check = @next;
 460                         }
 461                 }
 462                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 463
 464                 # Find the number of times this reading was altered, and the number of
 465                 # times we're not sure.
 466                 my( %nofollow, %unknownfollow );
 467                 foreach my $wit ( $part->vertices ) {
 468                         foreach my $wchild ( $graph->successors( $wit ) ) {
 469                                 next if $part->has_vertex( $wchild );
 470                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 471                                         # It definitely changed here.
 472                                         $nofollow{$wchild} = 1;
 473                                 } elsif( !($contig->{$wchild}) ) {
 474                                         # The child is a hypothetical node not definitely in
 475                                         # any group. Answer is unknown.
 476                                         $unknownfollow{$wchild} = 1;
 477                                 } # else it's a non-root node in a known group, and therefore
 478                                   # is presumed to have its reading from its group, not this link.
 479                         }
 480                 }
 481                 $rdghash->{'not_followed'} = keys %nofollow;
 482                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 483
 484                 # Now say whether this reading represents a conflict.
 485                 unless( $variant_row->{'genealogical'} ) {
 486                         $rdghash->{'conflict'} = @roots != 1;
 487                 }
 488     }
 489 }
 490
 491
 492 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 493
 494 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 495 to be correct.  Serves as a backup for the real solver.
 496
 497 Runs an analysis of the given tradition, at the location given in $rank,
 498 against the graph of the stemma specified in $stemma_id.  The argument
 499 @merge_relationship_types is an optional list of relationship types for
 500 which readings so related should be treated as equivalent.
 501
 502 Returns a nested array data structure as follows:
 503
 504  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 505
 506 where the group list is the array of arrays passed in for each element of @groups,
 507 possibly with the addition of hypothetical readings.
 508
 509
 510 =cut
 511
 512 sub perl_solver {
 513         my( $stemma, @groups ) = @_;
 514         my $graph = $stemma->graph;
 515         my @answer;
 516         foreach my $g ( @groups ) {
 517                 push( @answer, _solve_variant_location( $graph, $g ) );
 518         }
 519         return \@answer;
 520 }
 521
 522 sub _solve_variant_location {
 523         my( $graph, $groups ) = @_;
 524         # Now do the work.
 525     my $contig = {};
 526     my $subgraph = {};
 527     my $is_conflicted;
 528     my $conflict = {};
 529
 530     # Mark each ms as in its own group, first.
 531     foreach my $g ( @$groups ) {
 532         my $gst = wit_stringify( $g );
 533         map { $contig->{$_} = $gst } @$g;
 534     }
 535
 536     # Now for each unmarked node in the graph, initialize an array
 537     # for possible group memberships.  We will use this later to
 538     # resolve potential conflicts.
 539     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 540     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 541         my $gst = wit_stringify( $g );  # This is the group name
 542         # Copy the graph, and delete all non-members from the new graph.
 543         my $part = $graph->copy;
 544         my @group_roots;
 545         $part->delete_vertices(
 546             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 547
 548         # Now look to see if our group is connected.
 549                 if( @$g > 1 ) {
 550                         # We have to take directionality into account.
 551                         # How many root nodes do we have?
 552                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 553                                 $part->predecessorless_vertices;
 554                         # Assuming that @$g > 1, find the first root node that has at
 555                         # least one successor belonging to our group. If this reading
 556                         # is genealogical, there should be only one, but we will check
 557                         # that implicitly later.
 558                         foreach my $root ( @roots ) {
 559                                 # Prune the tree to get rid of extraneous hypotheticals.
 560                                 $root = _prune_subtree( $part, $root, $contig );
 561                                 next unless $root;
 562                                 # Save this root for our group.
 563                                 push( @group_roots, $root );
 564                                 # Get all the successor nodes of our root.
 565                         }
 566                 } else {
 567                         # Dispense with the trivial case of one reading.
 568                         my $wit = $g->[0];
 569                         @group_roots = ( $wit );
 570                         foreach my $v ( $part->vertices ) {
 571                                 $part->delete_vertex( $v ) unless $v eq $wit;
 572                         }
 573         }
 574
 575         if( @group_roots > 1 ) {
 576                 $conflict->{$gst} = 1;
 577                 $is_conflicted = 1;
 578         }
 579         # Paint the 'hypotheticals' with our group.
 580                 foreach my $wit ( $part->vertices ) {
 581                         if( ref( $contig->{$wit} ) ) {
 582                                 push( @{$contig->{$wit}}, $gst );
 583                         } elsif( $contig->{$wit} ne $gst ) {
 584                                 warn "How did we get here?";
 585                         }
 586                 }
 587
 588
 589                 # Save the relevant subgraph.
 590                 $subgraph->{$gst} = $part;
 591     }
 592
 593         # For each of our hypothetical readings, flatten its 'contig' array if
 594         # the array contains zero or one group.  If we have any unflattened arrays,
 595         # we may need to run the resolution process. If the reading is already known
 596         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 597         # it.
 598         my @resolve;
 599         foreach my $wit ( keys %$contig ) {
 600                 next unless ref( $contig->{$wit} );
 601                 if( @{$contig->{$wit}} > 1 ) {
 602                         if( $is_conflicted ) {
 603                                 $contig->{$wit} = '';  # We aren't going to decide.
 604                         } else {
 605                                 push( @resolve, $wit );
 606                         }
 607                 } else {
 608                         my $gst = pop @{$contig->{$wit}};
 609                         $contig->{$wit} = $gst || '';
 610                 }
 611         }
 612
 613     if( @resolve ) {
 614         my $still_contig = {};
 615         foreach my $h ( @resolve ) {
 616             # For each of the hypothetical readings with more than one possibility,
 617             # try deleting it from each of its member subgraphs in turn, and see
 618             # if that breaks the contiguous grouping.
 619             # TODO This can still break in a corner case where group A can use
 620             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 621             # Revisit this if necessary; it could get brute-force nasty.
 622             foreach my $gst ( @{$contig->{$h}} ) {
 623                 my $gpart = $subgraph->{$gst}->copy();
 624                 # If we have come this far, there is only one root and everything
 625                 # is reachable from it.
 626                 my( $root ) = $gpart->predecessorless_vertices;
 627                 my $reachable = {};
 628                 map { $reachable->{$_} = 1 } $gpart->vertices;
 629
 630                 # Try deleting the hypothetical node.
 631                 $gpart->delete_vertex( $h );
 632                 if( $h eq $root ) {
 633                         # See if we still have a single root.
 634                         my @roots = $gpart->predecessorless_vertices;
 635                         warn "This shouldn't have happened" unless @roots;
 636                         if( @roots > 1 ) {
 637                                 # $h is needed by this group.
 638                                 if( exists( $still_contig->{$h} ) ) {
 639                                         # Conflict!
 640                                         $conflict->{$gst} = 1;
 641                                         $still_contig->{$h} = '';
 642                                 } else {
 643                                         $still_contig->{$h} = $gst;
 644                                 }
 645                         }
 646                 } else {
 647                         # $h is somewhere in the middle. See if everything
 648                         # else can still be reached from the root.
 649                                         my %still_reachable = ( $root => 1 );
 650                                         map { $still_reachable{$_} = 1 }
 651                                                 $gpart->all_successors( $root );
 652                                         foreach my $v ( keys %$reachable ) {
 653                                                 next if $v eq $h;
 654                                                 if( !$still_reachable{$v}
 655                                                         && ( $contig->{$v} eq $gst
 656                                                                  || ( exists $still_contig->{$v}
 657                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 658                                                         # We need $h.
 659                                                         if( exists $still_contig->{$h} ) {
 660                                                                 # Conflict!
 661                                                                 $conflict->{$gst} = 1;
 662                                                                 $still_contig->{$h} = '';
 663                                                         } else {
 664                                                                 $still_contig->{$h} = $gst;
 665                                                         }
 666                                                         last;
 667                                                 } # else we don't need $h in this group.
 668                                         } # end foreach $v
 669                                 } # endif $h eq $root
 670             } # end foreach $gst
 671         } # end foreach $h
 672
 673         # Now we have some hypothetical vertices in $still_contig that are the
 674         # "real" group memberships.  Replace these in $contig.
 675                 foreach my $v ( keys %$contig ) {
 676                         next unless ref $contig->{$v};
 677                         $contig->{$v} = $still_contig->{$v};
 678                 }
 679     } # end if @resolve
 680
 681     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 682         my $variant_row = [ [], $is_genealogical ];
 683         # Fill in the groupings from $contig.
 684         foreach my $g ( @$groups ) {
 685         my $gst = wit_stringify( $g );
 686         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 687         push( @{$variant_row->[0]}, \@realgroup );
 688     }
 689     return $variant_row;
 690 }
 691
 692 sub _prune_subtree {
 693     my( $tree, $root, $contighash ) = @_;
 694     # First, delete hypothetical leaves / orphans until there are none left.
 695     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 696         $tree->successorless_vertices;
 697     while( @orphan_hypotheticals ) {
 698         $tree->delete_vertices( @orphan_hypotheticals );
 699         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 700             $tree->successorless_vertices;
 701     }
 702     # Then delete a hypothetical root with only one successor, moving the
 703     # root to the first child that has no other predecessors.
 704     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 705         my @nextroot = $tree->successors( $root );
 706         $tree->delete_vertex( $root );
 707         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 708     }
 709     # The tree has been modified in place, but we need to know the new root.
 710     $root = undef unless $root && $tree->has_vertex( $root );
 711     return $root;
 712 }
 713 # Add the variant, subject to a.c. representation logic.
 714 # This assumes that we will see the 'main' version before the a.c. version.
 715 sub add_variant_wit {
 716     my( $arr, $wit, $acstr ) = @_;
 717     my $skip;
 718     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 719         my $real = $1;
 720         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 721     }
 722     push( @$arr, $wit ) unless $skip;
 723 }
 724
 725 sub _useful_variant {
 726         my( $group_readings, $graph, $acstr ) = @_;
 727
 728         # TODO Decide what to do with AC witnesses
 729
 730         # Sort by group size and return
 731         my $is_useful = 0;
 732         my( @readings, @groups );   # The sorted groups for our answer.
 733         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 734                 keys %$group_readings ) {
 735                 push( @readings, $rdg );
 736                 push( @groups, $group_readings->{$rdg} );
 737                 if( @{$group_readings->{$rdg}} > 1 ) {
 738                         $is_useful++;
 739                 } else {
 740                         my( $wit ) = @{$group_readings->{$rdg}};
 741                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 742                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 743                 }
 744         }
 745         if( $is_useful > 1 ) {
 746                 return( \@readings, \@groups );
 747         } else {
 748                 return( [], [] );
 749         }
 750 }
 751
 752 =head2 wit_stringify( $groups )
 753
 754 Takes an array of witness groupings and produces a string like
 755 ['A','B'] / ['C','D','E'] / ['F']
 756
 757 =cut
 758
 759 sub wit_stringify {
 760     my $groups = shift;
 761     my @gst;
 762     # If we were passed an array of witnesses instead of an array of
 763     # groupings, then "group" the witnesses first.
 764     unless( ref( $groups->[0] ) ) {
 765         my $mkgrp = [ $groups ];
 766         $groups = $mkgrp;
 767     }
 768     foreach my $g ( @$groups ) {
 769         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 770     }
 771     return join( ' / ', @gst );
 772 }
 773
 774 # Helper function to ensure that X and X a.c. never appear in the same list.
 775 sub _add_to_witlist {
 776         my( $wit, $list, $acstr ) = @_;
 777         my %inlist;
 778         my $idx = 0;
 779         map { $inlist{$_} = $idx++ } @$list;
 780         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 781                 my $acwit = $1;
 782                 unless( exists $inlist{$acwit} ) {
 783                         push( @$list, $acwit.$acstr );
 784                 }
 785         } else {
 786                 if( exists( $inlist{$wit.$acstr} ) ) {
 787                         # Replace the a.c. version with the main witness
 788                         my $i = $inlist{$wit.$acstr};
 789                         $list->[$i] = $wit;
 790                 } else {
 791                         push( @$list, $wit );
 792                 }
 793         }
 794 }
 795
 796 sub _symmdiff {
 797         my( $lista, $listb ) = @_;
 798         my %union;
 799         my %scalars;
 800         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 801         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 802         my @set = grep { $union{$_} == 1 } keys %union;
 803         return map { $scalars{$_} } @set;
 804 }
 805
 806 1;
 807
 808 =head1 LICENSE
 809
 810 This package is free software and is provided "as is" without express
 811 or implied warranty.  You can redistribute it and/or modify it under
 812 the same terms as Perl itself.
 813
 814 =head1 AUTHOR
 815
 816 Tara L Andrews E<lt>aurum@cpan.orgE<gt>