lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 foreach my $row ( @{$data->{'variants'}} ) {
 108         # Account for rows that used to be "not useful"
 109         unless( exists $expected_genealogical{$row->{'id'}} ) {
 110                 $expected_genealogical{$row->{'id'}} = 1;
 111         }
 112         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 113                 "Got correct genealogical flag for row " . $row->{'id'} );
 114 }
 115 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 116 # TODO Make something meaningful of conflict count, maybe test other bits
 117
 118 =end testing
 119
 120 =cut
 121
 122 sub run_analysis {
 123         my( $tradition, %opts ) = @_;
 124         my $c = $tradition->collation;
 125
 126         my $stemma_id = $opts{'stemma_id'} || 0;
 127         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 128         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 129
 130         # Get the stemma
 131         my $stemma = $tradition->stemma( $stemma_id );
 132
 133         # Figure out which witnesses we are working with
 134         my @lacunose = $stemma->hypotheticals;
 135         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 136         map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
 137                 $tradition->witnesses;
 138         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 139
 140         # Find and mark 'common' ranks for exclusion, unless they were
 141         # explicitly specified.
 142         unless( @ranks ) {
 143                 my %common_rank;
 144                 foreach my $rdg ( $c->common_readings ) {
 145                         $common_rank{$rdg->rank} = 1;
 146                 }
 147                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 148         }
 149
 150         # Group the variants to send to the solver
 151         my @groups;
 152         my %lacunae;
 153         foreach my $rank ( @ranks ) {
 154                 my $missing = [ @lacunose ];
 155                 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
 156                 if( $opts{'exclude_type1'} ) {
 157                         # Check to see whether this is a "useful" group.
 158                         my( $rdgs, $grps ) = _useful_variant( $rankgroup );
 159                         next unless @$rdgs;
 160                 }
 161                 push( @groups, $rankgroup );
 162                 $lacunae{$rank} = $missing;
 163         }
 164         $DB::single = 1;
 165         # Parse the answer
 166         my $answer = solve_variants( $stemma, @groups );
 167
 168         # Do further analysis on the answer
 169         my $conflict_count = 0;
 170         foreach my $idx ( 0 .. $#ranks ) {
 171                 my $location = $answer->{'variants'}->[$idx];
 172                 # Add the rank back in
 173                 $location->{'id'} = $ranks[$idx];
 174                 # Add the lacunae back in
 175                 $location->{'missing'} = $lacunae{$ranks[$idx]};
 176                 # Run the extra analysis we need.
 177                 analyze_location( $tradition, $stemma->graph, $location );
 178                 # Add the reading text back in
 179                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 180                         $conflict_count++
 181                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 182                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 183                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 184                 }
 185         }
 186         $answer->{'conflict_count'} = $conflict_count;
 187
 188         return $answer;
 189 }
 190
 191 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 192
 193 Groups the variants at the given $rank of the collation, treating any
 194 relationships in @merge_relationship_types as equivalent.  $lacunose should
 195 be a reference to an array, to which the sigla of lacunose witnesses at this
 196 rank will be appended.
 197
 198 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 199 in $group_readings->{$rdg}.
 200
 201 =cut
 202
 203 # Return group_readings, groups, lacunose
 204 sub group_variants {
 205         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 206         my $c = $tradition->collation;
 207         my $aclabel =  $c->ac_label;
 208         # Get the alignment table readings
 209         my %readings_at_rank;
 210         my @gap_wits;
 211         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 212                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 213                 my $wit = $tablewit->{'witness'};
 214                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 215                         _add_to_witlist( $wit, $lacunose, $aclabel );
 216                 } elsif( $rdg ) {
 217                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 218                 } else {
 219                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 220                 }
 221         }
 222
 223         # Group the readings, collapsing groups by relationship if needed
 224         my %grouped_readings;
 225         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
 226                                                    values %readings_at_rank ) {
 227                 # Skip readings that have been collapsed into others.
 228                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 229                 my @wits = $rdg->witnesses;
 230                 if( $collapse ) {
 231                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 232                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 233                                 my @otherwits = $other->witnesses;
 234                                 push( @wits, @otherwits );
 235                                 $grouped_readings{$other->id} = 0;
 236                         }
 237                 }
 238                 $grouped_readings{$rdg->id} = \@wits;
 239         }
 240         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 241         # Get rid of our collapsed readings
 242         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 243                 keys %grouped_readings
 244                 if $collapse;
 245
 246         return \%grouped_readings;
 247 }
 248
 249 =head2 solve_variants( $graph, @groups )
 250
 251 Sends the set of groups to the external graph solver service and returns
 252 a cleaned-up answer, adding the rank IDs back where they belong.
 253
 254 The JSON has the form
 255   { "graph": [ stemmagraph DOT string without newlines ],
 256     "groupings": [ array of arrays of groups, one per rank ] }
 257
 258 The answer has the form
 259   { "variants" => [ array of variant location structures ],
 260     "variant_count" => total,
 261     "conflict_count" => number of conflicts detected,
 262     "genealogical_count" => number of solutions found }
 263
 264 =cut
 265
 266 sub solve_variants {
 267         my( $stemma, @groups ) = @_;
 268
 269         # Make the json with stemma + groups
 270         my $groupings = [];
 271         foreach my $ghash ( @groups ) {
 272                 my @grouping;
 273                 foreach my $k ( keys %$ghash ) {
 274                         push( @grouping, $ghash->{$k} );
 275                 }
 276                 push( @$groupings, \@grouping );
 277         }
 278         ## Witness map is a HACK to get around limitations in node names from IDP
 279         my $witness_map = {};
 280         my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
 281
 282         # Send it off and get the result
 283         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 284         my $ua = LWP::UserAgent->new();
 285         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 286                                                   'Content' => $json );
 287
 288         my $answer;
 289         my $used_idp;
 290         if( $resp->is_success ) {
 291                 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 292                 $used_idp = 1;
 293         } else {
 294                 # Fall back to the old method.
 295                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 296                         . "; falling back to perl method";
 297                 $answer = perl_solver( $stemma, @$groupings );
 298         }
 299
 300         # Fold the result back into what we know about the groups.
 301         my $variants = [];
 302         my $genealogical = 0;
 303         foreach my $idx ( 0 .. $#groups ) {
 304                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 305                 if( $result ) {
 306                         $genealogical++;
 307                         # Prune the calculated groups, in case the IDP solver failed to.
 308                         if( $used_idp ) {
 309                                 my @pruned_groups;
 310                                 foreach my $cg ( @$calc_groups ) {
 311                                         my @pg = _prune_group( $cg, $stemma );
 312                                         push( @pruned_groups, \@pg );
 313                                 }
 314                                 $calc_groups = \@pruned_groups;
 315                         }
 316                 }
 317                 my $input_group = $groups[$idx];
 318                 foreach my $k ( sort keys %$input_group ) {
 319                         my $cg = shift @$calc_groups;
 320                         $input_group->{$k} = $cg;
 321                 }
 322                 my $vstruct = {
 323                         'genealogical' => $result,
 324                         'readings' => [],
 325                 };
 326                 foreach my $k ( keys %$input_group ) {
 327                         push( @{$vstruct->{'readings'}},
 328                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 329                 }
 330                 push( @$variants, $vstruct );
 331         }
 332
 333         return { 'variants' => $variants,
 334                          'variant_count' => scalar @$variants,
 335                          'genealogical_count' => $genealogical };
 336 }
 337
 338 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 339
 340 sub _safe_wit_strings {
 341         my( $stemma, $groupings, $witness_map ) = @_;
 342         my $safegraph = Graph->new();
 343         # Convert the graph to a safe representation and store the conversion.
 344         foreach my $n ( $stemma->graph->vertices ) {
 345                 my $sn = _safe_witstr( $n );
 346                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 347                         if exists $witness_map->{$sn};
 348                 $witness_map->{$sn} = $n;
 349                 $safegraph->add_vertex( $sn );
 350                 $safegraph->set_vertex_attributes( $sn,
 351                         $stemma->graph->get_vertex_attributes( $n ) );
 352         }
 353         foreach my $e ( $stemma->graph->edges ) {
 354                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 355                 $safegraph->add_edge( @safe_e );
 356         }
 357         my $safe_stemma = Text::Tradition::Stemma->new(
 358                 'collation' => $stemma->collation, 'graph' => $safegraph );
 359
 360         # Now convert the witness groupings to a safe representation.
 361         my $safe_groupings = [];
 362         foreach my $grouping ( @$groupings ) {
 363                 my $safe_grouping = [];
 364                 foreach my $group ( @$grouping ) {
 365                         my $safe_group = [];
 366                         foreach my $n ( @$group ) {
 367                                 my $sn = _safe_witstr( $n );
 368                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 369                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 370                                 $witness_map->{$sn} = $n;
 371                                 push( @$safe_group, $sn );
 372                         }
 373                         push( @$safe_grouping, $safe_group );
 374                 }
 375                 push( @$safe_groupings, $safe_grouping );
 376         }
 377
 378         # Return it all in the struct we expect.  We have stored the reductions
 379         # in the $witness_map that we were passed.
 380         return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
 381 }
 382
 383 sub _safe_witstr {
 384         my $witstr = shift;
 385         $witstr =~ s/\s+/_/g;
 386         $witstr =~ s/[^\w\d-]//g;
 387         return $witstr;
 388 }
 389
 390 sub _desanitize_names {
 391         my( $jsonstruct, $witness_map ) = @_;
 392         my $result = [];
 393         foreach my $grouping ( @$jsonstruct ) {
 394                 my $real_grouping = [];
 395                 foreach my $element ( @$grouping ) {
 396                         if( ref( $element ) eq 'ARRAY' ) {
 397                                 # it's the groupset.
 398                                 my $real_groupset = [];
 399                                 foreach my $group ( @$element ) {
 400                                         my $real_group = [];
 401                                         foreach my $n ( @$group ) {
 402                                                 my $rn = $witness_map->{$n};
 403                                                 push( @$real_group, $rn );
 404                                         }
 405                                         push( @$real_groupset, $real_group );
 406                                 }
 407                                 push( @$real_grouping, $real_groupset );
 408                         } else {
 409                                 # It is the boolean, not actually a group.
 410                                 push( @$real_grouping, $element );
 411                         }
 412                 }
 413                 push( @$result, $real_grouping );
 414         }
 415         return $result;
 416 }
 417
 418 ### END HACKERY ###
 419
 420 =head2 analyze_location ( $tradition, $graph, $location_hash )
 421
 422 Given the tradition, its stemma graph, and the solution from the graph solver,
 423 work out the rest of the information we want.  For each reading we need missing,
 424 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 425
 426 =cut
 427
 428 sub analyze_location {
 429         my ( $tradition, $graph, $variant_row ) = @_;
 430
 431         # Make a hash of all known node memberships, and make the subgraphs.
 432         my $contig = {};
 433         my $reading_roots = {};
 434         my $subgraph = {};
 435     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 436         my $rid = $rdghash->{'readingid'};
 437                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 438
 439         # Make the subgraph.
 440         my $part = $graph->copy;
 441                 my %these_vertices;
 442                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 443                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 444                 $subgraph->{$rid} = $part;
 445                 # Get the reading roots.
 446                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 447         }
 448
 449         # Now that we have all the node group memberships, calculate followed/
 450     # non-followed/unknown values for each reading.  Also figure out the
 451     # reading's evident parent(s).
 452     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 453         # Group string key - TODO do we need this?
 454         my $gst = wit_stringify( $rdghash->{'group'} );
 455         my $rid = $rdghash->{'readingid'};
 456         # Get the subgraph
 457         my $part = $subgraph->{$rid};
 458
 459         # Start figuring things out.
 460         my @roots = $part->predecessorless_vertices;
 461         $rdghash->{'independent_occurrence'} = scalar @roots;
 462         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 463         # Find the parent readings, if any, of this reading.
 464         my %rdgparents;
 465         foreach my $wit ( @roots ) {
 466                 # Look in the main stemma to find this witness's extant or known-reading
 467                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 468                         my @check = $graph->predecessors( $wit );
 469                         while( @check ) {
 470                                 my @next;
 471                                 foreach my $wparent( @check ) {
 472                                         my $preading = $contig->{$wparent};
 473                                         if( $preading ) {
 474                                                 $rdgparents{$preading} = 1;
 475                                         } else {
 476                                                 push( @next, $graph->predecessors( $wparent ) );
 477                                         }
 478                                 }
 479                                 @check = @next;
 480                         }
 481                 }
 482                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 483
 484                 # Find the number of times this reading was altered, and the number of
 485                 # times we're not sure.
 486                 my( %nofollow, %unknownfollow );
 487                 foreach my $wit ( $part->vertices ) {
 488                         foreach my $wchild ( $graph->successors( $wit ) ) {
 489                                 next if $part->has_vertex( $wchild );
 490                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 491                                         # It definitely changed here.
 492                                         $nofollow{$wchild} = 1;
 493                                 } elsif( !($contig->{$wchild}) ) {
 494                                         # The child is a hypothetical node not definitely in
 495                                         # any group. Answer is unknown.
 496                                         $unknownfollow{$wchild} = 1;
 497                                 } # else it's a non-root node in a known group, and therefore
 498                                   # is presumed to have its reading from its group, not this link.
 499                         }
 500                 }
 501                 $rdghash->{'not_followed'} = keys %nofollow;
 502                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 503
 504                 # Now say whether this reading represents a conflict.
 505                 unless( $variant_row->{'genealogical'} ) {
 506                         $rdghash->{'conflict'} = @roots != 1;
 507                 }
 508     }
 509 }
 510
 511
 512 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 513
 514 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 515 to be correct.  Serves as a backup for the real solver.
 516
 517 Runs an analysis of the given tradition, at the location given in $rank,
 518 against the graph of the stemma specified in $stemma_id.  The argument
 519 @merge_relationship_types is an optional list of relationship types for
 520 which readings so related should be treated as equivalent.
 521
 522 Returns a nested array data structure as follows:
 523
 524  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 525
 526 where the group list is the array of arrays passed in for each element of @groups,
 527 possibly with the addition of hypothetical readings.
 528
 529
 530 =cut
 531
 532 sub perl_solver {
 533         my( $stemma, @groups ) = @_;
 534         my $graph = $stemma->graph;
 535         my @answer;
 536         foreach my $g ( @groups ) {
 537                 push( @answer, _solve_variant_location( $graph, $g ) );
 538         }
 539         return \@answer;
 540 }
 541
 542 sub _solve_variant_location {
 543         my( $graph, $groups ) = @_;
 544         # Now do the work.
 545     my $contig = {};
 546     my $subgraph = {};
 547     my $is_conflicted;
 548     my $conflict = {};
 549
 550     # Mark each ms as in its own group, first.
 551     foreach my $g ( @$groups ) {
 552         my $gst = wit_stringify( $g );
 553         map { $contig->{$_} = $gst } @$g;
 554     }
 555
 556     # Now for each unmarked node in the graph, initialize an array
 557     # for possible group memberships.  We will use this later to
 558     # resolve potential conflicts.
 559     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 560     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 561         my $gst = wit_stringify( $g );  # This is the group name
 562         # Copy the graph, and delete all non-members from the new graph.
 563         my $part = $graph->copy;
 564         my @group_roots;
 565         $part->delete_vertices(
 566             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 567
 568         # Now look to see if our group is connected.
 569                 if( @$g > 1 ) {
 570                         # We have to take directionality into account.
 571                         # How many root nodes do we have?
 572                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 573                                 $part->predecessorless_vertices;
 574                         # Assuming that @$g > 1, find the first root node that has at
 575                         # least one successor belonging to our group. If this reading
 576                         # is genealogical, there should be only one, but we will check
 577                         # that implicitly later.
 578                         foreach my $root ( @roots ) {
 579                                 # Prune the tree to get rid of extraneous hypotheticals.
 580                                 $root = _prune_subtree( $part, $root, $contig );
 581                                 next unless $root;
 582                                 # Save this root for our group.
 583                                 push( @group_roots, $root );
 584                                 # Get all the successor nodes of our root.
 585                         }
 586                 } else {
 587                         # Dispense with the trivial case of one reading.
 588                         my $wit = $g->[0];
 589                         @group_roots = ( $wit );
 590                         foreach my $v ( $part->vertices ) {
 591                                 $part->delete_vertex( $v ) unless $v eq $wit;
 592                         }
 593         }
 594
 595         if( @group_roots > 1 ) {
 596                 $conflict->{$gst} = 1;
 597                 $is_conflicted = 1;
 598         }
 599         # Paint the 'hypotheticals' with our group.
 600                 foreach my $wit ( $part->vertices ) {
 601                         if( ref( $contig->{$wit} ) ) {
 602                                 push( @{$contig->{$wit}}, $gst );
 603                         } elsif( $contig->{$wit} ne $gst ) {
 604                                 warn "How did we get here?";
 605                         }
 606                 }
 607
 608
 609                 # Save the relevant subgraph.
 610                 $subgraph->{$gst} = $part;
 611     }
 612
 613         # For each of our hypothetical readings, flatten its 'contig' array if
 614         # the array contains zero or one group.  If we have any unflattened arrays,
 615         # we may need to run the resolution process. If the reading is already known
 616         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 617         # it.
 618         my @resolve;
 619         foreach my $wit ( keys %$contig ) {
 620                 next unless ref( $contig->{$wit} );
 621                 if( @{$contig->{$wit}} > 1 ) {
 622                         if( $is_conflicted ) {
 623                                 $contig->{$wit} = '';  # We aren't going to decide.
 624                         } else {
 625                                 push( @resolve, $wit );
 626                         }
 627                 } else {
 628                         my $gst = pop @{$contig->{$wit}};
 629                         $contig->{$wit} = $gst || '';
 630                 }
 631         }
 632
 633     if( @resolve ) {
 634         my $still_contig = {};
 635         foreach my $h ( @resolve ) {
 636             # For each of the hypothetical readings with more than one possibility,
 637             # try deleting it from each of its member subgraphs in turn, and see
 638             # if that breaks the contiguous grouping.
 639             # TODO This can still break in a corner case where group A can use
 640             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 641             # Revisit this if necessary; it could get brute-force nasty.
 642             foreach my $gst ( @{$contig->{$h}} ) {
 643                 my $gpart = $subgraph->{$gst}->copy();
 644                 # If we have come this far, there is only one root and everything
 645                 # is reachable from it.
 646                 my( $root ) = $gpart->predecessorless_vertices;
 647                 my $reachable = {};
 648                 map { $reachable->{$_} = 1 } $gpart->vertices;
 649
 650                 # Try deleting the hypothetical node.
 651                 $gpart->delete_vertex( $h );
 652                 if( $h eq $root ) {
 653                         # See if we still have a single root.
 654                         my @roots = $gpart->predecessorless_vertices;
 655                         warn "This shouldn't have happened" unless @roots;
 656                         if( @roots > 1 ) {
 657                                 # $h is needed by this group.
 658                                 if( exists( $still_contig->{$h} ) ) {
 659                                         # Conflict!
 660                                         $conflict->{$gst} = 1;
 661                                         $still_contig->{$h} = '';
 662                                 } else {
 663                                         $still_contig->{$h} = $gst;
 664                                 }
 665                         }
 666                 } else {
 667                         # $h is somewhere in the middle. See if everything
 668                         # else can still be reached from the root.
 669                                         my %still_reachable = ( $root => 1 );
 670                                         map { $still_reachable{$_} = 1 }
 671                                                 $gpart->all_successors( $root );
 672                                         foreach my $v ( keys %$reachable ) {
 673                                                 next if $v eq $h;
 674                                                 if( !$still_reachable{$v}
 675                                                         && ( $contig->{$v} eq $gst
 676                                                                  || ( exists $still_contig->{$v}
 677                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 678                                                         # We need $h.
 679                                                         if( exists $still_contig->{$h} ) {
 680                                                                 # Conflict!
 681                                                                 $conflict->{$gst} = 1;
 682                                                                 $still_contig->{$h} = '';
 683                                                         } else {
 684                                                                 $still_contig->{$h} = $gst;
 685                                                         }
 686                                                         last;
 687                                                 } # else we don't need $h in this group.
 688                                         } # end foreach $v
 689                                 } # endif $h eq $root
 690             } # end foreach $gst
 691         } # end foreach $h
 692
 693         # Now we have some hypothetical vertices in $still_contig that are the
 694         # "real" group memberships.  Replace these in $contig.
 695                 foreach my $v ( keys %$contig ) {
 696                         next unless ref $contig->{$v};
 697                         $contig->{$v} = $still_contig->{$v};
 698                 }
 699     } # end if @resolve
 700
 701     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 702         my $variant_row = [ [], $is_genealogical ];
 703         # Fill in the groupings from $contig.
 704         foreach my $g ( @$groups ) {
 705         my $gst = wit_stringify( $g );
 706         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 707         push( @{$variant_row->[0]}, \@realgroup );
 708     }
 709     return $variant_row;
 710 }
 711
 712 sub _prune_group {
 713         my( $group, $stemma ) = @_;
 714         # Get these into a form prune_subtree will recognize. Make a "contighash"
 715         my $hypohash = {};
 716         map { $hypohash->{$_} = 1 } @$group;
 717         # ...with reference values for hypotheticals.
 718         map { $hypohash->{$_} = [] } $stemma->hypotheticals;
 719         # Make our subgraph
 720         my $subgraph = $stemma->graph->copy;
 721         map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
 722                 $subgraph->vertices;
 723         # ...and find the root.
 724         my( $root ) = $subgraph->predecessorless_vertices;
 725         # Now prune and return the remaining vertices.
 726         _prune_subtree( $subgraph, $root, $hypohash );
 727         return $subgraph->vertices;
 728 }
 729
 730 sub _prune_subtree {
 731     my( $tree, $root, $contighash ) = @_;
 732     # First, delete hypothetical leaves / orphans until there are none left.
 733     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 734         $tree->successorless_vertices;
 735     while( @orphan_hypotheticals ) {
 736         $tree->delete_vertices( @orphan_hypotheticals );
 737         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 738             $tree->successorless_vertices;
 739     }
 740     # Then delete a hypothetical root with only one successor, moving the
 741     # root to the first child that has no other predecessors.
 742     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 743         my @nextroot = $tree->successors( $root );
 744         $tree->delete_vertex( $root );
 745         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 746     }
 747     # The tree has been modified in place, but we need to know the new root.
 748     $root = undef unless $root && $tree->has_vertex( $root );
 749     return $root;
 750 }
 751 # Add the variant, subject to a.c. representation logic.
 752 # This assumes that we will see the 'main' version before the a.c. version.
 753 sub add_variant_wit {
 754     my( $arr, $wit, $acstr ) = @_;
 755     my $skip;
 756     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 757         my $real = $1;
 758         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 759     }
 760     push( @$arr, $wit ) unless $skip;
 761 }
 762
 763 sub _useful_variant {
 764         my( $group_readings, $graph, $acstr ) = @_;
 765
 766         # TODO Decide what to do with AC witnesses
 767
 768         # Sort by group size and return
 769         my $is_useful = 0;
 770         my( @readings, @groups );   # The sorted groups for our answer.
 771         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 772                 keys %$group_readings ) {
 773                 push( @readings, $rdg );
 774                 push( @groups, $group_readings->{$rdg} );
 775                 if( @{$group_readings->{$rdg}} > 1 ) {
 776                         $is_useful++;
 777                 } else {
 778                         my( $wit ) = @{$group_readings->{$rdg}};
 779                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 780                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 781                 }
 782         }
 783         if( $is_useful > 1 ) {
 784                 return( \@readings, \@groups );
 785         } else {
 786                 return( [], [] );
 787         }
 788 }
 789
 790 =head2 wit_stringify( $groups )
 791
 792 Takes an array of witness groupings and produces a string like
 793 ['A','B'] / ['C','D','E'] / ['F']
 794
 795 =cut
 796
 797 sub wit_stringify {
 798     my $groups = shift;
 799     my @gst;
 800     # If we were passed an array of witnesses instead of an array of
 801     # groupings, then "group" the witnesses first.
 802     unless( ref( $groups->[0] ) ) {
 803         my $mkgrp = [ $groups ];
 804         $groups = $mkgrp;
 805     }
 806     foreach my $g ( @$groups ) {
 807         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 808     }
 809     return join( ' / ', @gst );
 810 }
 811
 812 # Helper function to ensure that X and X a.c. never appear in the same list.
 813 sub _add_to_witlist {
 814         my( $wit, $list, $acstr ) = @_;
 815         my %inlist;
 816         my $idx = 0;
 817         map { $inlist{$_} = $idx++ } @$list;
 818         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 819                 my $acwit = $1;
 820                 unless( exists $inlist{$acwit} ) {
 821                         push( @$list, $acwit.$acstr );
 822                 }
 823         } else {
 824                 if( exists( $inlist{$wit.$acstr} ) ) {
 825                         # Replace the a.c. version with the main witness
 826                         my $i = $inlist{$wit.$acstr};
 827                         $list->[$i] = $wit;
 828                 } else {
 829                         push( @$list, $wit );
 830                 }
 831         }
 832 }
 833
 834 sub _symmdiff {
 835         my( $lista, $listb ) = @_;
 836         my %union;
 837         my %scalars;
 838         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 839         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 840         my @set = grep { $union{$_} == 1 } keys %union;
 841         return map { $scalars{$_} } @set;
 842 }
 843
 844 1;
 845
 846 =head1 LICENSE
 847
 848 This package is free software and is provided "as is" without express
 849 or implied warranty.  You can redistribute it and/or modify it under
 850 the same terms as Perl itself.
 851
 852 =head1 AUTHOR
 853
 854 Tara L Andrews E<lt>aurum@cpan.orgE<gt>