lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =back
  60
  61 =begin testing
  62
  63 use Text::Tradition;
  64 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  65
  66 my $datafile = 't/data/florilegium_tei_ps.xml';
  67 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  68                                       'name' => 'test0',
  69                                       'file' => $datafile );
  70 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  71 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  72
  73 my %expected_genealogical = (
  74         1 => 0,
  75         2 => 1,
  76         3 =>  0,
  77         5 =>  0,
  78         7 =>  0,
  79         8 =>  0,
  80         10 => 0,
  81         13 => 1,
  82         33 => 0,
  83         34 => 0,
  84         37 => 0,
  85         60 => 0,
  86         81 => 1,
  87         84 => 0,
  88         87 => 0,
  89         101 => 0,
  90         102 => 0,
  91         122 => 1,
  92         157 => 0,
  93         166 => 1,
  94         169 => 1,
  95         200 => 0,
  96         216 => 1,
  97         217 => 1,
  98         219 => 1,
  99         241 => 1,
 100         242 => 1,
 101         243 => 1,
 102 );
 103
 104 my $data = run_analysis( $tradition );
 105 foreach my $row ( @{$data->{'variants'}} ) {
 106         # Account for rows that used to be "not useful"
 107         unless( exists $expected_genealogical{$row->{'id'}} ) {
 108                 $expected_genealogical{$row->{'id'}} = 1;
 109         }
 110         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 111                 "Got correct genealogical flag for row " . $row->{'id'} );
 112 }
 113 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 114 # TODO Make something meaningful of conflict count, maybe test other bits
 115
 116 =end testing
 117
 118 =cut
 119
 120 sub run_analysis {
 121         my( $tradition, %opts ) = @_;
 122         my $c = $tradition->collation;
 123
 124         my $stemma_id = $opts{'stemma_id'} || 0;
 125         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 126         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 127
 128         # Get the stemma
 129         my $stemma = $tradition->stemma( $stemma_id );
 130
 131         # Figure out which witnesses we are working with
 132         my @lacunose = $stemma->hypotheticals;
 133         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 134         map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
 135                 $tradition->witnesses;
 136         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 137
 138         # Find and mark 'common' ranks for exclusion, unless they were
 139         # explicitly specified.
 140         unless( @ranks ) {
 141                 my %common_rank;
 142                 foreach my $rdg ( $c->common_readings ) {
 143                         $common_rank{$rdg->rank} = 1;
 144                 }
 145                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 146         }
 147
 148         # Group the variants to send to the solver
 149         my @groups;
 150         my %lacunae;
 151         foreach my $rank ( @ranks ) {
 152                 my $missing = [ @lacunose ];
 153                 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
 154                 $lacunae{$rank} = $missing;
 155         }
 156         $DB::single = 1;
 157         # Parse the answer
 158         my $answer = solve_variants( $stemma, @groups );
 159
 160         # Do further analysis on the answer
 161         my $conflict_count = 0;
 162         foreach my $idx ( 0 .. $#ranks ) {
 163                 my $location = $answer->{'variants'}->[$idx];
 164                 # Add the rank back in
 165                 $location->{'id'} = $ranks[$idx];
 166                 # Add the lacunae back in
 167                 $location->{'missing'} = $lacunae{$ranks[$idx]};
 168                 # Run the extra analysis we need.
 169                 analyze_location( $tradition, $stemma->graph, $location );
 170                 # Add the reading text back in
 171                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 172                         $conflict_count++
 173                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 174                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 175                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 176                 }
 177         }
 178         $answer->{'conflict_count'} = $conflict_count;
 179
 180         return $answer;
 181 }
 182
 183 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 184
 185 Groups the variants at the given $rank of the collation, treating any
 186 relationships in @merge_relationship_types as equivalent.  $lacunose should
 187 be a reference to an array, to which the sigla of lacunose witnesses at this
 188 rank will be appended.
 189
 190 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
 191 by the witnesses listed in $groups->[$n].
 192
 193 =cut
 194
 195 # Return group_readings, groups, lacunose
 196 sub group_variants {
 197         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 198         my $c = $tradition->collation;
 199         my $aclabel =  $c->ac_label;
 200         # Get the alignment table readings
 201         my %readings_at_rank;
 202         my @gap_wits;
 203         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 204                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 205                 my $wit = $tablewit->{'witness'};
 206                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 207                         _add_to_witlist( $wit, $lacunose, $aclabel );
 208                 } elsif( $rdg ) {
 209                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 210                 } else {
 211                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 212                 }
 213         }
 214
 215         # Group the readings, collapsing groups by relationship if needed
 216         my %grouped_readings;
 217         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
 218                                                    values %readings_at_rank ) {
 219                 # Skip readings that have been collapsed into others.
 220                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 221                 my @wits = $rdg->witnesses;
 222                 if( $collapse ) {
 223                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 224                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 225                                 my @otherwits = $other->witnesses;
 226                                 push( @wits, @otherwits );
 227                                 $grouped_readings{$other->id} = 0;
 228                         }
 229                 }
 230                 $grouped_readings{$rdg->id} = \@wits;
 231         }
 232         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 233         # Get rid of our collapsed readings
 234         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 235                 keys %grouped_readings
 236                 if $collapse;
 237
 238         return \%grouped_readings;
 239 }
 240
 241 =head2 solve_variants( $graph, @groups )
 242
 243 Sends the set of groups to the external graph solver service and returns
 244 a cleaned-up answer, adding the rank IDs back where they belong.
 245
 246 The JSON has the form
 247   { "graph": [ stemmagraph DOT string without newlines ],
 248     "groupings": [ array of arrays of groups, one per rank ] }
 249
 250 The answer has the form
 251   { "variants" => [ array of variant location structures ],
 252     "variant_count" => total,
 253     "conflict_count" => number of conflicts detected,
 254     "genealogical_count" => number of solutions found }
 255
 256 =cut
 257
 258 sub solve_variants {
 259         my( $stemma, @groups ) = @_;
 260
 261         # Make the json with stemma + groups
 262         my $groupings = [];
 263         foreach my $ghash ( @groups ) {
 264                 my @grouping;
 265                 foreach my $k ( keys %$ghash ) {
 266                         push( @grouping, $ghash->{$k} );
 267                 }
 268                 push( @$groupings, \@grouping );
 269         }
 270         ## Witness map is a HACK to get around limitations in node names from IDP
 271         my $witness_map = {};
 272         my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
 273
 274         # Send it off and get the result
 275         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 276         my $ua = LWP::UserAgent->new();
 277         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 278                                                   'Content' => $json );
 279
 280         my $answer;
 281         my $used_idp;
 282         if( $resp->is_success ) {
 283                 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 284                 $used_idp = 1;
 285         } else {
 286                 # Fall back to the old method.
 287                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 288                         . "; falling back to perl method";
 289                 $answer = perl_solver( $stemma, @$groupings );
 290         }
 291
 292         # Fold the result back into what we know about the groups.
 293         my $variants = [];
 294         my $genealogical = 0;
 295         foreach my $idx ( 0 .. $#groups ) {
 296                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 297                 if( $result ) {
 298                         $genealogical++;
 299                         # Prune the calculated groups, in case the IDP solver failed to.
 300                         if( $used_idp ) {
 301                                 my @pruned_groups;
 302                                 foreach my $cg ( @$calc_groups ) {
 303                                         my @pg = _prune_group( $cg, $stemma );
 304                                         push( @pruned_groups, \@pg );
 305                                 }
 306                                 $calc_groups = \@pruned_groups;
 307                         }
 308                 }
 309                 my $input_group = $groups[$idx];
 310                 foreach my $k ( sort keys %$input_group ) {
 311                         my $cg = shift @$calc_groups;
 312                         $input_group->{$k} = $cg;
 313                 }
 314                 my $vstruct = {
 315                         'genealogical' => $result,
 316                         'readings' => [],
 317                 };
 318                 foreach my $k ( keys %$input_group ) {
 319                         push( @{$vstruct->{'readings'}},
 320                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 321                 }
 322                 push( @$variants, $vstruct );
 323         }
 324
 325         return { 'variants' => $variants,
 326                          'variant_count' => scalar @$variants,
 327                          'genealogical_count' => $genealogical };
 328 }
 329
 330 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 331
 332 sub _safe_wit_strings {
 333         my( $stemma, $groupings, $witness_map ) = @_;
 334         my $safegraph = Graph->new();
 335         # Convert the graph to a safe representation and store the conversion.
 336         foreach my $n ( $stemma->graph->vertices ) {
 337                 my $sn = _safe_witstr( $n );
 338                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 339                         if exists $witness_map->{$sn};
 340                 $witness_map->{$sn} = $n;
 341                 $safegraph->add_vertex( $sn );
 342                 $safegraph->set_vertex_attributes( $sn,
 343                         $stemma->graph->get_vertex_attributes( $n ) );
 344         }
 345         foreach my $e ( $stemma->graph->edges ) {
 346                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 347                 $safegraph->add_edge( @safe_e );
 348         }
 349         my $safe_stemma = Text::Tradition::Stemma->new(
 350                 'collation' => $stemma->collation, 'graph' => $safegraph );
 351
 352         # Now convert the witness groupings to a safe representation.
 353         my $safe_groupings = [];
 354         foreach my $grouping ( @$groupings ) {
 355                 my $safe_grouping = [];
 356                 foreach my $group ( @$grouping ) {
 357                         my $safe_group = [];
 358                         foreach my $n ( @$group ) {
 359                                 my $sn = _safe_witstr( $n );
 360                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 361                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 362                                 $witness_map->{$sn} = $n;
 363                                 push( @$safe_group, $sn );
 364                         }
 365                         push( @$safe_grouping, $safe_group );
 366                 }
 367                 push( @$safe_groupings, $safe_grouping );
 368         }
 369
 370         # Return it all in the struct we expect.  We have stored the reductions
 371         # in the $witness_map that we were passed.
 372         return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
 373 }
 374
 375 sub _safe_witstr {
 376         my $witstr = shift;
 377         $witstr =~ s/\s+/_/g;
 378         $witstr =~ s/[^\w\d-]//g;
 379         return $witstr;
 380 }
 381
 382 sub _desanitize_names {
 383         my( $jsonstruct, $witness_map ) = @_;
 384         my $result = [];
 385         foreach my $grouping ( @$jsonstruct ) {
 386                 my $real_grouping = [];
 387                 foreach my $element ( @$grouping ) {
 388                         if( ref( $element ) eq 'ARRAY' ) {
 389                                 # it's the groupset.
 390                                 my $real_groupset = [];
 391                                 foreach my $group ( @$element ) {
 392                                         my $real_group = [];
 393                                         foreach my $n ( @$group ) {
 394                                                 my $rn = $witness_map->{$n};
 395                                                 push( @$real_group, $rn );
 396                                         }
 397                                         push( @$real_groupset, $real_group );
 398                                 }
 399                                 push( @$real_grouping, $real_groupset );
 400                         } else {
 401                                 # It is the boolean, not actually a group.
 402                                 push( @$real_grouping, $element );
 403                         }
 404                 }
 405                 push( @$result, $real_grouping );
 406         }
 407         return $result;
 408 }
 409
 410 ### END HACKERY ###
 411
 412 =head2 analyze_location ( $tradition, $graph, $location_hash )
 413
 414 Given the tradition, its stemma graph, and the solution from the graph solver,
 415 work out the rest of the information we want.  For each reading we need missing,
 416 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 417
 418 =cut
 419
 420 sub analyze_location {
 421         my ( $tradition, $graph, $variant_row ) = @_;
 422
 423         # Make a hash of all known node memberships, and make the subgraphs.
 424         my $contig = {};
 425         my $reading_roots = {};
 426         my $subgraph = {};
 427     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 428         my $rid = $rdghash->{'readingid'};
 429                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 430
 431         # Make the subgraph.
 432         my $part = $graph->copy;
 433                 my %these_vertices;
 434                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 435                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 436                 $subgraph->{$rid} = $part;
 437                 # Get the reading roots.
 438                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 439         }
 440
 441         # Now that we have all the node group memberships, calculate followed/
 442     # non-followed/unknown values for each reading.  Also figure out the
 443     # reading's evident parent(s).
 444     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 445         # Group string key - TODO do we need this?
 446         my $gst = wit_stringify( $rdghash->{'group'} );
 447         my $rid = $rdghash->{'readingid'};
 448         # Get the subgraph
 449         my $part = $subgraph->{$rid};
 450
 451         # Start figuring things out.
 452         my @roots = $part->predecessorless_vertices;
 453         $rdghash->{'independent_occurrence'} = scalar @roots;
 454         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 455         # Find the parent readings, if any, of this reading.
 456         my %rdgparents;
 457         foreach my $wit ( @roots ) {
 458                 # Look in the main stemma to find this witness's extant or known-reading
 459                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 460                         my @check = $graph->predecessors( $wit );
 461                         while( @check ) {
 462                                 my @next;
 463                                 foreach my $wparent( @check ) {
 464                                         my $preading = $contig->{$wparent};
 465                                         if( $preading ) {
 466                                                 $rdgparents{$preading} = 1;
 467                                         } else {
 468                                                 push( @next, $graph->predecessors( $wparent ) );
 469                                         }
 470                                 }
 471                                 @check = @next;
 472                         }
 473                 }
 474                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 475
 476                 # Find the number of times this reading was altered, and the number of
 477                 # times we're not sure.
 478                 my( %nofollow, %unknownfollow );
 479                 foreach my $wit ( $part->vertices ) {
 480                         foreach my $wchild ( $graph->successors( $wit ) ) {
 481                                 next if $part->has_vertex( $wchild );
 482                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 483                                         # It definitely changed here.
 484                                         $nofollow{$wchild} = 1;
 485                                 } elsif( !($contig->{$wchild}) ) {
 486                                         # The child is a hypothetical node not definitely in
 487                                         # any group. Answer is unknown.
 488                                         $unknownfollow{$wchild} = 1;
 489                                 } # else it's a non-root node in a known group, and therefore
 490                                   # is presumed to have its reading from its group, not this link.
 491                         }
 492                 }
 493                 $rdghash->{'not_followed'} = keys %nofollow;
 494                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 495
 496                 # Now say whether this reading represents a conflict.
 497                 unless( $variant_row->{'genealogical'} ) {
 498                         $rdghash->{'conflict'} = @roots != 1;
 499                 }
 500     }
 501 }
 502
 503
 504 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 505
 506 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 507 to be correct.  Serves as a backup for the real solver.
 508
 509 Runs an analysis of the given tradition, at the location given in $rank,
 510 against the graph of the stemma specified in $stemma_id.  The argument
 511 @merge_relationship_types is an optional list of relationship types for
 512 which readings so related should be treated as equivalent.
 513
 514 Returns a nested array data structure as follows:
 515
 516  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 517
 518 where the group list is the array of arrays passed in for each element of @groups,
 519 possibly with the addition of hypothetical readings.
 520
 521
 522 =cut
 523
 524 sub perl_solver {
 525         my( $stemma, @groups ) = @_;
 526         my $graph = $stemma->graph;
 527         my @answer;
 528         foreach my $g ( @groups ) {
 529                 push( @answer, _solve_variant_location( $graph, $g ) );
 530         }
 531         return \@answer;
 532 }
 533
 534 sub _solve_variant_location {
 535         my( $graph, $groups ) = @_;
 536         # Now do the work.
 537     my $contig = {};
 538     my $subgraph = {};
 539     my $is_conflicted;
 540     my $conflict = {};
 541
 542     # Mark each ms as in its own group, first.
 543     foreach my $g ( @$groups ) {
 544         my $gst = wit_stringify( $g );
 545         map { $contig->{$_} = $gst } @$g;
 546     }
 547
 548     # Now for each unmarked node in the graph, initialize an array
 549     # for possible group memberships.  We will use this later to
 550     # resolve potential conflicts.
 551     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 552     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 553         my $gst = wit_stringify( $g );  # This is the group name
 554         # Copy the graph, and delete all non-members from the new graph.
 555         my $part = $graph->copy;
 556         my @group_roots;
 557         $part->delete_vertices(
 558             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 559
 560         # Now look to see if our group is connected.
 561                 if( @$g > 1 ) {
 562                         # We have to take directionality into account.
 563                         # How many root nodes do we have?
 564                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 565                                 $part->predecessorless_vertices;
 566                         # Assuming that @$g > 1, find the first root node that has at
 567                         # least one successor belonging to our group. If this reading
 568                         # is genealogical, there should be only one, but we will check
 569                         # that implicitly later.
 570                         foreach my $root ( @roots ) {
 571                                 # Prune the tree to get rid of extraneous hypotheticals.
 572                                 $root = _prune_subtree( $part, $root, $contig );
 573                                 next unless $root;
 574                                 # Save this root for our group.
 575                                 push( @group_roots, $root );
 576                                 # Get all the successor nodes of our root.
 577                         }
 578                 } else {
 579                         # Dispense with the trivial case of one reading.
 580                         my $wit = $g->[0];
 581                         @group_roots = ( $wit );
 582                         foreach my $v ( $part->vertices ) {
 583                                 $part->delete_vertex( $v ) unless $v eq $wit;
 584                         }
 585         }
 586
 587         if( @group_roots > 1 ) {
 588                 $conflict->{$gst} = 1;
 589                 $is_conflicted = 1;
 590         }
 591         # Paint the 'hypotheticals' with our group.
 592                 foreach my $wit ( $part->vertices ) {
 593                         if( ref( $contig->{$wit} ) ) {
 594                                 push( @{$contig->{$wit}}, $gst );
 595                         } elsif( $contig->{$wit} ne $gst ) {
 596                                 warn "How did we get here?";
 597                         }
 598                 }
 599
 600
 601                 # Save the relevant subgraph.
 602                 $subgraph->{$gst} = $part;
 603     }
 604
 605         # For each of our hypothetical readings, flatten its 'contig' array if
 606         # the array contains zero or one group.  If we have any unflattened arrays,
 607         # we may need to run the resolution process. If the reading is already known
 608         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 609         # it.
 610         my @resolve;
 611         foreach my $wit ( keys %$contig ) {
 612                 next unless ref( $contig->{$wit} );
 613                 if( @{$contig->{$wit}} > 1 ) {
 614                         if( $is_conflicted ) {
 615                                 $contig->{$wit} = '';  # We aren't going to decide.
 616                         } else {
 617                                 push( @resolve, $wit );
 618                         }
 619                 } else {
 620                         my $gst = pop @{$contig->{$wit}};
 621                         $contig->{$wit} = $gst || '';
 622                 }
 623         }
 624
 625     if( @resolve ) {
 626         my $still_contig = {};
 627         foreach my $h ( @resolve ) {
 628             # For each of the hypothetical readings with more than one possibility,
 629             # try deleting it from each of its member subgraphs in turn, and see
 630             # if that breaks the contiguous grouping.
 631             # TODO This can still break in a corner case where group A can use
 632             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 633             # Revisit this if necessary; it could get brute-force nasty.
 634             foreach my $gst ( @{$contig->{$h}} ) {
 635                 my $gpart = $subgraph->{$gst}->copy();
 636                 # If we have come this far, there is only one root and everything
 637                 # is reachable from it.
 638                 my( $root ) = $gpart->predecessorless_vertices;
 639                 my $reachable = {};
 640                 map { $reachable->{$_} = 1 } $gpart->vertices;
 641
 642                 # Try deleting the hypothetical node.
 643                 $gpart->delete_vertex( $h );
 644                 if( $h eq $root ) {
 645                         # See if we still have a single root.
 646                         my @roots = $gpart->predecessorless_vertices;
 647                         warn "This shouldn't have happened" unless @roots;
 648                         if( @roots > 1 ) {
 649                                 # $h is needed by this group.
 650                                 if( exists( $still_contig->{$h} ) ) {
 651                                         # Conflict!
 652                                         $conflict->{$gst} = 1;
 653                                         $still_contig->{$h} = '';
 654                                 } else {
 655                                         $still_contig->{$h} = $gst;
 656                                 }
 657                         }
 658                 } else {
 659                         # $h is somewhere in the middle. See if everything
 660                         # else can still be reached from the root.
 661                                         my %still_reachable = ( $root => 1 );
 662                                         map { $still_reachable{$_} = 1 }
 663                                                 $gpart->all_successors( $root );
 664                                         foreach my $v ( keys %$reachable ) {
 665                                                 next if $v eq $h;
 666                                                 if( !$still_reachable{$v}
 667                                                         && ( $contig->{$v} eq $gst
 668                                                                  || ( exists $still_contig->{$v}
 669                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 670                                                         # We need $h.
 671                                                         if( exists $still_contig->{$h} ) {
 672                                                                 # Conflict!
 673                                                                 $conflict->{$gst} = 1;
 674                                                                 $still_contig->{$h} = '';
 675                                                         } else {
 676                                                                 $still_contig->{$h} = $gst;
 677                                                         }
 678                                                         last;
 679                                                 } # else we don't need $h in this group.
 680                                         } # end foreach $v
 681                                 } # endif $h eq $root
 682             } # end foreach $gst
 683         } # end foreach $h
 684
 685         # Now we have some hypothetical vertices in $still_contig that are the
 686         # "real" group memberships.  Replace these in $contig.
 687                 foreach my $v ( keys %$contig ) {
 688                         next unless ref $contig->{$v};
 689                         $contig->{$v} = $still_contig->{$v};
 690                 }
 691     } # end if @resolve
 692
 693     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 694         my $variant_row = [ [], $is_genealogical ];
 695         # Fill in the groupings from $contig.
 696         foreach my $g ( @$groups ) {
 697         my $gst = wit_stringify( $g );
 698         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 699         push( @{$variant_row->[0]}, \@realgroup );
 700     }
 701     return $variant_row;
 702 }
 703
 704 sub _prune_group {
 705         my( $group, $stemma ) = @_;
 706         # Get these into a form prune_subtree will recognize. Make a "contighash"
 707         my $hypohash = {};
 708         map { $hypohash->{$_} = 1 } @$group;
 709         # ...with reference values for hypotheticals.
 710         map { $hypohash->{$_} = [] } $stemma->hypotheticals;
 711         # Make our subgraph
 712         my $subgraph = $stemma->graph->copy;
 713         map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
 714                 $subgraph->vertices;
 715         # ...and find the root.
 716         my( $root ) = $subgraph->predecessorless_vertices;
 717         # Now prune and return the remaining vertices.
 718         _prune_subtree( $subgraph, $root, $hypohash );
 719         return $subgraph->vertices;
 720 }
 721
 722 sub _prune_subtree {
 723     my( $tree, $root, $contighash ) = @_;
 724     # First, delete hypothetical leaves / orphans until there are none left.
 725     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 726         $tree->successorless_vertices;
 727     while( @orphan_hypotheticals ) {
 728         $tree->delete_vertices( @orphan_hypotheticals );
 729         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 730             $tree->successorless_vertices;
 731     }
 732     # Then delete a hypothetical root with only one successor, moving the
 733     # root to the first child that has no other predecessors.
 734     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 735         my @nextroot = $tree->successors( $root );
 736         $tree->delete_vertex( $root );
 737         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 738     }
 739     # The tree has been modified in place, but we need to know the new root.
 740     $root = undef unless $root && $tree->has_vertex( $root );
 741     return $root;
 742 }
 743 # Add the variant, subject to a.c. representation logic.
 744 # This assumes that we will see the 'main' version before the a.c. version.
 745 sub add_variant_wit {
 746     my( $arr, $wit, $acstr ) = @_;
 747     my $skip;
 748     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 749         my $real = $1;
 750         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 751     }
 752     push( @$arr, $wit ) unless $skip;
 753 }
 754
 755 sub _useful_variant {
 756         my( $group_readings, $graph, $acstr ) = @_;
 757
 758         # TODO Decide what to do with AC witnesses
 759
 760         # Sort by group size and return
 761         my $is_useful = 0;
 762         my( @readings, @groups );   # The sorted groups for our answer.
 763         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 764                 keys %$group_readings ) {
 765                 push( @readings, $rdg );
 766                 push( @groups, $group_readings->{$rdg} );
 767                 if( @{$group_readings->{$rdg}} > 1 ) {
 768                         $is_useful++;
 769                 } else {
 770                         my( $wit ) = @{$group_readings->{$rdg}};
 771                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 772                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 773                 }
 774         }
 775         if( $is_useful > 1 ) {
 776                 return( \@readings, \@groups );
 777         } else {
 778                 return( [], [] );
 779         }
 780 }
 781
 782 =head2 wit_stringify( $groups )
 783
 784 Takes an array of witness groupings and produces a string like
 785 ['A','B'] / ['C','D','E'] / ['F']
 786
 787 =cut
 788
 789 sub wit_stringify {
 790     my $groups = shift;
 791     my @gst;
 792     # If we were passed an array of witnesses instead of an array of
 793     # groupings, then "group" the witnesses first.
 794     unless( ref( $groups->[0] ) ) {
 795         my $mkgrp = [ $groups ];
 796         $groups = $mkgrp;
 797     }
 798     foreach my $g ( @$groups ) {
 799         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 800     }
 801     return join( ' / ', @gst );
 802 }
 803
 804 # Helper function to ensure that X and X a.c. never appear in the same list.
 805 sub _add_to_witlist {
 806         my( $wit, $list, $acstr ) = @_;
 807         my %inlist;
 808         my $idx = 0;
 809         map { $inlist{$_} = $idx++ } @$list;
 810         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 811                 my $acwit = $1;
 812                 unless( exists $inlist{$acwit} ) {
 813                         push( @$list, $acwit.$acstr );
 814                 }
 815         } else {
 816                 if( exists( $inlist{$wit.$acstr} ) ) {
 817                         # Replace the a.c. version with the main witness
 818                         my $i = $inlist{$wit.$acstr};
 819                         $list->[$i] = $wit;
 820                 } else {
 821                         push( @$list, $wit );
 822                 }
 823         }
 824 }
 825
 826 sub _symmdiff {
 827         my( $lista, $listb ) = @_;
 828         my %union;
 829         my %scalars;
 830         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 831         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 832         my @set = grep { $union{$_} == 1 } keys %union;
 833         return map { $scalars{$_} } @set;
 834 }
 835
 836 1;
 837
 838 =head1 LICENSE
 839
 840 This package is free software and is provided "as is" without express
 841 or implied warranty.  You can redistribute it and/or modify it under
 842 the same terms as Perl itself.
 843
 844 =head1 AUTHOR
 845
 846 Tara L Andrews E<lt>aurum@cpan.orgE<gt>