lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 foreach my $row ( @{$data->{'variants'}} ) {
 108         # Account for rows that used to be "not useful"
 109         unless( exists $expected_genealogical{$row->{'id'}} ) {
 110                 $expected_genealogical{$row->{'id'}} = 1;
 111         }
 112         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 113                 "Got correct genealogical flag for row " . $row->{'id'} );
 114 }
 115 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 116 # TODO Make something meaningful of conflict count, maybe test other bits
 117
 118 =end testing
 119
 120 =cut
 121
 122 sub run_analysis {
 123         my( $tradition, %opts ) = @_;
 124         my $c = $tradition->collation;
 125
 126         my $stemma_id = $opts{'stemma_id'} || 0;
 127         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 128         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 129
 130         # Get the stemma
 131         my $stemma = $tradition->stemma( $stemma_id );
 132
 133         # Figure out which witnesses we are working with
 134         my @lacunose = $stemma->hypotheticals;
 135         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 136         map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
 137                 $tradition->witnesses;
 138         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 139
 140         # Find and mark 'common' ranks for exclusion, unless they were
 141         # explicitly specified.
 142         unless( @ranks ) {
 143                 my %common_rank;
 144                 foreach my $rdg ( $c->common_readings ) {
 145                         $common_rank{$rdg->rank} = 1;
 146                 }
 147                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 148         }
 149
 150         # Group the variants to send to the solver
 151         my @groups;
 152         my %lacunae;
 153         foreach my $rank ( @ranks ) {
 154                 my $missing = [ @lacunose ];
 155                 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
 156                 if( $opts{'exclude_type1'} ) {
 157                         # Check to see whether this is a "useful" group.
 158                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 159                                 $stemma->graph, $c->ac_label );
 160                         next unless @$rdgs;
 161                 }
 162                 push( @groups, $rankgroup );
 163                 $lacunae{$rank} = $missing;
 164         }
 165         $DB::single = 1;
 166         # Parse the answer
 167         my $answer = solve_variants( $stemma, @groups );
 168
 169         # Do further analysis on the answer
 170         my $conflict_count = 0;
 171         foreach my $idx ( 0 .. $#ranks ) {
 172                 my $location = $answer->{'variants'}->[$idx];
 173                 # Add the rank back in
 174                 $location->{'id'} = $ranks[$idx];
 175                 # Add the lacunae back in
 176                 $location->{'missing'} = $lacunae{$ranks[$idx]};
 177                 # Run the extra analysis we need.
 178                 analyze_location( $tradition, $stemma->graph, $location );
 179                 # Add the reading text back in
 180                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 181                         $conflict_count++
 182                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 183                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 184                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 185                 }
 186         }
 187         $answer->{'conflict_count'} = $conflict_count;
 188
 189         return $answer;
 190 }
 191
 192 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 193
 194 Groups the variants at the given $rank of the collation, treating any
 195 relationships in @merge_relationship_types as equivalent.  $lacunose should
 196 be a reference to an array, to which the sigla of lacunose witnesses at this
 197 rank will be appended.
 198
 199 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 200 in $group_readings->{$rdg}.
 201
 202 =cut
 203
 204 # Return group_readings, groups, lacunose
 205 sub group_variants {
 206         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 207         my $c = $tradition->collation;
 208         my $aclabel =  $c->ac_label;
 209         # Get the alignment table readings
 210         my %readings_at_rank;
 211         my %is_lacunose; # lookup table for $lacunose
 212         map { $is_lacunose{$_} = 1 } @$lacunose;
 213         my @gap_wits;
 214         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 215                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 216                 my $wit = $tablewit->{'witness'};
 217                 # Exclude the witness if it is "lacunose" which if we got here
 218                 # means "not in the stemma".
 219                 next if $is_lacunose{$wit};
 220                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 221                         _add_to_witlist( $wit, $lacunose, $aclabel );
 222                 } elsif( $rdg ) {
 223                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 224                 } else {
 225                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 226                 }
 227         }
 228
 229         # Group the readings, collapsing groups by relationship if needed
 230         my %grouped_readings;
 231         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
 232                                                    values %readings_at_rank ) {
 233                 # Skip readings that have been collapsed into others.
 234                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 235                 my @wits = $rdg->witnesses;
 236                 if( $collapse ) {
 237                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 238                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 239                                 my @otherwits = $other->witnesses;
 240                                 push( @wits, @otherwits );
 241                                 $grouped_readings{$other->id} = 0;
 242                         }
 243                 }
 244                 my @use_wits = grep { !$is_lacunose{$_} } @wits;
 245                 $grouped_readings{$rdg->id} = \@use_wits;
 246         }
 247         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 248         # Get rid of our collapsed readings
 249         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 250                 keys %grouped_readings
 251                 if $collapse;
 252
 253         return \%grouped_readings;
 254 }
 255
 256 =head2 solve_variants( $graph, @groups )
 257
 258 Sends the set of groups to the external graph solver service and returns
 259 a cleaned-up answer, adding the rank IDs back where they belong.
 260
 261 The JSON has the form
 262   { "graph": [ stemmagraph DOT string without newlines ],
 263     "groupings": [ array of arrays of groups, one per rank ] }
 264
 265 The answer has the form
 266   { "variants" => [ array of variant location structures ],
 267     "variant_count" => total,
 268     "conflict_count" => number of conflicts detected,
 269     "genealogical_count" => number of solutions found }
 270
 271 =cut
 272
 273 sub solve_variants {
 274         my( $stemma, @groups ) = @_;
 275
 276         # Make the json with stemma + groups
 277         my $groupings = [];
 278         foreach my $ghash ( @groups ) {
 279                 my @grouping;
 280                 foreach my $k ( keys %$ghash ) {
 281                         push( @grouping, $ghash->{$k} );
 282                 }
 283                 push( @$groupings, \@grouping );
 284         }
 285         ## Witness map is a HACK to get around limitations in node names from IDP
 286         my $witness_map = {};
 287         my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
 288
 289         # Send it off and get the result
 290         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 291         my $ua = LWP::UserAgent->new();
 292         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 293                                                   'Content' => $json );
 294
 295         my $answer;
 296         my $used_idp;
 297         if( $resp->is_success ) {
 298                 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 299                 $used_idp = 1;
 300         } else {
 301                 # Fall back to the old method.
 302                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 303                         . "; falling back to perl method";
 304                 $answer = perl_solver( $stemma, @$groupings );
 305         }
 306
 307         # Fold the result back into what we know about the groups.
 308         my $variants = [];
 309         my $genealogical = 0;
 310         foreach my $idx ( 0 .. $#groups ) {
 311                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 312                 if( $result ) {
 313                         $genealogical++;
 314                         # Prune the calculated groups, in case the IDP solver failed to.
 315                         if( $used_idp ) {
 316                                 my @pruned_groups;
 317                                 foreach my $cg ( @$calc_groups ) {
 318                                         my @pg = _prune_group( $cg, $stemma );
 319                                         push( @pruned_groups, \@pg );
 320                                 }
 321                                 $calc_groups = \@pruned_groups;
 322                         }
 323                 }
 324                 my $input_group = $groups[$idx];
 325                 foreach my $k ( sort keys %$input_group ) {
 326                         my $cg = shift @$calc_groups;
 327                         $input_group->{$k} = $cg;
 328                 }
 329                 my $vstruct = {
 330                         'genealogical' => $result,
 331                         'readings' => [],
 332                 };
 333                 foreach my $k ( keys %$input_group ) {
 334                         push( @{$vstruct->{'readings'}},
 335                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 336                 }
 337                 push( @$variants, $vstruct );
 338         }
 339
 340         return { 'variants' => $variants,
 341                          'variant_count' => scalar @$variants,
 342                          'genealogical_count' => $genealogical };
 343 }
 344
 345 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 346
 347 sub _safe_wit_strings {
 348         my( $stemma, $groupings, $witness_map ) = @_;
 349         my $safegraph = Graph->new();
 350         # Convert the graph to a safe representation and store the conversion.
 351         foreach my $n ( $stemma->graph->vertices ) {
 352                 my $sn = _safe_witstr( $n );
 353                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 354                         if exists $witness_map->{$sn};
 355                 $witness_map->{$sn} = $n;
 356                 $safegraph->add_vertex( $sn );
 357                 $safegraph->set_vertex_attributes( $sn,
 358                         $stemma->graph->get_vertex_attributes( $n ) );
 359         }
 360         foreach my $e ( $stemma->graph->edges ) {
 361                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 362                 $safegraph->add_edge( @safe_e );
 363         }
 364         my $safe_stemma = Text::Tradition::Stemma->new(
 365                 'collation' => $stemma->collation, 'graph' => $safegraph );
 366
 367         # Now convert the witness groupings to a safe representation.
 368         my $safe_groupings = [];
 369         foreach my $grouping ( @$groupings ) {
 370                 my $safe_grouping = [];
 371                 foreach my $group ( @$grouping ) {
 372                         my $safe_group = [];
 373                         foreach my $n ( @$group ) {
 374                                 my $sn = _safe_witstr( $n );
 375                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 376                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 377                                 $witness_map->{$sn} = $n;
 378                                 push( @$safe_group, $sn );
 379                         }
 380                         push( @$safe_grouping, $safe_group );
 381                 }
 382                 push( @$safe_groupings, $safe_grouping );
 383         }
 384
 385         # Return it all in the struct we expect.  We have stored the reductions
 386         # in the $witness_map that we were passed.
 387         return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
 388 }
 389
 390 sub _safe_witstr {
 391         my $witstr = shift;
 392         $witstr =~ s/\s+/_/g;
 393         $witstr =~ s/[^\w\d-]//g;
 394         return $witstr;
 395 }
 396
 397 sub _desanitize_names {
 398         my( $jsonstruct, $witness_map ) = @_;
 399         my $result = [];
 400         foreach my $grouping ( @$jsonstruct ) {
 401                 my $real_grouping = [];
 402                 foreach my $element ( @$grouping ) {
 403                         if( ref( $element ) eq 'ARRAY' ) {
 404                                 # it's the groupset.
 405                                 my $real_groupset = [];
 406                                 foreach my $group ( @$element ) {
 407                                         my $real_group = [];
 408                                         foreach my $n ( @$group ) {
 409                                                 my $rn = $witness_map->{$n};
 410                                                 push( @$real_group, $rn );
 411                                         }
 412                                         push( @$real_groupset, $real_group );
 413                                 }
 414                                 push( @$real_grouping, $real_groupset );
 415                         } else {
 416                                 # It is the boolean, not actually a group.
 417                                 push( @$real_grouping, $element );
 418                         }
 419                 }
 420                 push( @$result, $real_grouping );
 421         }
 422         return $result;
 423 }
 424
 425 ### END HACKERY ###
 426
 427 =head2 analyze_location ( $tradition, $graph, $location_hash )
 428
 429 Given the tradition, its stemma graph, and the solution from the graph solver,
 430 work out the rest of the information we want.  For each reading we need missing,
 431 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 432
 433 =cut
 434
 435 sub analyze_location {
 436         my ( $tradition, $graph, $variant_row ) = @_;
 437
 438         # Make a hash of all known node memberships, and make the subgraphs.
 439         my $contig = {};
 440         my $reading_roots = {};
 441         my $subgraph = {};
 442     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 443         my $rid = $rdghash->{'readingid'};
 444                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 445
 446         # Make the subgraph.
 447         my $part = $graph->copy;
 448                 my %these_vertices;
 449                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 450                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 451                 $subgraph->{$rid} = $part;
 452                 # Get the reading roots.
 453                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 454         }
 455
 456         # Now that we have all the node group memberships, calculate followed/
 457     # non-followed/unknown values for each reading.  Also figure out the
 458     # reading's evident parent(s).
 459     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 460         # Group string key - TODO do we need this?
 461         my $gst = wit_stringify( $rdghash->{'group'} );
 462         my $rid = $rdghash->{'readingid'};
 463         # Get the subgraph
 464         my $part = $subgraph->{$rid};
 465
 466         # Start figuring things out.
 467         my @roots = $part->predecessorless_vertices;
 468         $rdghash->{'independent_occurrence'} = scalar @roots;
 469         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 470         # Find the parent readings, if any, of this reading.
 471         my %rdgparents;
 472         foreach my $wit ( @roots ) {
 473                 # Look in the main stemma to find this witness's extant or known-reading
 474                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 475                         my @check = $graph->predecessors( $wit );
 476                         while( @check ) {
 477                                 my @next;
 478                                 foreach my $wparent( @check ) {
 479                                         my $preading = $contig->{$wparent};
 480                                         if( $preading ) {
 481                                                 $rdgparents{$preading} = 1;
 482                                         } else {
 483                                                 push( @next, $graph->predecessors( $wparent ) );
 484                                         }
 485                                 }
 486                                 @check = @next;
 487                         }
 488                 }
 489                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 490
 491                 # Find the number of times this reading was altered, and the number of
 492                 # times we're not sure.
 493                 my( %nofollow, %unknownfollow );
 494                 foreach my $wit ( $part->vertices ) {
 495                         foreach my $wchild ( $graph->successors( $wit ) ) {
 496                                 next if $part->has_vertex( $wchild );
 497                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 498                                         # It definitely changed here.
 499                                         $nofollow{$wchild} = 1;
 500                                 } elsif( !($contig->{$wchild}) ) {
 501                                         # The child is a hypothetical node not definitely in
 502                                         # any group. Answer is unknown.
 503                                         $unknownfollow{$wchild} = 1;
 504                                 } # else it's a non-root node in a known group, and therefore
 505                                   # is presumed to have its reading from its group, not this link.
 506                         }
 507                 }
 508                 $rdghash->{'not_followed'} = keys %nofollow;
 509                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 510
 511                 # Now say whether this reading represents a conflict.
 512                 unless( $variant_row->{'genealogical'} ) {
 513                         $rdghash->{'conflict'} = @roots != 1;
 514                 }
 515     }
 516 }
 517
 518
 519 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 520
 521 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 522 to be correct.  Serves as a backup for the real solver.
 523
 524 Runs an analysis of the given tradition, at the location given in $rank,
 525 against the graph of the stemma specified in $stemma_id.  The argument
 526 @merge_relationship_types is an optional list of relationship types for
 527 which readings so related should be treated as equivalent.
 528
 529 Returns a nested array data structure as follows:
 530
 531  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 532
 533 where the group list is the array of arrays passed in for each element of @groups,
 534 possibly with the addition of hypothetical readings.
 535
 536
 537 =cut
 538
 539 sub perl_solver {
 540         my( $stemma, @groups ) = @_;
 541         my $graph = $stemma->graph;
 542         my @answer;
 543         foreach my $g ( @groups ) {
 544                 push( @answer, _solve_variant_location( $graph, $g ) );
 545         }
 546         return \@answer;
 547 }
 548
 549 sub _solve_variant_location {
 550         my( $graph, $groups ) = @_;
 551         # Now do the work.
 552     my $contig = {};
 553     my $subgraph = {};
 554     my $is_conflicted;
 555     my $conflict = {};
 556
 557     # Mark each ms as in its own group, first.
 558     foreach my $g ( @$groups ) {
 559         my $gst = wit_stringify( $g );
 560         map { $contig->{$_} = $gst } @$g;
 561     }
 562
 563     # Now for each unmarked node in the graph, initialize an array
 564     # for possible group memberships.  We will use this later to
 565     # resolve potential conflicts.
 566     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 567     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 568         my $gst = wit_stringify( $g );  # This is the group name
 569         # Copy the graph, and delete all non-members from the new graph.
 570         my $part = $graph->copy;
 571         my @group_roots;
 572         $part->delete_vertices(
 573             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 574
 575         # Now look to see if our group is connected.
 576                 if( @$g > 1 ) {
 577                         # We have to take directionality into account.
 578                         # How many root nodes do we have?
 579                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 580                                 $part->predecessorless_vertices;
 581                         # Assuming that @$g > 1, find the first root node that has at
 582                         # least one successor belonging to our group. If this reading
 583                         # is genealogical, there should be only one, but we will check
 584                         # that implicitly later.
 585                         foreach my $root ( @roots ) {
 586                                 # Prune the tree to get rid of extraneous hypotheticals.
 587                                 $root = _prune_subtree( $part, $root, $contig );
 588                                 next unless $root;
 589                                 # Save this root for our group.
 590                                 push( @group_roots, $root );
 591                                 # Get all the successor nodes of our root.
 592                         }
 593                 } else {
 594                         # Dispense with the trivial case of one reading.
 595                         my $wit = $g->[0];
 596                         @group_roots = ( $wit );
 597                         foreach my $v ( $part->vertices ) {
 598                                 $part->delete_vertex( $v ) unless $v eq $wit;
 599                         }
 600         }
 601
 602         if( @group_roots > 1 ) {
 603                 $conflict->{$gst} = 1;
 604                 $is_conflicted = 1;
 605         }
 606         # Paint the 'hypotheticals' with our group.
 607                 foreach my $wit ( $part->vertices ) {
 608                         if( ref( $contig->{$wit} ) ) {
 609                                 push( @{$contig->{$wit}}, $gst );
 610                         } elsif( $contig->{$wit} ne $gst ) {
 611                                 warn "How did we get here?";
 612                         }
 613                 }
 614
 615
 616                 # Save the relevant subgraph.
 617                 $subgraph->{$gst} = $part;
 618     }
 619
 620         # For each of our hypothetical readings, flatten its 'contig' array if
 621         # the array contains zero or one group.  If we have any unflattened arrays,
 622         # we may need to run the resolution process. If the reading is already known
 623         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 624         # it.
 625         my @resolve;
 626         foreach my $wit ( keys %$contig ) {
 627                 next unless ref( $contig->{$wit} );
 628                 if( @{$contig->{$wit}} > 1 ) {
 629                         if( $is_conflicted ) {
 630                                 $contig->{$wit} = '';  # We aren't going to decide.
 631                         } else {
 632                                 push( @resolve, $wit );
 633                         }
 634                 } else {
 635                         my $gst = pop @{$contig->{$wit}};
 636                         $contig->{$wit} = $gst || '';
 637                 }
 638         }
 639
 640     if( @resolve ) {
 641         my $still_contig = {};
 642         foreach my $h ( @resolve ) {
 643             # For each of the hypothetical readings with more than one possibility,
 644             # try deleting it from each of its member subgraphs in turn, and see
 645             # if that breaks the contiguous grouping.
 646             # TODO This can still break in a corner case where group A can use
 647             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 648             # Revisit this if necessary; it could get brute-force nasty.
 649             foreach my $gst ( @{$contig->{$h}} ) {
 650                 my $gpart = $subgraph->{$gst}->copy();
 651                 # If we have come this far, there is only one root and everything
 652                 # is reachable from it.
 653                 my( $root ) = $gpart->predecessorless_vertices;
 654                 my $reachable = {};
 655                 map { $reachable->{$_} = 1 } $gpart->vertices;
 656
 657                 # Try deleting the hypothetical node.
 658                 $gpart->delete_vertex( $h );
 659                 if( $h eq $root ) {
 660                         # See if we still have a single root.
 661                         my @roots = $gpart->predecessorless_vertices;
 662                         warn "This shouldn't have happened" unless @roots;
 663                         if( @roots > 1 ) {
 664                                 # $h is needed by this group.
 665                                 if( exists( $still_contig->{$h} ) ) {
 666                                         # Conflict!
 667                                         $conflict->{$gst} = 1;
 668                                         $still_contig->{$h} = '';
 669                                 } else {
 670                                         $still_contig->{$h} = $gst;
 671                                 }
 672                         }
 673                 } else {
 674                         # $h is somewhere in the middle. See if everything
 675                         # else can still be reached from the root.
 676                                         my %still_reachable = ( $root => 1 );
 677                                         map { $still_reachable{$_} = 1 }
 678                                                 $gpart->all_successors( $root );
 679                                         foreach my $v ( keys %$reachable ) {
 680                                                 next if $v eq $h;
 681                                                 if( !$still_reachable{$v}
 682                                                         && ( $contig->{$v} eq $gst
 683                                                                  || ( exists $still_contig->{$v}
 684                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 685                                                         # We need $h.
 686                                                         if( exists $still_contig->{$h} ) {
 687                                                                 # Conflict!
 688                                                                 $conflict->{$gst} = 1;
 689                                                                 $still_contig->{$h} = '';
 690                                                         } else {
 691                                                                 $still_contig->{$h} = $gst;
 692                                                         }
 693                                                         last;
 694                                                 } # else we don't need $h in this group.
 695                                         } # end foreach $v
 696                                 } # endif $h eq $root
 697             } # end foreach $gst
 698         } # end foreach $h
 699
 700         # Now we have some hypothetical vertices in $still_contig that are the
 701         # "real" group memberships.  Replace these in $contig.
 702                 foreach my $v ( keys %$contig ) {
 703                         next unless ref $contig->{$v};
 704                         $contig->{$v} = $still_contig->{$v};
 705                 }
 706     } # end if @resolve
 707
 708     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 709         my $variant_row = [ [], $is_genealogical ];
 710         # Fill in the groupings from $contig.
 711         foreach my $g ( @$groups ) {
 712         my $gst = wit_stringify( $g );
 713         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 714         push( @{$variant_row->[0]}, \@realgroup );
 715     }
 716     return $variant_row;
 717 }
 718
 719 sub _prune_group {
 720         my( $group, $stemma ) = @_;
 721         # Get these into a form prune_subtree will recognize. Make a "contighash"
 722         my $hypohash = {};
 723         map { $hypohash->{$_} = 1 } @$group;
 724         # ...with reference values for hypotheticals.
 725         map { $hypohash->{$_} = [] } $stemma->hypotheticals;
 726         # Make our subgraph
 727         my $subgraph = $stemma->graph->copy;
 728         map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
 729                 $subgraph->vertices;
 730         # ...and find the root.
 731         my( $root ) = $subgraph->predecessorless_vertices;
 732         # Now prune and return the remaining vertices.
 733         _prune_subtree( $subgraph, $root, $hypohash );
 734         return $subgraph->vertices;
 735 }
 736
 737 sub _prune_subtree {
 738     my( $tree, $root, $contighash ) = @_;
 739     # First, delete hypothetical leaves / orphans until there are none left.
 740     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 741         $tree->successorless_vertices;
 742     while( @orphan_hypotheticals ) {
 743         $tree->delete_vertices( @orphan_hypotheticals );
 744         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 745             $tree->successorless_vertices;
 746     }
 747     # Then delete a hypothetical root with only one successor, moving the
 748     # root to the first child that has no other predecessors.
 749     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 750         my @nextroot = $tree->successors( $root );
 751         $tree->delete_vertex( $root );
 752         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 753     }
 754     # The tree has been modified in place, but we need to know the new root.
 755     $root = undef unless $root && $tree->has_vertex( $root );
 756     return $root;
 757 }
 758 # Add the variant, subject to a.c. representation logic.
 759 # This assumes that we will see the 'main' version before the a.c. version.
 760 sub add_variant_wit {
 761     my( $arr, $wit, $acstr ) = @_;
 762     my $skip;
 763     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 764         my $real = $1;
 765         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 766     }
 767     push( @$arr, $wit ) unless $skip;
 768 }
 769
 770 sub _useful_variant {
 771         my( $group_readings, $graph, $acstr ) = @_;
 772
 773         # TODO Decide what to do with AC witnesses
 774
 775         # Sort by group size and return
 776         my $is_useful = 0;
 777         my( @readings, @groups );   # The sorted groups for our answer.
 778         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 779                 keys %$group_readings ) {
 780                 push( @readings, $rdg );
 781                 push( @groups, $group_readings->{$rdg} );
 782                 if( @{$group_readings->{$rdg}} > 1 ) {
 783                         $is_useful++;
 784                 } else {
 785                         my( $wit ) = @{$group_readings->{$rdg}};
 786                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 787                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 788                 }
 789         }
 790         if( $is_useful > 1 ) {
 791                 return( \@readings, \@groups );
 792         } else {
 793                 return( [], [] );
 794         }
 795 }
 796
 797 =head2 wit_stringify( $groups )
 798
 799 Takes an array of witness groupings and produces a string like
 800 ['A','B'] / ['C','D','E'] / ['F']
 801
 802 =cut
 803
 804 sub wit_stringify {
 805     my $groups = shift;
 806     my @gst;
 807     # If we were passed an array of witnesses instead of an array of
 808     # groupings, then "group" the witnesses first.
 809     unless( ref( $groups->[0] ) ) {
 810         my $mkgrp = [ $groups ];
 811         $groups = $mkgrp;
 812     }
 813     foreach my $g ( @$groups ) {
 814         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 815     }
 816     return join( ' / ', @gst );
 817 }
 818
 819 # Helper function to ensure that X and X a.c. never appear in the same list.
 820 sub _add_to_witlist {
 821         my( $wit, $list, $acstr ) = @_;
 822         my %inlist;
 823         my $idx = 0;
 824         map { $inlist{$_} = $idx++ } @$list;
 825         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 826                 my $acwit = $1;
 827                 unless( exists $inlist{$acwit} ) {
 828                         push( @$list, $acwit.$acstr );
 829                 }
 830         } else {
 831                 if( exists( $inlist{$wit.$acstr} ) ) {
 832                         # Replace the a.c. version with the main witness
 833                         my $i = $inlist{$wit.$acstr};
 834                         $list->[$i] = $wit;
 835                 } else {
 836                         push( @$list, $wit );
 837                 }
 838         }
 839 }
 840
 841 sub _symmdiff {
 842         my( $lista, $listb ) = @_;
 843         my %union;
 844         my %scalars;
 845         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 846         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 847         my @set = grep { $union{$_} == 1 } keys %union;
 848         return map { $scalars{$_} } @set;
 849 }
 850
 851 1;
 852
 853 =head1 LICENSE
 854
 855 This package is free software and is provided "as is" without express
 856 or implied warranty.  You can redistribute it and/or modify it under
 857 the same terms as Perl itself.
 858
 859 =head1 AUTHOR
 860
 861 Tara L Andrews E<lt>aurum@cpan.orgE<gt>