lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 foreach my $row ( @{$data->{'variants'}} ) {
 108         # Account for rows that used to be "not useful"
 109         unless( exists $expected_genealogical{$row->{'id'}} ) {
 110                 $expected_genealogical{$row->{'id'}} = 1;
 111         }
 112         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 113                 "Got correct genealogical flag for row " . $row->{'id'} );
 114 }
 115 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 116 # TODO Make something meaningful of conflict count, maybe test other bits
 117
 118 =end testing
 119
 120 =cut
 121
 122 sub run_analysis {
 123         my( $tradition, %opts ) = @_;
 124         my $c = $tradition->collation;
 125
 126         my $stemma_id = $opts{'stemma_id'} || 0;
 127         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 128         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 129
 130         # Get the stemma
 131         my $stemma = $tradition->stemma( $stemma_id );
 132
 133         # Figure out which witnesses we are working with
 134         my @lacunose = $stemma->hypotheticals;
 135         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 136         map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
 137                 $tradition->witnesses;
 138         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 139
 140         # Find and mark 'common' ranks for exclusion, unless they were
 141         # explicitly specified.
 142         unless( @ranks ) {
 143                 my %common_rank;
 144                 foreach my $rdg ( $c->common_readings ) {
 145                         $common_rank{$rdg->rank} = 1;
 146                 }
 147                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 148         }
 149
 150         # Group the variants to send to the solver
 151         my @groups;
 152         my @use_ranks;
 153         my %lacunae;
 154         foreach my $rank ( @ranks ) {
 155                 $DB::single = 1 if $rank == 1003;
 156                 my $missing = [ @lacunose ];
 157                 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
 158                 if( $opts{'exclude_type1'} ) {
 159                         # Check to see whether this is a "useful" group.
 160                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 161                                 $stemma->graph, $c->ac_label );
 162                         next unless @$rdgs;
 163                 }
 164                 push( @use_ranks, $rank );
 165                 push( @groups, $rankgroup );
 166                 $lacunae{$rank} = $missing;
 167         }
 168         # Parse the answer
 169         my $answer = solve_variants( $stemma, @groups );
 170
 171         # Do further analysis on the answer
 172         my $conflict_count = 0;
 173         foreach my $idx ( 0 .. $#use_ranks ) {
 174                 my $location = $answer->{'variants'}->[$idx];
 175                 # Add the rank back in
 176                 $location->{'id'} = $use_ranks[$idx];
 177                 # Add the lacunae back in
 178                 $location->{'missing'} = $lacunae{$use_ranks[$idx]};
 179                 my %lmiss;
 180                 map { $lmiss{$_} = 1 } @{$location->{'missing'}};
 181                 # Run the extra analysis we need.
 182                 analyze_location( $tradition, $stemma->graph, $location );
 183                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 184                         $conflict_count++
 185                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 186                         # Add the reading text back in
 187                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 188                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 189                         # Remove lacunose witnesses from this reading's list now that the
 190                         # analysis is done
 191                         my @realgroup;
 192                         map { push( @realgroup, $_ ) unless $lmiss{$_} } $rdghash->{'group'};
 193                         $rdghash->{'group'} = \@realgroup;
 194                 }
 195         }
 196         $answer->{'conflict_count'} = $conflict_count;
 197
 198         return $answer;
 199 }
 200
 201 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 202
 203 Groups the variants at the given $rank of the collation, treating any
 204 relationships in @merge_relationship_types as equivalent.  $lacunose should
 205 be a reference to an array, to which the sigla of lacunose witnesses at this
 206 rank will be appended.
 207
 208 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 209 in $group_readings->{$rdg}.
 210
 211 =cut
 212
 213 # Return group_readings, groups, lacunose
 214 sub group_variants {
 215         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 216         my $c = $tradition->collation;
 217         my $aclabel =  $c->ac_label;
 218         # Get the alignment table readings
 219         my %readings_at_rank;
 220         my %is_lacunose; # lookup table for $lacunose
 221         map { $is_lacunose{$_} = 1 } @$lacunose;
 222         my @gap_wits;
 223         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 224                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 225                 my $wit = $tablewit->{'witness'};
 226                 # Exclude the witness if it is "lacunose" which if we got here
 227                 # means "not in the stemma".
 228                 next if $is_lacunose{$wit};
 229                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 230                         _add_to_witlist( $wit, $lacunose, $aclabel );
 231                 } elsif( $rdg ) {
 232                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 233                 } else {
 234                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 235                 }
 236         }
 237
 238         # Group the readings, collapsing groups by relationship if needed
 239         my %grouped_readings;
 240         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
 241                                                    values %readings_at_rank ) {
 242                 # Skip readings that have been collapsed into others.
 243                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 244                 my @wits = $rdg->witnesses;
 245                 if( $collapse ) {
 246                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 247                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 248                                 my @otherwits = $other->witnesses;
 249                                 push( @wits, @otherwits );
 250                                 $grouped_readings{$other->id} = 0;
 251                         }
 252                 }
 253                 my @use_wits = grep { !$is_lacunose{$_} } @wits;
 254                 $grouped_readings{$rdg->id} = \@use_wits;
 255         }
 256         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 257         # Get rid of our collapsed readings
 258         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 259                 keys %grouped_readings
 260                 if $collapse;
 261
 262         return \%grouped_readings;
 263 }
 264
 265 =head2 solve_variants( $graph, @groups )
 266
 267 Sends the set of groups to the external graph solver service and returns
 268 a cleaned-up answer, adding the rank IDs back where they belong.
 269
 270 The JSON has the form
 271   { "graph": [ stemmagraph DOT string without newlines ],
 272     "groupings": [ array of arrays of groups, one per rank ] }
 273
 274 The answer has the form
 275   { "variants" => [ array of variant location structures ],
 276     "variant_count" => total,
 277     "conflict_count" => number of conflicts detected,
 278     "genealogical_count" => number of solutions found }
 279
 280 =cut
 281
 282 sub solve_variants {
 283         my( $stemma, @groups ) = @_;
 284
 285         # Make the json with stemma + groups
 286         my $groupings = [];
 287         foreach my $ghash ( @groups ) {
 288                 my @grouping;
 289                 foreach my $k ( keys %$ghash ) {
 290                         push( @grouping, $ghash->{$k} );
 291                 }
 292                 push( @$groupings, \@grouping );
 293         }
 294         ## Witness map is a HACK to get around limitations in node names from IDP
 295         my $witness_map = {};
 296         my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
 297
 298         # Send it off and get the result
 299         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 300         my $ua = LWP::UserAgent->new();
 301         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 302                                                   'Content' => $json );
 303
 304         my $answer;
 305         my $used_idp;
 306         if( $resp->is_success ) {
 307                 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 308                 $used_idp = 1;
 309         } else {
 310                 # Fall back to the old method.
 311                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 312                         . "; falling back to perl method";
 313                 $answer = perl_solver( $stemma, @$groupings );
 314         }
 315
 316         # Fold the result back into what we know about the groups.
 317         my $variants = [];
 318         my $genealogical = 0;
 319         foreach my $idx ( 0 .. $#groups ) {
 320                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 321                 if( $result ) {
 322                         $genealogical++;
 323                         # Prune the calculated groups, in case the IDP solver failed to.
 324                         if( $used_idp ) {
 325                                 my @pruned_groups;
 326                                 foreach my $cg ( @$calc_groups ) {
 327                                         my @pg = _prune_group( $cg, $stemma );
 328                                         push( @pruned_groups, \@pg );
 329                                 }
 330                                 $calc_groups = \@pruned_groups;
 331                         }
 332                 }
 333                 my $input_group = $groups[$idx];
 334                 foreach my $k ( keys %$input_group ) {
 335                         my $cg = shift @$calc_groups;
 336                         $input_group->{$k} = $cg;
 337                 }
 338                 my $vstruct = {
 339                         'genealogical' => $result,
 340                         'readings' => [],
 341                 };
 342                 foreach my $k ( sort { @{$input_group->{$b}} <=> @{$input_group->{$a}} }
 343                                                         keys %$input_group ) {
 344                         push( @{$vstruct->{'readings'}},
 345                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 346                 }
 347                 push( @$variants, $vstruct );
 348         }
 349
 350         return { 'variants' => $variants,
 351                          'variant_count' => scalar @$variants,
 352                          'genealogical_count' => $genealogical };
 353 }
 354
 355 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 356
 357 sub _safe_wit_strings {
 358         my( $stemma, $groupings, $witness_map ) = @_;
 359         my $safegraph = Graph->new();
 360         # Convert the graph to a safe representation and store the conversion.
 361         foreach my $n ( $stemma->graph->vertices ) {
 362                 my $sn = _safe_witstr( $n );
 363                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 364                         if exists $witness_map->{$sn};
 365                 $witness_map->{$sn} = $n;
 366                 $safegraph->add_vertex( $sn );
 367                 $safegraph->set_vertex_attributes( $sn,
 368                         $stemma->graph->get_vertex_attributes( $n ) );
 369         }
 370         foreach my $e ( $stemma->graph->edges ) {
 371                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 372                 $safegraph->add_edge( @safe_e );
 373         }
 374         my $safe_stemma = Text::Tradition::Stemma->new(
 375                 'collation' => $stemma->collation, 'graph' => $safegraph );
 376
 377         # Now convert the witness groupings to a safe representation.
 378         my $safe_groupings = [];
 379         foreach my $grouping ( @$groupings ) {
 380                 my $safe_grouping = [];
 381                 foreach my $group ( @$grouping ) {
 382                         my $safe_group = [];
 383                         foreach my $n ( @$group ) {
 384                                 my $sn = _safe_witstr( $n );
 385                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 386                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 387                                 $witness_map->{$sn} = $n;
 388                                 push( @$safe_group, $sn );
 389                         }
 390                         push( @$safe_grouping, $safe_group );
 391                 }
 392                 push( @$safe_groupings, $safe_grouping );
 393         }
 394
 395         # Return it all in the struct we expect.  We have stored the reductions
 396         # in the $witness_map that we were passed.
 397         return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
 398 }
 399
 400 sub _safe_witstr {
 401         my $witstr = shift;
 402         $witstr =~ s/\s+/_/g;
 403         $witstr =~ s/[^\w\d-]//g;
 404         return $witstr;
 405 }
 406
 407 sub _desanitize_names {
 408         my( $jsonstruct, $witness_map ) = @_;
 409         my $result = [];
 410         foreach my $grouping ( @$jsonstruct ) {
 411                 my $real_grouping = [];
 412                 foreach my $element ( @$grouping ) {
 413                         if( ref( $element ) eq 'ARRAY' ) {
 414                                 # it's the groupset.
 415                                 my $real_groupset = [];
 416                                 foreach my $group ( @$element ) {
 417                                         my $real_group = [];
 418                                         foreach my $n ( @$group ) {
 419                                                 my $rn = $witness_map->{$n};
 420                                                 push( @$real_group, $rn );
 421                                         }
 422                                         push( @$real_groupset, $real_group );
 423                                 }
 424                                 push( @$real_grouping, $real_groupset );
 425                         } else {
 426                                 # It is the boolean, not actually a group.
 427                                 push( @$real_grouping, $element );
 428                         }
 429                 }
 430                 push( @$result, $real_grouping );
 431         }
 432         return $result;
 433 }
 434
 435 ### END HACKERY ###
 436
 437 =head2 analyze_location ( $tradition, $graph, $location_hash )
 438
 439 Given the tradition, its stemma graph, and the solution from the graph solver,
 440 work out the rest of the information we want.  For each reading we need missing,
 441 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 442
 443 =cut
 444
 445 sub analyze_location {
 446         my ( $tradition, $graph, $variant_row ) = @_;
 447
 448         # Make a hash of all known node memberships, and make the subgraphs.
 449         my $contig = {};
 450         my $reading_roots = {};
 451         my $subgraph = {};
 452     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 453         my $rid = $rdghash->{'readingid'};
 454                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 455
 456         # Make the subgraph.
 457         my $part = $graph->copy;
 458                 my %these_vertices;
 459                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 460                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 461                 $subgraph->{$rid} = $part;
 462                 # Get the reading roots.
 463                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 464         }
 465
 466         # Now that we have all the node group memberships, calculate followed/
 467     # non-followed/unknown values for each reading.  Also figure out the
 468     # reading's evident parent(s).
 469     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 470         # Group string key - TODO do we need this?
 471         my $gst = wit_stringify( $rdghash->{'group'} );
 472         my $rid = $rdghash->{'readingid'};
 473         # Get the subgraph
 474         my $part = $subgraph->{$rid};
 475
 476         # Start figuring things out.
 477         my @roots = $part->predecessorless_vertices;
 478         $rdghash->{'independent_occurrence'} = scalar @roots;
 479         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 480         # Find the parent readings, if any, of this reading.
 481         my %rdgparents;
 482         foreach my $wit ( @roots ) {
 483                 # Look in the main stemma to find this witness's extant or known-reading
 484                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 485                         my @check = $graph->predecessors( $wit );
 486                         while( @check ) {
 487                                 my @next;
 488                                 foreach my $wparent( @check ) {
 489                                         my $preading = $contig->{$wparent};
 490                                         if( $preading ) {
 491                                                 $rdgparents{$preading} = 1;
 492                                         } else {
 493                                                 push( @next, $graph->predecessors( $wparent ) );
 494                                         }
 495                                 }
 496                                 @check = @next;
 497                         }
 498                 }
 499                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 500
 501                 # Find the number of times this reading was altered, and the number of
 502                 # times we're not sure.
 503                 my( %nofollow, %unknownfollow );
 504                 foreach my $wit ( $part->vertices ) {
 505                         foreach my $wchild ( $graph->successors( $wit ) ) {
 506                                 next if $part->has_vertex( $wchild );
 507                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 508                                         # It definitely changed here.
 509                                         $nofollow{$wchild} = 1;
 510                                 } elsif( !($contig->{$wchild}) ) {
 511                                         # The child is a hypothetical node not definitely in
 512                                         # any group. Answer is unknown.
 513                                         $unknownfollow{$wchild} = 1;
 514                                 } # else it's a non-root node in a known group, and therefore
 515                                   # is presumed to have its reading from its group, not this link.
 516                         }
 517                 }
 518                 $rdghash->{'not_followed'} = keys %nofollow;
 519                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 520
 521                 # Now say whether this reading represents a conflict.
 522                 unless( $variant_row->{'genealogical'} ) {
 523                         $rdghash->{'conflict'} = @roots != 1;
 524                 }
 525     }
 526 }
 527
 528
 529 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 530
 531 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 532 to be correct.  Serves as a backup for the real solver.
 533
 534 Runs an analysis of the given tradition, at the location given in $rank,
 535 against the graph of the stemma specified in $stemma_id.  The argument
 536 @merge_relationship_types is an optional list of relationship types for
 537 which readings so related should be treated as equivalent.
 538
 539 Returns a nested array data structure as follows:
 540
 541  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 542
 543 where the group list is the array of arrays passed in for each element of @groups,
 544 possibly with the addition of hypothetical readings.
 545
 546
 547 =cut
 548
 549 sub perl_solver {
 550         my( $stemma, @groups ) = @_;
 551         my $graph = $stemma->graph;
 552         my @answer;
 553         foreach my $g ( @groups ) {
 554                 push( @answer, _solve_variant_location( $graph, $g ) );
 555         }
 556         return \@answer;
 557 }
 558
 559 sub _solve_variant_location {
 560         my( $graph, $groups ) = @_;
 561         # Now do the work.
 562     my $contig = {};
 563     my $subgraph = {};
 564     my $is_conflicted;
 565     my $conflict = {};
 566
 567     # Mark each ms as in its own group, first.
 568     foreach my $g ( @$groups ) {
 569         my $gst = wit_stringify( $g );
 570         map { $contig->{$_} = $gst } @$g;
 571     }
 572
 573     # Now for each unmarked node in the graph, initialize an array
 574     # for possible group memberships.  We will use this later to
 575     # resolve potential conflicts.
 576     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 577     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 578         my $gst = wit_stringify( $g );  # This is the group name
 579         # Copy the graph, and delete all non-members from the new graph.
 580         my $part = $graph->copy;
 581         my @group_roots;
 582         $part->delete_vertices(
 583             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 584
 585         # Now look to see if our group is connected.
 586                 if( @$g > 1 ) {
 587                         # We have to take directionality into account.
 588                         # How many root nodes do we have?
 589                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 590                                 $part->predecessorless_vertices;
 591                         # Assuming that @$g > 1, find the first root node that has at
 592                         # least one successor belonging to our group. If this reading
 593                         # is genealogical, there should be only one, but we will check
 594                         # that implicitly later.
 595                         foreach my $root ( @roots ) {
 596                                 # Prune the tree to get rid of extraneous hypotheticals.
 597                                 $root = _prune_subtree( $part, $root, $contig );
 598                                 next unless $root;
 599                                 # Save this root for our group.
 600                                 push( @group_roots, $root );
 601                                 # Get all the successor nodes of our root.
 602                         }
 603                 } else {
 604                         # Dispense with the trivial case of one reading.
 605                         my $wit = $g->[0];
 606                         @group_roots = ( $wit );
 607                         foreach my $v ( $part->vertices ) {
 608                                 $part->delete_vertex( $v ) unless $v eq $wit;
 609                         }
 610         }
 611
 612         if( @group_roots > 1 ) {
 613                 $conflict->{$gst} = 1;
 614                 $is_conflicted = 1;
 615         }
 616         # Paint the 'hypotheticals' with our group.
 617                 foreach my $wit ( $part->vertices ) {
 618                         if( ref( $contig->{$wit} ) ) {
 619                                 push( @{$contig->{$wit}}, $gst );
 620                         } elsif( $contig->{$wit} ne $gst ) {
 621                                 warn "How did we get here?";
 622                         }
 623                 }
 624
 625
 626                 # Save the relevant subgraph.
 627                 $subgraph->{$gst} = $part;
 628     }
 629
 630         # For each of our hypothetical readings, flatten its 'contig' array if
 631         # the array contains zero or one group.  If we have any unflattened arrays,
 632         # we may need to run the resolution process. If the reading is already known
 633         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 634         # it.
 635         my @resolve;
 636         foreach my $wit ( keys %$contig ) {
 637                 next unless ref( $contig->{$wit} );
 638                 if( @{$contig->{$wit}} > 1 ) {
 639                         if( $is_conflicted ) {
 640                                 $contig->{$wit} = '';  # We aren't going to decide.
 641                         } else {
 642                                 push( @resolve, $wit );
 643                         }
 644                 } else {
 645                         my $gst = pop @{$contig->{$wit}};
 646                         $contig->{$wit} = $gst || '';
 647                 }
 648         }
 649
 650     if( @resolve ) {
 651         my $still_contig = {};
 652         foreach my $h ( @resolve ) {
 653             # For each of the hypothetical readings with more than one possibility,
 654             # try deleting it from each of its member subgraphs in turn, and see
 655             # if that breaks the contiguous grouping.
 656             # TODO This can still break in a corner case where group A can use
 657             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 658             # Revisit this if necessary; it could get brute-force nasty.
 659             foreach my $gst ( @{$contig->{$h}} ) {
 660                 my $gpart = $subgraph->{$gst}->copy();
 661                 # If we have come this far, there is only one root and everything
 662                 # is reachable from it.
 663                 my( $root ) = $gpart->predecessorless_vertices;
 664                 my $reachable = {};
 665                 map { $reachable->{$_} = 1 } $gpart->vertices;
 666
 667                 # Try deleting the hypothetical node.
 668                 $gpart->delete_vertex( $h );
 669                 if( $h eq $root ) {
 670                         # See if we still have a single root.
 671                         my @roots = $gpart->predecessorless_vertices;
 672                         warn "This shouldn't have happened" unless @roots;
 673                         if( @roots > 1 ) {
 674                                 # $h is needed by this group.
 675                                 if( exists( $still_contig->{$h} ) ) {
 676                                         # Conflict!
 677                                         $conflict->{$gst} = 1;
 678                                         $still_contig->{$h} = '';
 679                                 } else {
 680                                         $still_contig->{$h} = $gst;
 681                                 }
 682                         }
 683                 } else {
 684                         # $h is somewhere in the middle. See if everything
 685                         # else can still be reached from the root.
 686                                         my %still_reachable = ( $root => 1 );
 687                                         map { $still_reachable{$_} = 1 }
 688                                                 $gpart->all_successors( $root );
 689                                         foreach my $v ( keys %$reachable ) {
 690                                                 next if $v eq $h;
 691                                                 if( !$still_reachable{$v}
 692                                                         && ( $contig->{$v} eq $gst
 693                                                                  || ( exists $still_contig->{$v}
 694                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 695                                                         # We need $h.
 696                                                         if( exists $still_contig->{$h} ) {
 697                                                                 # Conflict!
 698                                                                 $conflict->{$gst} = 1;
 699                                                                 $still_contig->{$h} = '';
 700                                                         } else {
 701                                                                 $still_contig->{$h} = $gst;
 702                                                         }
 703                                                         last;
 704                                                 } # else we don't need $h in this group.
 705                                         } # end foreach $v
 706                                 } # endif $h eq $root
 707             } # end foreach $gst
 708         } # end foreach $h
 709
 710         # Now we have some hypothetical vertices in $still_contig that are the
 711         # "real" group memberships.  Replace these in $contig.
 712                 foreach my $v ( keys %$contig ) {
 713                         next unless ref $contig->{$v};
 714                         $contig->{$v} = $still_contig->{$v};
 715                 }
 716     } # end if @resolve
 717
 718     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 719         my $variant_row = [ [], $is_genealogical ];
 720         # Fill in the groupings from $contig.
 721         foreach my $g ( @$groups ) {
 722         my $gst = wit_stringify( $g );
 723         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 724         push( @{$variant_row->[0]}, \@realgroup );
 725     }
 726     return $variant_row;
 727 }
 728
 729 sub _prune_group {
 730         my( $group, $stemma ) = @_;
 731         # Get these into a form prune_subtree will recognize. Make a "contighash"
 732         my $hypohash = {};
 733         map { $hypohash->{$_} = 1 } @$group;
 734         # ...with reference values for hypotheticals.
 735         map { $hypohash->{$_} = [] } $stemma->hypotheticals;
 736         # Make our subgraph
 737         my $subgraph = $stemma->graph->copy;
 738         map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
 739                 $subgraph->vertices;
 740         # ...and find the root.
 741         my( $root ) = $subgraph->predecessorless_vertices;
 742         # Now prune and return the remaining vertices.
 743         _prune_subtree( $subgraph, $root, $hypohash );
 744         return $subgraph->vertices;
 745 }
 746
 747 sub _prune_subtree {
 748     my( $tree, $root, $contighash ) = @_;
 749     # First, delete hypothetical leaves / orphans until there are none left.
 750     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 751         $tree->successorless_vertices;
 752     while( @orphan_hypotheticals ) {
 753         $tree->delete_vertices( @orphan_hypotheticals );
 754         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 755             $tree->successorless_vertices;
 756     }
 757     # Then delete a hypothetical root with only one successor, moving the
 758     # root to the first child that has no other predecessors.
 759     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 760         my @nextroot = $tree->successors( $root );
 761         $tree->delete_vertex( $root );
 762         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 763     }
 764     # The tree has been modified in place, but we need to know the new root.
 765     $root = undef unless $root && $tree->has_vertex( $root );
 766     return $root;
 767 }
 768 # Add the variant, subject to a.c. representation logic.
 769 # This assumes that we will see the 'main' version before the a.c. version.
 770 sub add_variant_wit {
 771     my( $arr, $wit, $acstr ) = @_;
 772     my $skip;
 773     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 774         my $real = $1;
 775         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 776     }
 777     push( @$arr, $wit ) unless $skip;
 778 }
 779
 780 sub _useful_variant {
 781         my( $group_readings, $graph, $acstr ) = @_;
 782
 783         # TODO Decide what to do with AC witnesses
 784
 785         # Sort by group size and return
 786         my $is_useful = 0;
 787         my( @readings, @groups );   # The sorted groups for our answer.
 788         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 789                 keys %$group_readings ) {
 790                 push( @readings, $rdg );
 791                 push( @groups, $group_readings->{$rdg} );
 792                 if( @{$group_readings->{$rdg}} > 1 ) {
 793                         $is_useful++;
 794                 } else {
 795                         my( $wit ) = @{$group_readings->{$rdg}};
 796                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 797                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 798                 }
 799         }
 800         if( $is_useful > 1 ) {
 801                 return( \@readings, \@groups );
 802         } else {
 803                 return( [], [] );
 804         }
 805 }
 806
 807 =head2 wit_stringify( $groups )
 808
 809 Takes an array of witness groupings and produces a string like
 810 ['A','B'] / ['C','D','E'] / ['F']
 811
 812 =cut
 813
 814 sub wit_stringify {
 815     my $groups = shift;
 816     my @gst;
 817     # If we were passed an array of witnesses instead of an array of
 818     # groupings, then "group" the witnesses first.
 819     unless( ref( $groups->[0] ) ) {
 820         my $mkgrp = [ $groups ];
 821         $groups = $mkgrp;
 822     }
 823     foreach my $g ( @$groups ) {
 824         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 825     }
 826     return join( ' / ', @gst );
 827 }
 828
 829 # Helper function to ensure that X and X a.c. never appear in the same list.
 830 sub _add_to_witlist {
 831         my( $wit, $list, $acstr ) = @_;
 832         my %inlist;
 833         my $idx = 0;
 834         map { $inlist{$_} = $idx++ } @$list;
 835         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 836                 my $acwit = $1;
 837                 unless( exists $inlist{$acwit} ) {
 838                         push( @$list, $acwit.$acstr );
 839                 }
 840         } else {
 841                 if( exists( $inlist{$wit.$acstr} ) ) {
 842                         # Replace the a.c. version with the main witness
 843                         my $i = $inlist{$wit.$acstr};
 844                         $list->[$i] = $wit;
 845                 } else {
 846                         push( @$list, $wit );
 847                 }
 848         }
 849 }
 850
 851 sub _symmdiff {
 852         my( $lista, $listb ) = @_;
 853         my %union;
 854         my %scalars;
 855         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 856         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 857         my @set = grep { $union{$_} == 1 } keys %union;
 858         return map { $scalars{$_} } @set;
 859 }
 860
 861 1;
 862
 863 =head1 LICENSE
 864
 865 This package is free software and is provided "as is" without express
 866 or implied warranty.  You can redistribute it and/or modify it under
 867 the same terms as Perl itself.
 868
 869 =head1 AUTHOR
 870
 871 Tara L Andrews E<lt>aurum@cpan.orgE<gt>