lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 my $c = $tradition->collation;
 108 foreach my $row ( @{$data->{'variants'}} ) {
 109         # Account for rows that used to be "not useful"
 110         unless( exists $expected_genealogical{$row->{'id'}} ) {
 111                 $expected_genealogical{$row->{'id'}} = 1;
 112         }
 113         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 114                 "Got correct genealogical flag for row " . $row->{'id'} );
 115         # Check that we have the right row with the right groups
 116         my $rank = $row->{'id'};
 117         foreach my $rdghash ( @{$row->{'readings'}} ) {
 118                 # Skip 'readings' that aren't really
 119                 next unless $c->reading( $rdghash->{'readingid'} );
 120                 # Check the rank
 121                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 122                         "Got correct reading rank" );
 123                 # Check the witnesses
 124                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 125                 my @sgrp = sort @{$rdghash->{'group'}};
 126                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 127         }
 128 }
 129 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 130 # TODO Make something meaningful of conflict count, maybe test other bits
 131
 132 =end testing
 133
 134 =cut
 135
 136 sub run_analysis {
 137         my( $tradition, %opts ) = @_;
 138         my $c = $tradition->collation;
 139
 140         my $stemma_id = $opts{'stemma_id'} || 0;
 141         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 142         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 143
 144         # Get the stemma
 145         my $stemma = $tradition->stemma( $stemma_id );
 146
 147         # Figure out which witnesses we are working with
 148         my @lacunose = $stemma->hypotheticals;
 149         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 150         map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
 151                 $tradition->witnesses;
 152         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 153
 154         # Find and mark 'common' ranks for exclusion, unless they were
 155         # explicitly specified.
 156         unless( @ranks ) {
 157                 my %common_rank;
 158                 foreach my $rdg ( $c->common_readings ) {
 159                         $common_rank{$rdg->rank} = 1;
 160                 }
 161                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 162         }
 163
 164         # Group the variants to send to the solver
 165         my @groups;
 166         my @use_ranks;
 167         my %lacunae;
 168         foreach my $rank ( @ranks ) {
 169                 my $missing = [ @lacunose ];
 170                 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
 171                 if( $opts{'exclude_type1'} ) {
 172                         # Check to see whether this is a "useful" group.
 173                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 174                                 $stemma->graph, $c->ac_label );
 175                         next unless @$rdgs;
 176                 }
 177                 push( @use_ranks, $rank );
 178                 push( @groups, $rankgroup );
 179                 $lacunae{$rank} = $missing;
 180         }
 181         # Parse the answer
 182         my $answer = solve_variants( $stemma, @groups );
 183
 184         # Do further analysis on the answer
 185         my $conflict_count = 0;
 186         my $aclabel = $c->ac_label;
 187         foreach my $idx ( 0 .. $#use_ranks ) {
 188                 my $location = $answer->{'variants'}->[$idx];
 189                 # Add the rank back in
 190                 $location->{'id'} = $use_ranks[$idx];
 191                 # Note what our lacunae are
 192                 my %lmiss;
 193                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 194                 # Run through the reading groups and add as 'lacunae' any redundant
 195                 # a.c. witnesses (yes, we have to do this before the analysis, thus
 196                 # identical loops before and after. Boo.)
 197                 # TODO Consider making these callbacks to analyze_location
 198                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 199                         my %rwits;
 200                         map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
 201                         foreach my $rw ( keys %rwits ) {
 202                                 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
 203                                         if( exists $rwits{$1} ) {
 204                                                 $lmiss{$rw} = 1;
 205                                                 delete $rwits{$rw};
 206                                         }
 207                                 }
 208                         }
 209                         $rdghash->{'group'} = [ keys %rwits ];
 210                 }
 211                 $location->{'missing'} = [ keys %lmiss ];
 212
 213                 # Run the extra analysis we need.
 214                 analyze_location( $tradition, $stemma->graph, $location );
 215
 216                 # Do the final post-analysis tidying up of the data.
 217                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 218                         $conflict_count++
 219                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 220                         # Add the reading text back in
 221                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 222                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 223                         # Remove lacunose witnesses from this reading's list now that the
 224                         # analysis is done
 225                         my @realgroup;
 226                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 227                         $rdghash->{'group'} = \@realgroup;
 228                         # TODO Record hypotheticals used to create group, if we end up
 229                         # needing it
 230                 }
 231         }
 232         $answer->{'conflict_count'} = $conflict_count;
 233
 234         return $answer;
 235 }
 236
 237 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 238
 239 Groups the variants at the given $rank of the collation, treating any
 240 relationships in @merge_relationship_types as equivalent.  $lacunose should
 241 be a reference to an array, to which the sigla of lacunose witnesses at this
 242 rank will be appended.
 243
 244 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 245 in $group_readings->{$rdg}.
 246
 247 =cut
 248
 249 # Return group_readings, groups, lacunose
 250 sub group_variants {
 251         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 252         my $c = $tradition->collation;
 253         my $aclabel =  $c->ac_label;
 254         # Get the alignment table readings
 255         my %readings_at_rank;
 256         my %is_lacunose; # lookup table for $lacunose
 257         map { $is_lacunose{$_} = 1 } @$lacunose;
 258         my @gap_wits;
 259         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 260                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 261                 my $wit = $tablewit->{'witness'};
 262                 # Exclude the witness if it is "lacunose" which if we got here
 263                 # means "not in the stemma".
 264                 next if $is_lacunose{$wit};
 265                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 266                         _add_to_witlist( $wit, $lacunose, $aclabel );
 267                 } elsif( $rdg ) {
 268                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 269                 } else {
 270                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 271                 }
 272         }
 273
 274         # Group the readings, collapsing groups by relationship if needed
 275         my %grouped_readings;
 276         foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
 277                                                    values %readings_at_rank ) {
 278                 # Skip readings that have been collapsed into others.
 279                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 280                 my @wits = $rdg->witnesses;
 281                 if( $collapse ) {
 282                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 283                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 284                                 my @otherwits = $other->witnesses;
 285                                 push( @wits, @otherwits );
 286                                 $grouped_readings{$other->id} = 0;
 287                         }
 288                 }
 289                 my @use_wits = grep { !$is_lacunose{$_} } @wits;
 290                 $grouped_readings{$rdg->id} = \@use_wits;
 291         }
 292         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 293         # Get rid of our collapsed readings
 294         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 295                 keys %grouped_readings
 296                 if $collapse;
 297
 298         return \%grouped_readings;
 299 }
 300
 301 =head2 solve_variants( $graph, @groups )
 302
 303 Sends the set of groups to the external graph solver service and returns
 304 a cleaned-up answer, adding the rank IDs back where they belong.
 305
 306 The JSON has the form
 307   { "graph": [ stemmagraph DOT string without newlines ],
 308     "groupings": [ array of arrays of groups, one per rank ] }
 309
 310 The answer has the form
 311   { "variants" => [ array of variant location structures ],
 312     "variant_count" => total,
 313     "conflict_count" => number of conflicts detected,
 314     "genealogical_count" => number of solutions found }
 315
 316 =cut
 317
 318 sub solve_variants {
 319         my( $stemma, @groups ) = @_;
 320
 321         # Make the json with stemma + groups
 322         my $groupings = [];
 323         foreach my $ghash ( @groups ) {
 324                 my @grouping;
 325                 foreach my $k ( sort keys %$ghash ) {
 326                         push( @grouping, $ghash->{$k} );
 327                 }
 328                 push( @$groupings, \@grouping );
 329         }
 330         ## Witness map is a HACK to get around limitations in node names from IDP
 331         my $witness_map = {};
 332         my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
 333
 334         # Send it off and get the result
 335         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 336         my $ua = LWP::UserAgent->new();
 337         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 338                                                   'Content' => $json );
 339
 340         my $answer;
 341         my $used_idp;
 342         if( $resp->is_success ) {
 343                 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 344                 $used_idp = 1;
 345         } else {
 346                 # Fall back to the old method.
 347                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 348                         . "; falling back to perl method";
 349                 $answer = perl_solver( $stemma, @$groupings );
 350         }
 351
 352         # Fold the result back into what we know about the groups.
 353         my $variants = [];
 354         my $genealogical = 0;
 355         foreach my $idx ( 0 .. $#groups ) {
 356                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 357                 if( $result ) {
 358                         $genealogical++;
 359                         # Prune the calculated groups, in case the IDP solver failed to.
 360                         if( $used_idp ) {
 361                                 my @pruned_groups;
 362                                 foreach my $cg ( @$calc_groups ) {
 363                                         my @pg = _prune_group( $cg, $stemma );
 364                                         push( @pruned_groups, \@pg );
 365                                 }
 366                                 $calc_groups = \@pruned_groups;
 367                         }
 368                 }
 369                 my $input_group = $groups[$idx];
 370                 foreach my $k ( sort keys %$input_group ) {
 371                         my $cg = shift @$calc_groups;
 372                         $input_group->{$k} = $cg;
 373                 }
 374                 my $vstruct = {
 375                         'genealogical' => $result,
 376                         'readings' => [],
 377                 };
 378                 foreach my $k ( sort { @{$input_group->{$b}} <=> @{$input_group->{$a}} }
 379                                                         keys %$input_group ) {
 380                         push( @{$vstruct->{'readings'}},
 381                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 382                 }
 383                 push( @$variants, $vstruct );
 384         }
 385
 386         return { 'variants' => $variants,
 387                          'variant_count' => scalar @$variants,
 388                          'genealogical_count' => $genealogical };
 389 }
 390
 391 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 392
 393 sub _safe_wit_strings {
 394         my( $stemma, $groupings, $witness_map ) = @_;
 395         my $safegraph = Graph->new();
 396         # Convert the graph to a safe representation and store the conversion.
 397         foreach my $n ( $stemma->graph->vertices ) {
 398                 my $sn = _safe_witstr( $n );
 399                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 400                         if exists $witness_map->{$sn};
 401                 $witness_map->{$sn} = $n;
 402                 $safegraph->add_vertex( $sn );
 403                 $safegraph->set_vertex_attributes( $sn,
 404                         $stemma->graph->get_vertex_attributes( $n ) );
 405         }
 406         foreach my $e ( $stemma->graph->edges ) {
 407                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 408                 $safegraph->add_edge( @safe_e );
 409         }
 410         my $safe_stemma = Text::Tradition::Stemma->new(
 411                 'collation' => $stemma->collation, 'graph' => $safegraph );
 412
 413         # Now convert the witness groupings to a safe representation.
 414         my $safe_groupings = [];
 415         foreach my $grouping ( @$groupings ) {
 416                 my $safe_grouping = [];
 417                 foreach my $group ( @$grouping ) {
 418                         my $safe_group = [];
 419                         foreach my $n ( @$group ) {
 420                                 my $sn = _safe_witstr( $n );
 421                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 422                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 423                                 $witness_map->{$sn} = $n;
 424                                 push( @$safe_group, $sn );
 425                         }
 426                         push( @$safe_grouping, $safe_group );
 427                 }
 428                 push( @$safe_groupings, $safe_grouping );
 429         }
 430
 431         # Return it all in the struct we expect.  We have stored the reductions
 432         # in the $witness_map that we were passed.
 433         return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
 434 }
 435
 436 sub _safe_witstr {
 437         my $witstr = shift;
 438         $witstr =~ s/\s+/_/g;
 439         $witstr =~ s/[^\w\d-]//g;
 440         return $witstr;
 441 }
 442
 443 sub _desanitize_names {
 444         my( $jsonstruct, $witness_map ) = @_;
 445         my $result = [];
 446         foreach my $grouping ( @$jsonstruct ) {
 447                 my $real_grouping = [];
 448                 foreach my $element ( @$grouping ) {
 449                         if( ref( $element ) eq 'ARRAY' ) {
 450                                 # it's the groupset.
 451                                 my $real_groupset = [];
 452                                 foreach my $group ( @$element ) {
 453                                         my $real_group = [];
 454                                         foreach my $n ( @$group ) {
 455                                                 my $rn = $witness_map->{$n};
 456                                                 push( @$real_group, $rn );
 457                                         }
 458                                         push( @$real_groupset, $real_group );
 459                                 }
 460                                 push( @$real_grouping, $real_groupset );
 461                         } else {
 462                                 # It is the boolean, not actually a group.
 463                                 push( @$real_grouping, $element );
 464                         }
 465                 }
 466                 push( @$result, $real_grouping );
 467         }
 468         return $result;
 469 }
 470
 471 ### END HACKERY ###
 472
 473 =head2 analyze_location ( $tradition, $graph, $location_hash )
 474
 475 Given the tradition, its stemma graph, and the solution from the graph solver,
 476 work out the rest of the information we want.  For each reading we need missing,
 477 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 478
 479 =cut
 480
 481 sub analyze_location {
 482         my ( $tradition, $graph, $variant_row ) = @_;
 483
 484         # Make a hash of all known node memberships, and make the subgraphs.
 485         my $contig = {};
 486         my $reading_roots = {};
 487         my $subgraph = {};
 488     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 489         my $rid = $rdghash->{'readingid'};
 490                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 491
 492         # Make the subgraph.
 493         my $part = $graph->copy;
 494                 my %these_vertices;
 495                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 496                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 497                 $subgraph->{$rid} = $part;
 498                 # Get the reading roots.
 499                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 500         }
 501
 502         # Now that we have all the node group memberships, calculate followed/
 503     # non-followed/unknown values for each reading.  Also figure out the
 504     # reading's evident parent(s).
 505     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 506         # Group string key - TODO do we need this?
 507         my $gst = wit_stringify( $rdghash->{'group'} );
 508         my $rid = $rdghash->{'readingid'};
 509         # Get the subgraph
 510         my $part = $subgraph->{$rid};
 511
 512         # Start figuring things out.
 513         my @roots = $part->predecessorless_vertices;
 514         $rdghash->{'independent_occurrence'} = scalar @roots;
 515         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 516         # Find the parent readings, if any, of this reading.
 517         my %rdgparents;
 518         foreach my $wit ( @roots ) {
 519                 # Look in the main stemma to find this witness's extant or known-reading
 520                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 521                         my @check = $graph->predecessors( $wit );
 522                         while( @check ) {
 523                                 my @next;
 524                                 foreach my $wparent( @check ) {
 525                                         my $preading = $contig->{$wparent};
 526                                         if( $preading ) {
 527                                                 $rdgparents{$preading} = 1;
 528                                         } else {
 529                                                 push( @next, $graph->predecessors( $wparent ) );
 530                                         }
 531                                 }
 532                                 @check = @next;
 533                         }
 534                 }
 535                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 536
 537                 # Find the number of times this reading was altered, and the number of
 538                 # times we're not sure.
 539                 my( %nofollow, %unknownfollow );
 540                 foreach my $wit ( $part->vertices ) {
 541                         foreach my $wchild ( $graph->successors( $wit ) ) {
 542                                 next if $part->has_vertex( $wchild );
 543                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 544                                         # It definitely changed here.
 545                                         $nofollow{$wchild} = 1;
 546                                 } elsif( !($contig->{$wchild}) ) {
 547                                         # The child is a hypothetical node not definitely in
 548                                         # any group. Answer is unknown.
 549                                         $unknownfollow{$wchild} = 1;
 550                                 } # else it's a non-root node in a known group, and therefore
 551                                   # is presumed to have its reading from its group, not this link.
 552                         }
 553                 }
 554                 $rdghash->{'not_followed'} = keys %nofollow;
 555                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 556
 557                 # Now say whether this reading represents a conflict.
 558                 unless( $variant_row->{'genealogical'} ) {
 559                         $rdghash->{'conflict'} = @roots != 1;
 560                 }
 561     }
 562 }
 563
 564
 565 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 566
 567 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 568 to be correct.  Serves as a backup for the real solver.
 569
 570 Runs an analysis of the given tradition, at the location given in $rank,
 571 against the graph of the stemma specified in $stemma_id.  The argument
 572 @merge_relationship_types is an optional list of relationship types for
 573 which readings so related should be treated as equivalent.
 574
 575 Returns a nested array data structure as follows:
 576
 577  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 578
 579 where the group list is the array of arrays passed in for each element of @groups,
 580 possibly with the addition of hypothetical readings.
 581
 582
 583 =cut
 584
 585 sub perl_solver {
 586         my( $stemma, @groups ) = @_;
 587         my $graph = $stemma->graph;
 588         my @answer;
 589         foreach my $g ( @groups ) {
 590                 push( @answer, _solve_variant_location( $graph, $g ) );
 591         }
 592         return \@answer;
 593 }
 594
 595 sub _solve_variant_location {
 596         my( $graph, $groups ) = @_;
 597         # Now do the work.
 598     my $contig = {};
 599     my $subgraph = {};
 600     my $is_conflicted;
 601     my $conflict = {};
 602
 603     # Mark each ms as in its own group, first.
 604     foreach my $g ( @$groups ) {
 605         my $gst = wit_stringify( $g );
 606         map { $contig->{$_} = $gst } @$g;
 607     }
 608
 609     # Now for each unmarked node in the graph, initialize an array
 610     # for possible group memberships.  We will use this later to
 611     # resolve potential conflicts.
 612     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 613     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 614         my $gst = wit_stringify( $g );  # This is the group name
 615         # Copy the graph, and delete all non-members from the new graph.
 616         my $part = $graph->copy;
 617         my @group_roots;
 618         $part->delete_vertices(
 619             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 620
 621         # Now look to see if our group is connected.
 622                 if( @$g > 1 ) {
 623                         # We have to take directionality into account.
 624                         # How many root nodes do we have?
 625                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 626                                 $part->predecessorless_vertices;
 627                         # Assuming that @$g > 1, find the first root node that has at
 628                         # least one successor belonging to our group. If this reading
 629                         # is genealogical, there should be only one, but we will check
 630                         # that implicitly later.
 631                         foreach my $root ( @roots ) {
 632                                 # Prune the tree to get rid of extraneous hypotheticals.
 633                                 $root = _prune_subtree( $part, $root, $contig );
 634                                 next unless $root;
 635                                 # Save this root for our group.
 636                                 push( @group_roots, $root );
 637                                 # Get all the successor nodes of our root.
 638                         }
 639                 } else {
 640                         # Dispense with the trivial case of one reading.
 641                         my $wit = $g->[0];
 642                         @group_roots = ( $wit );
 643                         foreach my $v ( $part->vertices ) {
 644                                 $part->delete_vertex( $v ) unless $v eq $wit;
 645                         }
 646         }
 647
 648         if( @group_roots > 1 ) {
 649                 $conflict->{$gst} = 1;
 650                 $is_conflicted = 1;
 651         }
 652         # Paint the 'hypotheticals' with our group.
 653                 foreach my $wit ( $part->vertices ) {
 654                         if( ref( $contig->{$wit} ) ) {
 655                                 push( @{$contig->{$wit}}, $gst );
 656                         } elsif( $contig->{$wit} ne $gst ) {
 657                                 warn "How did we get here?";
 658                         }
 659                 }
 660
 661
 662                 # Save the relevant subgraph.
 663                 $subgraph->{$gst} = $part;
 664     }
 665
 666         # For each of our hypothetical readings, flatten its 'contig' array if
 667         # the array contains zero or one group.  If we have any unflattened arrays,
 668         # we may need to run the resolution process. If the reading is already known
 669         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 670         # it.
 671         my @resolve;
 672         foreach my $wit ( keys %$contig ) {
 673                 next unless ref( $contig->{$wit} );
 674                 if( @{$contig->{$wit}} > 1 ) {
 675                         if( $is_conflicted ) {
 676                                 $contig->{$wit} = '';  # We aren't going to decide.
 677                         } else {
 678                                 push( @resolve, $wit );
 679                         }
 680                 } else {
 681                         my $gst = pop @{$contig->{$wit}};
 682                         $contig->{$wit} = $gst || '';
 683                 }
 684         }
 685
 686     if( @resolve ) {
 687         my $still_contig = {};
 688         foreach my $h ( @resolve ) {
 689             # For each of the hypothetical readings with more than one possibility,
 690             # try deleting it from each of its member subgraphs in turn, and see
 691             # if that breaks the contiguous grouping.
 692             # TODO This can still break in a corner case where group A can use
 693             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 694             # Revisit this if necessary; it could get brute-force nasty.
 695             foreach my $gst ( @{$contig->{$h}} ) {
 696                 my $gpart = $subgraph->{$gst}->copy();
 697                 # If we have come this far, there is only one root and everything
 698                 # is reachable from it.
 699                 my( $root ) = $gpart->predecessorless_vertices;
 700                 my $reachable = {};
 701                 map { $reachable->{$_} = 1 } $gpart->vertices;
 702
 703                 # Try deleting the hypothetical node.
 704                 $gpart->delete_vertex( $h );
 705                 if( $h eq $root ) {
 706                         # See if we still have a single root.
 707                         my @roots = $gpart->predecessorless_vertices;
 708                         warn "This shouldn't have happened" unless @roots;
 709                         if( @roots > 1 ) {
 710                                 # $h is needed by this group.
 711                                 if( exists( $still_contig->{$h} ) ) {
 712                                         # Conflict!
 713                                         $conflict->{$gst} = 1;
 714                                         $still_contig->{$h} = '';
 715                                 } else {
 716                                         $still_contig->{$h} = $gst;
 717                                 }
 718                         }
 719                 } else {
 720                         # $h is somewhere in the middle. See if everything
 721                         # else can still be reached from the root.
 722                                         my %still_reachable = ( $root => 1 );
 723                                         map { $still_reachable{$_} = 1 }
 724                                                 $gpart->all_successors( $root );
 725                                         foreach my $v ( keys %$reachable ) {
 726                                                 next if $v eq $h;
 727                                                 if( !$still_reachable{$v}
 728                                                         && ( $contig->{$v} eq $gst
 729                                                                  || ( exists $still_contig->{$v}
 730                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 731                                                         # We need $h.
 732                                                         if( exists $still_contig->{$h} ) {
 733                                                                 # Conflict!
 734                                                                 $conflict->{$gst} = 1;
 735                                                                 $still_contig->{$h} = '';
 736                                                         } else {
 737                                                                 $still_contig->{$h} = $gst;
 738                                                         }
 739                                                         last;
 740                                                 } # else we don't need $h in this group.
 741                                         } # end foreach $v
 742                                 } # endif $h eq $root
 743             } # end foreach $gst
 744         } # end foreach $h
 745
 746         # Now we have some hypothetical vertices in $still_contig that are the
 747         # "real" group memberships.  Replace these in $contig.
 748                 foreach my $v ( keys %$contig ) {
 749                         next unless ref $contig->{$v};
 750                         $contig->{$v} = $still_contig->{$v};
 751                 }
 752     } # end if @resolve
 753
 754     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 755         my $variant_row = [ [], $is_genealogical ];
 756         # Fill in the groupings from $contig.
 757         foreach my $g ( @$groups ) {
 758         my $gst = wit_stringify( $g );
 759         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 760         push( @{$variant_row->[0]}, \@realgroup );
 761     }
 762     return $variant_row;
 763 }
 764
 765 sub _prune_group {
 766         my( $group, $stemma ) = @_;
 767         # Get these into a form prune_subtree will recognize. Make a "contighash"
 768         my $hypohash = {};
 769         map { $hypohash->{$_} = 1 } @$group;
 770         # ...with reference values for hypotheticals.
 771         map { $hypohash->{$_} = [] } $stemma->hypotheticals;
 772         # Make our subgraph
 773         my $subgraph = $stemma->graph->copy;
 774         map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
 775                 $subgraph->vertices;
 776         # ...and find the root.
 777         my( $root ) = $subgraph->predecessorless_vertices;
 778         # Now prune and return the remaining vertices.
 779         _prune_subtree( $subgraph, $root, $hypohash );
 780         return $subgraph->vertices;
 781 }
 782
 783 sub _prune_subtree {
 784     my( $tree, $root, $contighash ) = @_;
 785     # First, delete hypothetical leaves / orphans until there are none left.
 786     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 787         $tree->successorless_vertices;
 788     while( @orphan_hypotheticals ) {
 789         $tree->delete_vertices( @orphan_hypotheticals );
 790         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 791             $tree->successorless_vertices;
 792     }
 793     # Then delete a hypothetical root with only one successor, moving the
 794     # root to the first child that has no other predecessors.
 795     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 796         my @nextroot = $tree->successors( $root );
 797         $tree->delete_vertex( $root );
 798         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 799     }
 800     # The tree has been modified in place, but we need to know the new root.
 801     $root = undef unless $root && $tree->has_vertex( $root );
 802     return $root;
 803 }
 804 # Add the variant, subject to a.c. representation logic.
 805 # This assumes that we will see the 'main' version before the a.c. version.
 806 sub add_variant_wit {
 807     my( $arr, $wit, $acstr ) = @_;
 808     my $skip;
 809     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 810         my $real = $1;
 811         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 812     }
 813     push( @$arr, $wit ) unless $skip;
 814 }
 815
 816 sub _useful_variant {
 817         my( $group_readings, $graph, $acstr ) = @_;
 818
 819         # TODO Decide what to do with AC witnesses
 820
 821         # Sort by group size and return
 822         my $is_useful = 0;
 823         my( @readings, @groups );   # The sorted groups for our answer.
 824         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 825                 keys %$group_readings ) {
 826                 push( @readings, $rdg );
 827                 push( @groups, $group_readings->{$rdg} );
 828                 if( @{$group_readings->{$rdg}} > 1 ) {
 829                         $is_useful++;
 830                 } else {
 831                         my( $wit ) = @{$group_readings->{$rdg}};
 832                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 833                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 834                 }
 835         }
 836         if( $is_useful > 1 ) {
 837                 return( \@readings, \@groups );
 838         } else {
 839                 return( [], [] );
 840         }
 841 }
 842
 843 =head2 wit_stringify( $groups )
 844
 845 Takes an array of witness groupings and produces a string like
 846 ['A','B'] / ['C','D','E'] / ['F']
 847
 848 =cut
 849
 850 sub wit_stringify {
 851     my $groups = shift;
 852     my @gst;
 853     # If we were passed an array of witnesses instead of an array of
 854     # groupings, then "group" the witnesses first.
 855     unless( ref( $groups->[0] ) ) {
 856         my $mkgrp = [ $groups ];
 857         $groups = $mkgrp;
 858     }
 859     foreach my $g ( @$groups ) {
 860         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 861     }
 862     return join( ' / ', @gst );
 863 }
 864
 865 # Helper function to ensure that X and X a.c. never appear in the same list.
 866 sub _add_to_witlist {
 867         my( $wit, $list, $acstr ) = @_;
 868         my %inlist;
 869         my $idx = 0;
 870         map { $inlist{$_} = $idx++ } @$list;
 871         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 872                 my $acwit = $1;
 873                 unless( exists $inlist{$acwit} ) {
 874                         push( @$list, $acwit.$acstr );
 875                 }
 876         } else {
 877                 if( exists( $inlist{$wit.$acstr} ) ) {
 878                         # Replace the a.c. version with the main witness
 879                         my $i = $inlist{$wit.$acstr};
 880                         $list->[$i] = $wit;
 881                 } else {
 882                         push( @$list, $wit );
 883                 }
 884         }
 885 }
 886
 887 sub _symmdiff {
 888         my( $lista, $listb ) = @_;
 889         my %union;
 890         my %scalars;
 891         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 892         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 893         my @set = grep { $union{$_} == 1 } keys %union;
 894         return map { $scalars{$_} } @set;
 895 }
 896
 897 1;
 898
 899 =head1 LICENSE
 900
 901 This package is free software and is provided "as is" without express
 902 or implied warranty.  You can redistribute it and/or modify it under
 903 the same terms as Perl itself.
 904
 905 =head1 AUTHOR
 906
 907 Tara L Andrews E<lt>aurum@cpan.orgE<gt>