lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Benchmark;
   7 use Encode qw/ encode_utf8 /;
   8 use Exporter 'import';
   9 use Graph;
  10 use JSON qw/ encode_json decode_json /;
  11 use LWP::UserAgent;
  12 use Text::Tradition;
  13 use Text::Tradition::Stemma;
  14 use TryCatch;
  15
  16 use vars qw/ @EXPORT_OK /;
  17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  18
  19 =head1 NAME
  20
  21 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  22
  23 =head1 SYNOPSIS
  24
  25   use Text::Tradition;
  26   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  27   my $t = Text::Tradition->new(
  28     'name' => 'this is a text',
  29     'input' => 'TEI',
  30     'file' => '/path/to/tei_parallel_seg_file.xml' );
  31   $t->add_stemma( 'dotfile' => $stemmafile );
  32
  33   my $variant_data = run_analysis( $tradition );
  34   # Recalculate rank $n treating all orthographic variants as equivalent
  35   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  36
  37 =head1 DESCRIPTION
  38
  39 Text::Tradition is a library for representation and analysis of collated
  40 texts, particularly medieval ones.  The Collation is the central feature of
  41 a Tradition, where the text, its sequence of readings, and its relationships
  42 between readings are actually kept.
  43
  44 =head1 SUBROUTINES
  45
  46 =head2 run_analysis( $tradition, %opts )
  47
  48 Runs the analysis described in analyze_variant_location on every location in the
  49 collation of the given tradition, with the given options. These include:
  50
  51 =over 4
  52
  53 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  54 is 0 (i.e. the first).
  55
  56 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  57
  58 =item * merge_types - Specify a list of relationship types, where related readings
  59 should be treated as identical for the purposes of analysis.
  60
  61 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  62
  63 =back
  64
  65 =begin testing
  66
  67 use Text::Tradition;
  68 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  69
  70 my $datafile = 't/data/florilegium_tei_ps.xml';
  71 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  72                                       'name' => 'test0',
  73                                       'file' => $datafile );
  74 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  75 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  76
  77 my %expected_genealogical = (
  78         1 => 0,
  79         2 => 1,
  80         3 =>  0,
  81         5 =>  0,
  82         7 =>  0,
  83         8 =>  0,
  84         10 => 0,
  85         13 => 1,
  86         33 => 0,
  87         34 => 0,
  88         37 => 0,
  89         60 => 0,
  90         81 => 1,
  91         84 => 0,
  92         87 => 0,
  93         101 => 0,
  94         102 => 0,
  95         122 => 1,
  96         157 => 0,
  97         166 => 1,
  98         169 => 1,
  99         200 => 0,
 100         216 => 1,
 101         217 => 1,
 102         219 => 1,
 103         241 => 1,
 104         242 => 1,
 105         243 => 1,
 106 );
 107
 108 my $data = run_analysis( $tradition );
 109 my $c = $tradition->collation;
 110 foreach my $row ( @{$data->{'variants'}} ) {
 111         # Account for rows that used to be "not useful"
 112         unless( exists $expected_genealogical{$row->{'id'}} ) {
 113                 $expected_genealogical{$row->{'id'}} = 1;
 114         }
 115         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 116         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 117                 "Got correct genealogical flag for row " . $row->{'id'} );
 118         # Check that we have the right row with the right groups
 119         my $rank = $row->{'id'};
 120         foreach my $rdghash ( @{$row->{'readings'}} ) {
 121                 # Skip 'readings' that aren't really
 122                 next unless $c->reading( $rdghash->{'readingid'} );
 123                 # Check the rank
 124                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 125                         "Got correct reading rank" );
 126                 # Check the witnesses
 127                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 128                 my @sgrp = sort @{$rdghash->{'group'}};
 129                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 130         }
 131 }
 132 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 133 # TODO Make something meaningful of conflict count, maybe test other bits
 134
 135 =end testing
 136
 137 =cut
 138
 139 sub run_analysis {
 140         my( $tradition, %opts ) = @_;
 141         my $c = $tradition->collation;
 142
 143         my $stemma_id = $opts{'stemma_id'} || 0;
 144         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 145         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 146
 147         # Get the stemma
 148         my $stemma = $tradition->stemma( $stemma_id );
 149
 150         # Figure out which witnesses we are working with - that is, the ones that
 151         # appear both in the stemma and in the tradition. All others are 'lacunose'
 152         # for our purposes.
 153         my @lacunose = $stemma->hypotheticals;
 154         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 155         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 156
 157         # Find and mark 'common' ranks for exclusion, unless they were
 158         # explicitly specified.
 159         unless( @ranks ) {
 160                 my %common_rank;
 161                 foreach my $rdg ( $c->common_readings ) {
 162                         $common_rank{$rdg->rank} = 1;
 163                 }
 164                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 165         }
 166
 167         # Group the variants to send to the solver
 168         my @groups;
 169         my @use_ranks;
 170         my %lacunae;
 171         my $moved = {};
 172         foreach my $rank ( @ranks ) {
 173                 my $missing = [ @lacunose ];
 174                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
 175                 # Filter out any empty rankgroups
 176                 # (e.g. from the later rank for a transposition)
 177                 next unless keys %$rankgroup;
 178                 if( $opts{'exclude_type1'} ) {
 179                         # Check to see whether this is a "useful" group.
 180                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 181                                 $stemma->graph, $c->ac_label );
 182                         next unless @$rdgs;
 183                 }
 184                 push( @use_ranks, $rank );
 185                 push( @groups, $rankgroup );
 186                 $lacunae{$rank} = $missing;
 187         }
 188         # Run the solver
 189         my $answer = solve_variants( $stemma, @groups );
 190
 191         # Do further analysis on the answer
 192         my $conflict_count = 0;
 193         my $aclabel = $c->ac_label;
 194         foreach my $idx ( 0 .. $#use_ranks ) {
 195                 my $location = $answer->{'variants'}->[$idx];
 196                 # Add the rank back in
 197                 my $rank = $use_ranks[$idx];
 198                 $location->{'id'} = $rank;
 199                 # Note what our lacunae are
 200                 my %lmiss;
 201                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 202                 $location->{'missing'} = [ keys %lmiss ];
 203
 204                 # Run the extra analysis we need.
 205                 ## TODO We run through all the variants in this call, so
 206                 ## why not add the reading data there instead of here below?
 207                 analyze_location( $tradition, $stemma, $location, \%lmiss );
 208
 209                 my @layerwits;
 210                 # Do the final post-analysis tidying up of the data.
 211                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 212                         $conflict_count++
 213                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 214                         # Add the reading text back in, setting display value as needed
 215                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 216                         if( $rdg ) {
 217                                 $rdghash->{'text'} = $rdg->text .
 218                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 219                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 220                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 221                         }
 222                         # Remove lacunose witnesses from this reading's list now that the
 223                         # analysis is done
 224                         my @realgroup;
 225                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 226                         $rdghash->{'group'} = \@realgroup;
 227                         # Note any layered witnesses that appear in this group
 228                         foreach( @realgroup ) {
 229                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 230                                         push( @layerwits, $1 );
 231                                 }
 232                         }
 233                 }
 234                 $location->{'layerwits'} = \@layerwits if @layerwits;
 235         }
 236         $answer->{'conflict_count'} = $conflict_count;
 237
 238         return $answer;
 239 }
 240
 241 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 242
 243 Groups the variants at the given $rank of the collation, treating any
 244 relationships in @merge_relationship_types as equivalent.  $lacunose should
 245 be a reference to an array, to which the sigla of lacunose witnesses at this
 246 rank will be appended; $transposed should be a reference to a hash, wherein
 247 the identities of transposed readings and their relatives will be stored.
 248
 249 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 250 in $group_readings->{$rdg}.
 251
 252 =cut
 253
 254 # Return group_readings, groups, lacunose
 255 sub group_variants {
 256         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 257         my $c = $tradition->collation;
 258         my $aclabel = $c->ac_label;
 259         my $table = $c->alignment_table;
 260         # Get the alignment table readings
 261         my %readings_at_rank;
 262         my %is_lacunose; # lookup table for witnesses not in stemma
 263         map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
 264         my @check_for_gaps;
 265         my %moved_wits;
 266         my $has_transposition;
 267         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 268                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 269                 my $wit = $tablewit->{'witness'};
 270                 # Exclude the witness if it is "lacunose" which if we got here
 271                 # means "not in the stemma".
 272                 next if $is_lacunose{$wit};
 273                 # Note if the witness is actually in a lacuna
 274                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 275                         _add_to_witlist( $wit, $lacunose, $aclabel );
 276                 # Otherwise the witness either has a positive reading...
 277                 } elsif( $rdg ) {
 278                         # If the reading has been counted elsewhere as a transposition, ignore it.
 279                         if( $transposed->{$rdg->{'t'}->id} ) {
 280                                 # TODO Does this cope with three-way transpositions?
 281                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 282                                 next;
 283                         }
 284                         # Otherwise, record it...
 285                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 286                         # ...and grab any transpositions, and their relations.
 287                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 288                         foreach my $trdg ( @transp ) {
 289                                 next if exists $readings_at_rank{$trdg->id};
 290                                 $has_transposition = 1;
 291                                 my @affected_wits = _table_witnesses(
 292                                         $table, $trdg, \%is_lacunose, $aclabel );
 293                                 next unless @affected_wits;
 294                                 map { $moved_wits{$_} = 1 } @affected_wits;
 295                                 $transposed->{$trdg->id} =
 296                                         [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
 297                                 $readings_at_rank{$trdg->id} = $trdg;
 298                         }
 299                 # ...or it is empty, ergo a gap.
 300                 } else {
 301                         _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
 302                 }
 303         }
 304         my @gap_wits;
 305         map { _add_to_witlist( $_, \@gap_wits, $aclabel )
 306                 unless $moved_wits{$_} } @check_for_gaps;
 307         # Group the readings, collapsing groups by relationship if needed
 308         my $grouped_readings = {};
 309         foreach my $rdg ( values %readings_at_rank ) {
 310                 # Skip readings that have been collapsed into others.
 311                 next if exists $grouped_readings->{$rdg->id}
 312                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 313                 # Get the witness list, including from readings collapsed into this one.
 314                 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
 315                 if( $collapse && @$collapse ) {
 316                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 317                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 318                                 my @otherwits = _table_witnesses(
 319                                         $table, $other, \%is_lacunose, $aclabel );
 320                                 push( @wits, @otherwits );
 321                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 322                         }
 323                 }
 324                 $grouped_readings->{$rdg->id} = \@wits;
 325         }
 326         $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
 327         # Get rid of our collapsed readings
 328         map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
 329                 keys %$grouped_readings
 330                 if $collapse;
 331
 332         # If something was transposed, check the groups for doubled-up readings
 333         if( $has_transposition ) {
 334                 # print STDERR "Group for rank $rank:\n";
 335                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 336                 #       keys %$grouped_readings;
 337                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 338         }
 339
 340         # Return the result
 341         return $grouped_readings;
 342 }
 343
 344 # Helper function to query the alignment table for all witnesses (a.c. included)
 345 # that have a given reading at its rank.
 346 sub _table_witnesses {
 347         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 348         my $tableidx = $trdg->rank - 1;
 349         my @has_reading;
 350         foreach my $row ( @{$table->{'alignment'}} ) {
 351                 my $wit = $row->{'witness'};
 352                 next if $lacunose->{$wit};
 353                 my $rdg = $row->{'tokens'}->[$tableidx];
 354                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 355                 _add_to_witlist( $wit, \@has_reading, $aclabel )
 356                         if $rdg->{'t'}->id eq $trdg->id;
 357         }
 358         return @has_reading;
 359 }
 360
 361 # Helper function to ensure that X and X a.c. never appear in the same list.
 362 sub _add_to_witlist {
 363         my( $wit, $list, $acstr ) = @_;
 364         my %inlist;
 365         my $idx = 0;
 366         map { $inlist{$_} = $idx++ } @$list;
 367         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 368                 my $acwit = $1;
 369                 unless( exists $inlist{$acwit} ) {
 370                         push( @$list, $acwit.$acstr );
 371                 }
 372         } else {
 373                 if( exists( $inlist{$wit.$acstr} ) ) {
 374                         # Replace the a.c. version with the main witness
 375                         my $i = $inlist{$wit.$acstr};
 376                         $list->[$i] = $wit;
 377                 } else {
 378                         push( @$list, $wit );
 379                 }
 380         }
 381 }
 382
 383 sub _check_transposed_consistency {
 384         my( $c, $rank, $transposed, $groupings ) = @_;
 385         my %seen_wits;
 386         my %thisrank;
 387         # Note which readings are actually at this rank, and which witnesses
 388         # belong to which reading.
 389         foreach my $rdg ( keys %$groupings ) {
 390                 my $rdgobj = $c->reading( $rdg );
 391                 # Count '(omitted)' as a reading at this rank
 392                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 393                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 394         }
 395         # Our work is done if we have no witness belonging to more than one
 396         # reading.
 397         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 398         return unless @doubled;
 399         # If we have a symmetric related transposition, drop the non-rank readings.
 400         if( @doubled == scalar keys %seen_wits ) {
 401                 foreach my $rdg ( keys %$groupings ) {
 402                         if( !$thisrank{$rdg} ) {
 403                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 404                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 405                                         keys %thisrank;
 406                                 delete $groupings->{$rdg};
 407                                 # If we found a group match, assume there is a symmetry happening.
 408                                 # TODO think more about this
 409                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 410                                 unless( $matched ) {
 411                                         delete $transposed->{$rdg};
 412                                         warn "Found problem in evident symmetry with reading $rdg";
 413                                 }
 414                         }
 415                 }
 416         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 417         } else {
 418                 foreach my $dup ( @doubled ) {
 419                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 420                                 next if $thisrank{$rdg};
 421                                 next unless exists $groupings->{$rdg};
 422                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 423                                 delete $groupings->{$rdg};
 424                                 delete $transposed->{$rdg};
 425                         }
 426                 }
 427                 # and put any now-orphaned readings into an 'omitted' reading.
 428                 foreach my $wit ( keys %seen_wits ) {
 429                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 430                                 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
 431                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 432                         }
 433                 }
 434         }
 435 }
 436
 437 =head2 solve_variants( $graph, @groups )
 438
 439 Sends the set of groups to the external graph solver service and returns
 440 a cleaned-up answer, adding the rank IDs back where they belong.
 441
 442 The JSON has the form
 443   { "graph": [ stemmagraph DOT string without newlines ],
 444     "groupings": [ array of arrays of groups, one per rank ] }
 445
 446 The answer has the form
 447   { "variants" => [ array of variant location structures ],
 448     "variant_count" => total,
 449     "conflict_count" => number of conflicts detected,
 450     "genealogical_count" => number of solutions found }
 451
 452 =cut
 453
 454 sub solve_variants {
 455         my( $stemma, @groups ) = @_;
 456         my $aclabel = $stemma->collation->ac_label;
 457
 458         # Filter the groups down to distinct groups, and work out what graph
 459         # should be used in the calculation of each group. We want to send each
 460         # distinct problem to the solver only once.
 461         # We need a whole bunch of lookup tables for this.
 462         my $index_groupkeys = {};       # Save the order of readings
 463         my $group_indices = {};         # Save the indices that have a given grouping
 464         my $graph_problems = {};        # Save the groupings for the given graph
 465
 466         foreach my $idx ( 0..$#groups ) {
 467                 my $ghash = $groups[$idx];
 468                 my @grouping;
 469                 # Sort the groupings from big to little, and scan for a.c. witnesses
 470                 # that would need an extended graph.
 471                 my @acwits;   # note which AC witnesses crop up at this rank
 472                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 473                         keys %$ghash;
 474                 foreach my $rdg ( @idxkeys ) {
 475                         my @sg = sort @{$ghash->{$rdg}};
 476                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 477                         push( @grouping, \@sg );
 478                 }
 479                 # Save the reading order
 480                 $index_groupkeys->{$idx} = \@idxkeys;
 481
 482                 # Now associate the distinct group with this index
 483                 my $gstr = wit_stringify( \@grouping );
 484                 push( @{$group_indices->{$gstr}}, $idx );
 485
 486                 # Finally, add the group to the list to be calculated for this graph.
 487                 map { s/\Q$aclabel\E$// } @acwits;
 488                 my $graph;
 489                 try {
 490                         $graph = $stemma->extend_graph( \@acwits );
 491                 } catch {
 492                         die "Unable to extend graph with @acwits";
 493                 }
 494                 unless( exists $graph_problems->{"$graph"} ) {
 495                         $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
 496                 }
 497                 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
 498         }
 499
 500         ## For each distinct graph, send its groups to the solver.
 501         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 502         my $ua = LWP::UserAgent->new();
 503         ## Witness map is a HACK to get around limitations in node names from IDP
 504         my $witness_map = {};
 505         ## Variables to store answers as they come back
 506         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 507         my $genealogical = 0;
 508         foreach my $graphkey ( keys %$graph_problems ) {
 509                 my $graph = $graph_problems->{$graphkey}->{'object'};
 510                 my $groupings = $graph_problems->{$graphkey}->{'groups'};
 511                 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
 512                         $groupings, $witness_map ) );
 513                 # Send it off and get the result
 514                 #print STDERR "Sending request: $json\n";
 515                 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 516                                                           'Content' => $json );
 517                 my $answer;
 518                 my $used_idp;
 519                 if( $resp->is_success ) {
 520                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 521                         $used_idp = 1;
 522                 } else {
 523                         # Fall back to the old method.
 524                         warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 525                                 . "; falling back to perl method";
 526                         $answer = perl_solver( $graph, @$groupings );
 527                 }
 528                 ## The answer is the evaluated groupings, plus a boolean for whether
 529                 ## they were genealogical.  Reconstruct our original groups.
 530                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 531                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 532                         if( $result ) {
 533                                 $genealogical++;
 534                                 # Prune the calculated groups, in case the IDP solver failed to.
 535                                 if( $used_idp ) {
 536                                         my @pruned_groups;
 537                                         foreach my $cg ( @$calc_groups ) {
 538                                                 # This is a little wasteful but the path of least
 539                                                 # resistance. Send both the stemma, which knows what
 540                                                 # its hypotheticals are, and the actual graph used.
 541                                                 my @pg = _prune_group( $cg, $stemma, $graph );
 542                                                 push( @pruned_groups, \@pg );
 543                                         }
 544                                         $calc_groups = \@pruned_groups;
 545                                 }
 546                         }
 547                         # Retrieve the key for the original group that went to the solver
 548                         my $input_group = wit_stringify( $groupings->[$gidx] );
 549                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 550                                 my @readings = @{$index_groupkeys->{$oidx}};
 551                                 my $vstruct = {
 552                                         'genealogical' => $result,
 553                                         'readings' => [],
 554                                 };
 555                                 foreach my $ridx ( 0 .. $#readings ) {
 556                                         push( @{$vstruct->{'readings'}},
 557                                                 { 'readingid' => $readings[$ridx],
 558                                                   'group' => $calc_groups->[$ridx] } );
 559                                 }
 560                                 $variants->[$oidx] = $vstruct;
 561                         }
 562                 }
 563         }
 564
 565         return { 'variants' => $variants,
 566                          'variant_count' => scalar @$variants,
 567                          'genealogical_count' => $genealogical };
 568 }
 569
 570 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 571
 572 sub _safe_wit_strings {
 573         my( $graph, $c, $groupings, $witness_map ) = @_;
 574         # Parse the graph we were given into a stemma.
 575         my $safegraph = Graph->new();
 576         # Convert the graph to a safe representation and store the conversion.
 577         foreach my $n ( $graph->vertices ) {
 578                 my $sn = _safe_witstr( $n );
 579                 if( exists $witness_map->{$sn} ) {
 580                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 581                                 if $witness_map->{$sn} ne $n;
 582                 } else {
 583                         $witness_map->{$sn} = $n;
 584                 }
 585                 $safegraph->add_vertex( $sn );
 586                 $safegraph->set_vertex_attributes( $sn,
 587                         $graph->get_vertex_attributes( $n ) );
 588         }
 589         foreach my $e ( $graph->edges ) {
 590                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 591                 $safegraph->add_edge( @safe_e );
 592         }
 593         my $safe_stemma = Text::Tradition::Stemma->new(
 594                 'collation' => $c, 'graph' => $safegraph );
 595
 596         # Now convert the witness groupings to a safe representation.
 597         my $safe_groupings = [];
 598         foreach my $grouping ( @$groupings ) {
 599                 my $safe_grouping = [];
 600                 foreach my $group ( @$grouping ) {
 601                         my $safe_group = [];
 602                         foreach my $n ( @$group ) {
 603                                 my $sn = _safe_witstr( $n );
 604                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 605                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 606                                 $witness_map->{$sn} = $n;
 607                                 push( @$safe_group, $sn );
 608                         }
 609                         push( @$safe_grouping, $safe_group );
 610                 }
 611                 push( @$safe_groupings, $safe_grouping );
 612         }
 613
 614         # Return it all in the struct we expect.  We have stored the reductions
 615         # in the $witness_map that we were passed.
 616         return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
 617                          'groupings' => $safe_groupings };
 618 }
 619
 620 sub _safe_witstr {
 621         my $witstr = shift;
 622         $witstr =~ s/\s+/_/g;
 623         $witstr =~ s/[^\w\d-]//g;
 624         return $witstr;
 625 }
 626
 627 sub _desanitize_names {
 628         my( $jsonstruct, $witness_map ) = @_;
 629         my $result = [];
 630         foreach my $grouping ( @$jsonstruct ) {
 631                 my $real_grouping = [];
 632                 foreach my $element ( @$grouping ) {
 633                         if( ref( $element ) eq 'ARRAY' ) {
 634                                 # it's the groupset.
 635                                 my $real_groupset = [];
 636                                 foreach my $group ( @$element ) {
 637                                         my $real_group = [];
 638                                         foreach my $n ( @$group ) {
 639                                                 my $rn = $witness_map->{$n};
 640                                                 push( @$real_group, $rn );
 641                                         }
 642                                         push( @$real_groupset, $real_group );
 643                                 }
 644                                 push( @$real_grouping, $real_groupset );
 645                         } else {
 646                                 # It is the boolean, not actually a group.
 647                                 push( @$real_grouping, $element );
 648                         }
 649                 }
 650                 push( @$result, $real_grouping );
 651         }
 652         return $result;
 653 }
 654
 655 ### END HACKERY ###
 656
 657 =head2 analyze_location ( $tradition, $graph, $location_hash )
 658
 659 Given the tradition, its stemma graph, and the solution from the graph solver,
 660 work out the rest of the information we want.  For each reading we need missing,
 661 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 662
 663 =cut
 664
 665 sub analyze_location {
 666         my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
 667         my $c = $tradition->collation;
 668
 669         # Make a hash of all known node memberships, and make the subgraphs.
 670         my $contig = {};
 671         my $reading_roots = {};
 672         my $subgraph = {};
 673         my $acstr = $c->ac_label;
 674         my @acwits;
 675         # Note which witnesses positively belong to which group
 676     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 677         my $rid = $rdghash->{'readingid'};
 678         foreach my $wit ( @{$rdghash->{'group'}} ) {
 679                 $contig->{$wit} = $rid;
 680             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 681                 push( @acwits, $1 );
 682             }
 683         }
 684         }
 685         # Get the actual graph we should work with
 686         my $graph;
 687         try {
 688                 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
 689         } catch {
 690                 die "Could not extend graph with a.c. witnesses @acwits";
 691         }
 692
 693         # Now, armed with that knowledge, make a subgraph for each reading
 694         # and note the root(s) of each subgraph.
 695         foreach my $rdghash( @{$variant_row->{'readings'}} ) {
 696         my $rid = $rdghash->{'readingid'};
 697         my %rdgwits;
 698         # Make the subgraph.
 699         my $part = $graph->copy;
 700         my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
 701                 keys %$contig;
 702         $part->delete_vertices( @todelete );
 703         _prune_subtree( $part, $lacunose );
 704                 $subgraph->{$rid} = $part;
 705                 # Record the remaining lacunose nodes as part of this group, if
 706                 # we are dealing with a non-genealogical reading.
 707                 unless( $variant_row->{'genealogical'} ) {
 708                         map { $contig->{$_} = $rid } $part->vertices;
 709                 }
 710                 # Get the reading roots.
 711                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 712         }
 713
 714         # Now that we have all the node group memberships, calculate followed/
 715     # non-followed/unknown values for each reading.  Also figure out the
 716     # reading's evident parent(s).
 717     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 718         my $rid = $rdghash->{'readingid'};
 719         my $rdg = $c->reading( $rid );
 720         # Get the subgraph
 721         my $part = $subgraph->{$rid};
 722
 723         # Start figuring things out.
 724         my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
 725         $rdghash->{'independent_occurrence'} = \@roots;
 726         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 727         # Find the parent readings, if any, of this reading.
 728         my $rdgparents = {};
 729         foreach my $wit ( @roots ) {
 730                 # Look in the main stemma to find this witness's extant or known-reading
 731                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 732                         my @check = $graph->predecessors( $wit );
 733                         while( @check ) {
 734                                 my @next;
 735                                 foreach my $wparent( @check ) {
 736                                         my $preading = $contig->{$wparent};
 737                                         # IDP assigns all nodes, hypothetical included, to a reading
 738                                         # in the case of genealogical sets. We prune non-necessary
 739                                         # hypothetical readings, but they are still in $contig, so
 740                                         # we account for that here.
 741                                         if( $preading && $preading ne $rid ) {
 742                                                 $rdgparents->{$preading} = 1;
 743                                         } else {
 744                                                 push( @next, $graph->predecessors( $wparent ) );
 745                                         }
 746                                 }
 747                                 @check = @next;
 748                         }
 749                 }
 750                 foreach my $p ( keys %$rdgparents ) {
 751                         # Resolve the relationship of the parent to the reading, and
 752                         # save it in our hash.
 753                         my $pobj = $c->reading( $p );
 754                         my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 755                         my $phash = { 'label' => $prep };
 756                         if( $pobj ) {
 757                                 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 758                                 if( $rel ) {
 759                                         _add_to_hash( $rel, $phash );
 760                                 } elsif( $rdg ) {
 761                                         # First check for a transposed relationship
 762                                         if( $rdg->rank != $pobj->rank ) {
 763                                                 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 764                                                         next unless $ti->text eq $rdg->text;
 765                                                         $rel = $c->get_relationship( $ti, $pobj );
 766                                                         if( $rel ) {
 767                                                                 _add_to_hash( $rel, $phash, 1 );
 768                                                                 last;
 769                                                         }
 770                                                 }
 771                                                 unless( $rel ) {
 772                                                         foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 773                                                                 next unless $ti->text eq $pobj->text;
 774                                                                 $rel = $c->get_relationship( $ti, $rdg );
 775                                                                 if( $rel ) {
 776                                                                         _add_to_hash( $rel, $phash, 1 );
 777                                                                         last;
 778                                                                 }
 779                                                         }
 780                                                 }
 781                                         }
 782                                         unless( $rel ) {
 783                                                 # and then check for sheer word similarity.
 784                                                 my $rtext = $rdg->text;
 785                                                 my $ptext = $pobj->text;
 786                                                 if( similar( $rtext, $ptext ) ) {
 787                                                         # say STDERR "Words $rtext and $ptext judged similar";
 788                                                         $phash->{relation} = { type => 'wordsimilar' };
 789                                                 }
 790                                         }
 791                                 } else {
 792                                         $phash->{relation} = { type => 'deletion' };
 793                                 }
 794                                 # Get the attributes of the parent object while we are here
 795                                 $phash->{'text'} = $pobj->text if $pobj;
 796                                 $phash->{'is_nonsense'} = $pobj->is_nonsense;
 797                                 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 798                         } elsif( $p eq '(omitted)' ) {
 799                                 $phash->{relation} = { type => 'addition' };
 800                         }
 801                         # Save it
 802                         $rdgparents->{$p} = $phash;
 803                 }
 804
 805                 $rdghash->{'reading_parents'} = $rdgparents;
 806
 807                 # Find the number of times this reading was altered, and the number of
 808                 # times we're not sure.
 809                 my( %nofollow, %unknownfollow );
 810                 foreach my $wit ( $part->vertices ) {
 811                         foreach my $wchild ( $graph->successors( $wit ) ) {
 812                                 next if $part->has_vertex( $wchild );
 813                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 814                                         # It definitely changed here.
 815                                         $nofollow{$wchild} = 1;
 816                                 } elsif( !($contig->{$wchild}) ) {
 817                                         # The child is a hypothetical node not definitely in
 818                                         # any group. Answer is unknown.
 819                                         $unknownfollow{$wchild} = 1;
 820                                 } # else it's a non-root node in a known group, and therefore
 821                                   # is presumed to have its reading from its group, not this link.
 822                         }
 823                 }
 824                 $rdghash->{'not_followed'} = keys %nofollow;
 825                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 826
 827                 # Now say whether this reading represents a conflict.
 828                 unless( $variant_row->{'genealogical'} ) {
 829                         $rdghash->{'conflict'} = @roots != 1;
 830                 }
 831     }
 832 }
 833
 834 sub _add_to_hash {
 835         my( $rel, $phash, $is_transposed ) = @_;
 836         $phash->{relation} = { type => $rel->type };
 837         $phash->{relation}->{transposed} = 1 if $is_transposed;
 838         $phash->{relation}->{annotation} = $rel->annotation
 839                 if $rel->has_annotation;
 840 }
 841
 842 =head2 similar( $word1, $word2 )
 843
 844 Use Algorithm::Diff to get a sense of how close the words are to each other.
 845 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 846
 847 =cut
 848
 849 #!/usr/bin/env perl
 850
 851 sub similar {
 852         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 853         my @let1 = split( '', lc( $word1 ) );
 854         my @let2 = split( '', lc( $word2 ) );
 855         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 856         my $mag = 0;
 857         while( $diff->Next ) {
 858                 if( $diff->Same ) {
 859                         # Take off points for longer strings
 860                         my $cs = $diff->Range(1) - 2;
 861                         $cs = 0 if $cs < 0;
 862                         $mag -= $cs;
 863                 } elsif( !$diff->Items(1) ) {
 864                         $mag += $diff->Range(2);
 865                 } elsif( !$diff->Items(2) ) {
 866                         $mag += $diff->Range(1);
 867                 } else {
 868                         # Split the difference for substitutions
 869                         my $c1 = $diff->Range(1) || 1;
 870                         my $c2 = $diff->Range(2) || 1;
 871                         my $cd = ( $c1 + $c2 ) / 2;
 872                         $mag += $cd;
 873                 }
 874         }
 875         return ( $mag <= length( $word1 ) / 2 );
 876 }
 877
 878
 879
 880 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 881
 882 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 883 to be correct.  Serves as a backup for the real solver.
 884
 885 Runs an analysis of the given tradition, at the location given in $rank,
 886 against the graph of the stemma specified in $stemma_id.  The argument
 887 @merge_relationship_types is an optional list of relationship types for
 888 which readings so related should be treated as equivalent.
 889
 890 Returns a nested array data structure as follows:
 891
 892  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 893
 894 where the group list is the array of arrays passed in for each element of @groups,
 895 possibly with the addition of hypothetical readings.
 896
 897
 898 =cut
 899
 900 sub perl_solver {
 901         my( $graph, @groups ) = @_;
 902         my @answer;
 903         foreach my $g ( @groups ) {
 904                 push( @answer, _solve_variant_location( $graph, $g ) );
 905         }
 906         return \@answer;
 907 }
 908
 909 sub _solve_variant_location {
 910         my( $graph, $groups ) = @_;
 911         # Now do the work.
 912     my $contig = {};
 913     my $subgraph = {};
 914     my $is_conflicted;
 915     my $conflict = {};
 916
 917     # Mark each ms as in its own group, first.
 918     foreach my $g ( @$groups ) {
 919         my $gst = wit_stringify( $g );
 920         map { $contig->{$_} = $gst } @$g;
 921     }
 922
 923     # Now for each unmarked node in the graph, initialize an array
 924     # for possible group memberships.  We will use this later to
 925     # resolve potential conflicts.
 926     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 927     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 928         my $gst = wit_stringify( $g );  # This is the group name
 929         # Copy the graph, and delete all non-members from the new graph.
 930         my $part = $graph->copy;
 931         my @group_roots;
 932         $part->delete_vertices(
 933             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 934
 935         # Now look to see if our group is connected.
 936                 if( @$g > 1 ) {
 937                         # We have to take directionality into account.
 938                         # How many root nodes do we have?
 939                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 940                                 $part->predecessorless_vertices;
 941                         # Assuming that @$g > 1, find the first root node that has at
 942                         # least one successor belonging to our group. If this reading
 943                         # is genealogical, there should be only one, but we will check
 944                         # that implicitly later.
 945                         foreach my $root ( @roots ) {
 946                                 # Prune the tree to get rid of extraneous hypotheticals.
 947                                 $root = _prune_subtree_old( $part, $root, $contig );
 948                                 next unless $root;
 949                                 # Save this root for our group.
 950                                 push( @group_roots, $root );
 951                                 # Get all the successor nodes of our root.
 952                         }
 953                 } else {
 954                         # Dispense with the trivial case of one reading.
 955                         my $wit = $g->[0];
 956                         @group_roots = ( $wit );
 957                         foreach my $v ( $part->vertices ) {
 958                                 $part->delete_vertex( $v ) unless $v eq $wit;
 959                         }
 960         }
 961
 962         if( @group_roots > 1 ) {
 963                 $conflict->{$gst} = 1;
 964                 $is_conflicted = 1;
 965         }
 966         # Paint the 'hypotheticals' with our group.
 967                 foreach my $wit ( $part->vertices ) {
 968                         if( ref( $contig->{$wit} ) ) {
 969                                 push( @{$contig->{$wit}}, $gst );
 970                         } elsif( $contig->{$wit} ne $gst ) {
 971                                 warn "How did we get here?";
 972                         }
 973                 }
 974
 975
 976                 # Save the relevant subgraph.
 977                 $subgraph->{$gst} = $part;
 978     }
 979
 980         # For each of our hypothetical readings, flatten its 'contig' array if
 981         # the array contains zero or one group.  If we have any unflattened arrays,
 982         # we may need to run the resolution process. If the reading is already known
 983         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 984         # it.
 985         my @resolve;
 986         foreach my $wit ( keys %$contig ) {
 987                 next unless ref( $contig->{$wit} );
 988                 if( @{$contig->{$wit}} > 1 ) {
 989                         if( $is_conflicted ) {
 990                                 $contig->{$wit} = '';  # We aren't going to decide.
 991                         } else {
 992                                 push( @resolve, $wit );
 993                         }
 994                 } else {
 995                         my $gst = pop @{$contig->{$wit}};
 996                         $contig->{$wit} = $gst || '';
 997                 }
 998         }
 999
1000     if( @resolve ) {
1001         my $still_contig = {};
1002         foreach my $h ( @resolve ) {
1003             # For each of the hypothetical readings with more than one possibility,
1004             # try deleting it from each of its member subgraphs in turn, and see
1005             # if that breaks the contiguous grouping.
1006             # TODO This can still break in a corner case where group A can use
1007             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
1008             # Revisit this if necessary; it could get brute-force nasty.
1009             foreach my $gst ( @{$contig->{$h}} ) {
1010                 my $gpart = $subgraph->{$gst}->copy();
1011                 # If we have come this far, there is only one root and everything
1012                 # is reachable from it.
1013                 my( $root ) = $gpart->predecessorless_vertices;
1014                 my $reachable = {};
1015                 map { $reachable->{$_} = 1 } $gpart->vertices;
1016
1017                 # Try deleting the hypothetical node.
1018                 $gpart->delete_vertex( $h );
1019                 if( $h eq $root ) {
1020                         # See if we still have a single root.
1021                         my @roots = $gpart->predecessorless_vertices;
1022                         warn "This shouldn't have happened" unless @roots;
1023                         if( @roots > 1 ) {
1024                                 # $h is needed by this group.
1025                                 if( exists( $still_contig->{$h} ) ) {
1026                                         # Conflict!
1027                                         $conflict->{$gst} = 1;
1028                                         $still_contig->{$h} = '';
1029                                 } else {
1030                                         $still_contig->{$h} = $gst;
1031                                 }
1032                         }
1033                 } else {
1034                         # $h is somewhere in the middle. See if everything
1035                         # else can still be reached from the root.
1036                                         my %still_reachable = ( $root => 1 );
1037                                         map { $still_reachable{$_} = 1 }
1038                                                 $gpart->all_successors( $root );
1039                                         foreach my $v ( keys %$reachable ) {
1040                                                 next if $v eq $h;
1041                                                 if( !$still_reachable{$v}
1042                                                         && ( $contig->{$v} eq $gst
1043                                                                  || ( exists $still_contig->{$v}
1044                                                                           && $still_contig->{$v} eq $gst ) ) ) {
1045                                                         # We need $h.
1046                                                         if( exists $still_contig->{$h} ) {
1047                                                                 # Conflict!
1048                                                                 $conflict->{$gst} = 1;
1049                                                                 $still_contig->{$h} = '';
1050                                                         } else {
1051                                                                 $still_contig->{$h} = $gst;
1052                                                         }
1053                                                         last;
1054                                                 } # else we don't need $h in this group.
1055                                         } # end foreach $v
1056                                 } # endif $h eq $root
1057             } # end foreach $gst
1058         } # end foreach $h
1059
1060         # Now we have some hypothetical vertices in $still_contig that are the
1061         # "real" group memberships.  Replace these in $contig.
1062                 foreach my $v ( keys %$contig ) {
1063                         next unless ref $contig->{$v};
1064                         $contig->{$v} = $still_contig->{$v};
1065                 }
1066     } # end if @resolve
1067
1068     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
1069         my $variant_row = [ [], $is_genealogical ];
1070         # Fill in the groupings from $contig.
1071         foreach my $g ( @$groups ) {
1072         my $gst = wit_stringify( $g );
1073         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
1074         push( @{$variant_row->[0]}, \@realgroup );
1075     }
1076     return $variant_row;
1077 }
1078
1079 sub _prune_group {
1080         my( $group, $stemma, $graph ) = @_;
1081         my $lacunose = {};
1082         map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
1083         map { $lacunose->{$_} = 0 } @$group;
1084         # Make our subgraph
1085         my $subgraph = $graph->copy;
1086         map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
1087                 $subgraph->vertices;
1088         # ...and find the root.
1089         # Now prune and return the remaining vertices.
1090         _prune_subtree( $subgraph, $lacunose );
1091         return $subgraph->vertices;
1092 }
1093
1094 sub _prune_subtree {
1095         my( $tree, $lacunose ) = @_;
1096
1097         # Delete lacunose witnesses that have no successors
1098     my @orphan_hypotheticals;
1099     my $ctr = 0;
1100     do {
1101         die "Infinite loop on leaves" if $ctr > 100;
1102         @orphan_hypotheticals = grep { $lacunose->{$_} }
1103                 $tree->successorless_vertices;
1104         $tree->delete_vertices( @orphan_hypotheticals );
1105         $ctr++;
1106     } while( @orphan_hypotheticals );
1107
1108         # Delete lacunose roots that have a single successor
1109         my @redundant_root;
1110         $ctr = 0;
1111         do {
1112         die "Infinite loop on roots" if $ctr > 100;
1113                 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1114                         $tree->predecessorless_vertices;
1115                 $tree->delete_vertices( @redundant_root );
1116                 $ctr++;
1117         } while( @redundant_root );
1118 }
1119
1120 sub _prune_subtree_old {
1121     my( $tree, $root, $contighash ) = @_;
1122     # First, delete hypothetical leaves / orphans until there are none left.
1123     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1124         $tree->successorless_vertices;
1125     while( @orphan_hypotheticals ) {
1126         $tree->delete_vertices( @orphan_hypotheticals );
1127         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1128             $tree->successorless_vertices;
1129     }
1130     # Then delete a hypothetical root with only one successor, moving the
1131     # root to the first child that has no other predecessors.
1132     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1133         my @nextroot = $tree->successors( $root );
1134         $tree->delete_vertex( $root );
1135         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
1136     }
1137     # The tree has been modified in place, but we need to know the new root.
1138     $root = undef unless $root && $tree->has_vertex( $root );
1139     return $root;
1140 }
1141 # Add the variant, subject to a.c. representation logic.
1142 # This assumes that we will see the 'main' version before the a.c. version.
1143 sub add_variant_wit {
1144     my( $arr, $wit, $acstr ) = @_;
1145     my $skip;
1146     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1147         my $real = $1;
1148         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1149     }
1150     push( @$arr, $wit ) unless $skip;
1151 }
1152
1153 sub _useful_variant {
1154         my( $group_readings, $graph, $acstr ) = @_;
1155
1156         # TODO Decide what to do with AC witnesses
1157
1158         # Sort by group size and return
1159         my $is_useful = 0;
1160         my( @readings, @groups );   # The sorted groups for our answer.
1161         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1162                 keys %$group_readings ) {
1163                 push( @readings, $rdg );
1164                 push( @groups, $group_readings->{$rdg} );
1165                 if( @{$group_readings->{$rdg}} > 1 ) {
1166                         $is_useful++;
1167                 } else {
1168                         my( $wit ) = @{$group_readings->{$rdg}};
1169                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1170                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1171                 }
1172         }
1173         if( $is_useful > 1 ) {
1174                 return( \@readings, \@groups );
1175         } else {
1176                 return( [], [] );
1177         }
1178 }
1179
1180 =head2 wit_stringify( $groups )
1181
1182 Takes an array of witness groupings and produces a string like
1183 ['A','B'] / ['C','D','E'] / ['F']
1184
1185 =cut
1186
1187 sub wit_stringify {
1188     my $groups = shift;
1189     my @gst;
1190     # If we were passed an array of witnesses instead of an array of
1191     # groupings, then "group" the witnesses first.
1192     unless( ref( $groups->[0] ) ) {
1193         my $mkgrp = [ $groups ];
1194         $groups = $mkgrp;
1195     }
1196     foreach my $g ( @$groups ) {
1197         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1198     }
1199     return join( ' / ', @gst );
1200 }
1201
1202 sub _symmdiff {
1203         my( $lista, $listb ) = @_;
1204         my %union;
1205         my %scalars;
1206         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1207         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1208         my @set = grep { $union{$_} == 1 } keys %union;
1209         return map { $scalars{$_} } @set;
1210 }
1211
1212 1;
1213
1214 =head1 LICENSE
1215
1216 This package is free software and is provided "as is" without express
1217 or implied warranty.  You can redistribute it and/or modify it under
1218 the same terms as Perl itself.
1219
1220 =head1 AUTHOR
1221
1222 Tara L Andrews E<lt>aurum@cpan.orgE<gt>