lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Benchmark;
   7 use Encode qw/ encode_utf8 /;
   8 use Exporter 'import';
   9 use Graph;
  10 use JSON qw/ encode_json decode_json /;
  11 use LWP::UserAgent;
  12 use Text::Tradition;
  13 use Text::Tradition::Stemma;
  14 use TryCatch;
  15
  16 use vars qw/ @EXPORT_OK /;
  17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  18
  19 =head1 NAME
  20
  21 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  22
  23 =head1 SYNOPSIS
  24
  25   use Text::Tradition;
  26   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  27   my $t = Text::Tradition->new(
  28     'name' => 'this is a text',
  29     'input' => 'TEI',
  30     'file' => '/path/to/tei_parallel_seg_file.xml' );
  31   $t->add_stemma( 'dotfile' => $stemmafile );
  32
  33   my $variant_data = run_analysis( $tradition );
  34   # Recalculate rank $n treating all orthographic variants as equivalent
  35   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  36
  37 =head1 DESCRIPTION
  38
  39 Text::Tradition is a library for representation and analysis of collated
  40 texts, particularly medieval ones.  The Collation is the central feature of
  41 a Tradition, where the text, its sequence of readings, and its relationships
  42 between readings are actually kept.
  43
  44 =head1 SUBROUTINES
  45
  46 =head2 run_analysis( $tradition, %opts )
  47
  48 Runs the analysis described in analyze_variant_location on every location in the
  49 collation of the given tradition, with the given options. These include:
  50
  51 =over 4
  52
  53 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  54 is 0 (i.e. the first).
  55
  56 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  57
  58 =item * merge_types - Specify a list of relationship types, where related readings
  59 should be treated as identical for the purposes of analysis.
  60
  61 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  62
  63 =back
  64
  65 =begin testing
  66
  67 use Text::Tradition;
  68 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  69
  70 my $datafile = 't/data/florilegium_tei_ps.xml';
  71 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  72                                       'name' => 'test0',
  73                                       'file' => $datafile );
  74 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  75 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  76
  77 my %expected_genealogical = (
  78         1 => 0,
  79         2 => 1,
  80         3 =>  0,
  81         5 =>  0,
  82         7 =>  0,
  83         8 =>  0,
  84         10 => 0,
  85         13 => 1,
  86         33 => 0,
  87         34 => 0,
  88         37 => 0,
  89         60 => 0,
  90         81 => 1,
  91         84 => 0,
  92         87 => 0,
  93         101 => 0,
  94         102 => 0,
  95         122 => 1,
  96         157 => 0,
  97         166 => 1,
  98         169 => 1,
  99         200 => 0,
 100         216 => 1,
 101         217 => 1,
 102         219 => 1,
 103         241 => 1,
 104         242 => 1,
 105         243 => 1,
 106 );
 107
 108 my $data = run_analysis( $tradition );
 109 my $c = $tradition->collation;
 110 foreach my $row ( @{$data->{'variants'}} ) {
 111         # Account for rows that used to be "not useful"
 112         unless( exists $expected_genealogical{$row->{'id'}} ) {
 113                 $expected_genealogical{$row->{'id'}} = 1;
 114         }
 115         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 116         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 117                 "Got correct genealogical flag for row " . $row->{'id'} );
 118         # Check that we have the right row with the right groups
 119         my $rank = $row->{'id'};
 120         foreach my $rdghash ( @{$row->{'readings'}} ) {
 121                 # Skip 'readings' that aren't really
 122                 next unless $c->reading( $rdghash->{'readingid'} );
 123                 # Check the rank
 124                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 125                         "Got correct reading rank" );
 126                 # Check the witnesses
 127                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 128                 my @sgrp = sort @{$rdghash->{'group'}};
 129                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 130         }
 131 }
 132 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 133 # TODO Make something meaningful of conflict count, maybe test other bits
 134
 135 =end testing
 136
 137 =cut
 138
 139 sub run_analysis {
 140         my( $tradition, %opts ) = @_;
 141         my $c = $tradition->collation;
 142
 143         my $stemma_id = $opts{'stemma_id'} || 0;
 144         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 145         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 146
 147         # Get the stemma
 148         my $stemma = $tradition->stemma( $stemma_id );
 149
 150         # Figure out which witnesses we are working with - that is, the ones that
 151         # appear both in the stemma and in the tradition. All others are 'lacunose'
 152         # for our purposes.
 153         my @lacunose = $stemma->hypotheticals;
 154         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 155         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 156
 157         # Find and mark 'common' ranks for exclusion, unless they were
 158         # explicitly specified.
 159         unless( @ranks ) {
 160                 my %common_rank;
 161                 foreach my $rdg ( $c->common_readings ) {
 162                         $common_rank{$rdg->rank} = 1;
 163                 }
 164                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 165         }
 166
 167         # Group the variants to send to the solver
 168         my @groups;
 169         my @use_ranks;
 170         my %lacunae;
 171         my $moved = {};
 172         foreach my $rank ( @ranks ) {
 173                 my $missing = [ @lacunose ];
 174                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
 175                 # Filter out any empty rankgroups
 176                 # (e.g. from the later rank for a transposition)
 177                 next unless keys %$rankgroup;
 178                 if( $opts{'exclude_type1'} ) {
 179                         # Check to see whether this is a "useful" group.
 180                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 181                                 $stemma->graph, $c->ac_label );
 182                         next unless @$rdgs;
 183                 }
 184                 push( @use_ranks, $rank );
 185                 push( @groups, $rankgroup );
 186                 $lacunae{$rank} = $missing;
 187         }
 188         # Run the solver
 189         my $answer = solve_variants( $stemma, @groups );
 190
 191         # Do further analysis on the answer
 192         my $conflict_count = 0;
 193         my $aclabel = $c->ac_label;
 194         foreach my $idx ( 0 .. $#use_ranks ) {
 195                 my $location = $answer->{'variants'}->[$idx];
 196                 # Add the rank back in
 197                 my $rank = $use_ranks[$idx];
 198                 $location->{'id'} = $rank;
 199                 # Note what our lacunae are
 200                 my %lmiss;
 201                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 202                 $location->{'missing'} = [ keys %lmiss ];
 203
 204                 # Run the extra analysis we need.
 205                 ## TODO We run through all the variants in this call, so
 206                 ## why not add the reading data there instead of here below?
 207                 analyze_location( $tradition, $stemma, $location, \%lmiss );
 208
 209                 my @layerwits;
 210                 # Do the final post-analysis tidying up of the data.
 211                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 212                         $conflict_count++
 213                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 214                         # Add the reading text back in, setting display value as needed
 215                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 216                         if( $rdg ) {
 217                                 $rdghash->{'text'} = $rdg->text .
 218                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 219                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 220                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 221                         }
 222                         # Remove lacunose witnesses from this reading's list now that the
 223                         # analysis is done
 224                         my @realgroup;
 225                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 226                         $rdghash->{'group'} = \@realgroup;
 227                         # Note any layered witnesses that appear in this group
 228                         foreach( @realgroup ) {
 229                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 230                                         push( @layerwits, $1 );
 231                                 }
 232                         }
 233                 }
 234                 $location->{'layerwits'} = \@layerwits if @layerwits;
 235         }
 236         $answer->{'conflict_count'} = $conflict_count;
 237
 238         return $answer;
 239 }
 240
 241 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 242
 243 Groups the variants at the given $rank of the collation, treating any
 244 relationships in @merge_relationship_types as equivalent.  $lacunose should
 245 be a reference to an array, to which the sigla of lacunose witnesses at this
 246 rank will be appended; $transposed should be a reference to a hash, wherein
 247 the identities of transposed readings and their relatives will be stored.
 248
 249 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 250 in $group_readings->{$rdg}.
 251
 252 =cut
 253
 254 # Return group_readings, groups, lacunose
 255 sub group_variants {
 256         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 257         my $c = $tradition->collation;
 258         my $aclabel = $c->ac_label;
 259         my $table = $c->alignment_table;
 260         # Get the alignment table readings
 261         my %readings_at_rank;
 262         my %is_lacunose; # lookup table for witnesses not in stemma
 263         map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
 264         my @check_for_gaps;
 265         my %moved_wits;
 266         my $has_transposition;
 267         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 268                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 269                 my $wit = $tablewit->{'witness'};
 270                 # Exclude the witness if it is "lacunose" which if we got here
 271                 # means "not in the stemma".
 272                 next if $is_lacunose{$wit};
 273                 # Note if the witness is actually in a lacuna
 274                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 275                         _add_to_witlist( $wit, $lacunose, $aclabel );
 276                 # Otherwise the witness either has a positive reading...
 277                 } elsif( $rdg ) {
 278                         # If the reading has been counted elsewhere as a transposition, ignore it.
 279                         if( $transposed->{$rdg->{'t'}->id} ) {
 280                                 # TODO Does this cope with three-way transpositions?
 281                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 282                                 next;
 283                         }
 284                         # Otherwise, record it...
 285                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 286                         # ...and grab any transpositions, and their relations.
 287                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 288                         foreach my $trdg ( @transp ) {
 289                                 next if exists $readings_at_rank{$trdg->id};
 290                                 $has_transposition = 1;
 291                                 my @affected_wits = _table_witnesses(
 292                                         $table, $trdg, \%is_lacunose, $aclabel );
 293                                 next unless @affected_wits;
 294                                 map { $moved_wits{$_} = 1 } @affected_wits;
 295                                 $transposed->{$trdg->id} =
 296                                         [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
 297                                 $readings_at_rank{$trdg->id} = $trdg;
 298                         }
 299                 # ...or it is empty, ergo a gap.
 300                 } else {
 301                         _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
 302                 }
 303         }
 304         my @gap_wits;
 305         map { _add_to_witlist( $_, \@gap_wits, $aclabel )
 306                 unless $moved_wits{$_} } @check_for_gaps;
 307         # Group the readings, collapsing groups by relationship if needed
 308         my $grouped_readings = {};
 309         foreach my $rdg ( values %readings_at_rank ) {
 310                 # Skip readings that have been collapsed into others.
 311                 next if exists $grouped_readings->{$rdg->id}
 312                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 313                 # Get the witness list, including from readings collapsed into this one.
 314                 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
 315                 if( $collapse && @$collapse ) {
 316                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 317                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 318                                 my @otherwits = _table_witnesses(
 319                                         $table, $other, \%is_lacunose, $aclabel );
 320                                 push( @wits, @otherwits );
 321                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 322                         }
 323                 }
 324                 $grouped_readings->{$rdg->id} = \@wits;
 325         }
 326         $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
 327         # Get rid of our collapsed readings
 328         map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
 329                 keys %$grouped_readings
 330                 if $collapse;
 331
 332         # If something was transposed, check the groups for doubled-up readings
 333         if( $has_transposition ) {
 334                 # print STDERR "Group for rank $rank:\n";
 335                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 336                 #       keys %$grouped_readings;
 337                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 338         }
 339
 340         # Return the result
 341         return $grouped_readings;
 342 }
 343
 344 # Helper function to query the alignment table for all witnesses (a.c. included)
 345 # that have a given reading at its rank.
 346 sub _table_witnesses {
 347         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 348         my $tableidx = $trdg->rank - 1;
 349         my @has_reading;
 350         foreach my $row ( @{$table->{'alignment'}} ) {
 351                 my $wit = $row->{'witness'};
 352                 next if $lacunose->{$wit};
 353                 my $rdg = $row->{'tokens'}->[$tableidx];
 354                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 355                 _add_to_witlist( $wit, \@has_reading, $aclabel )
 356                         if $rdg->{'t'}->id eq $trdg->id;
 357         }
 358         return @has_reading;
 359 }
 360
 361 # Helper function to ensure that X and X a.c. never appear in the same list.
 362 sub _add_to_witlist {
 363         my( $wit, $list, $acstr ) = @_;
 364         my %inlist;
 365         my $idx = 0;
 366         map { $inlist{$_} = $idx++ } @$list;
 367         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 368                 my $acwit = $1;
 369                 unless( exists $inlist{$acwit} ) {
 370                         push( @$list, $acwit.$acstr );
 371                 }
 372         } else {
 373                 if( exists( $inlist{$wit.$acstr} ) ) {
 374                         # Replace the a.c. version with the main witness
 375                         my $i = $inlist{$wit.$acstr};
 376                         $list->[$i] = $wit;
 377                 } else {
 378                         push( @$list, $wit );
 379                 }
 380         }
 381 }
 382
 383 sub _check_transposed_consistency {
 384         my( $c, $rank, $transposed, $groupings ) = @_;
 385         my %seen_wits;
 386         my %thisrank;
 387         # Note which readings are actually at this rank, and which witnesses
 388         # belong to which reading.
 389         foreach my $rdg ( keys %$groupings ) {
 390                 my $rdgobj = $c->reading( $rdg );
 391                 # Count '(omitted)' as a reading at this rank
 392                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 393                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 394         }
 395         # Our work is done if we have no witness belonging to more than one
 396         # reading.
 397         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 398         return unless @doubled;
 399         # If we have a symmetric related transposition, drop the non-rank readings.
 400         if( @doubled == scalar keys %seen_wits ) {
 401                 foreach my $rdg ( keys %$groupings ) {
 402                         if( !$thisrank{$rdg} ) {
 403                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 404                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 405                                         keys %thisrank;
 406                                 delete $groupings->{$rdg};
 407                                 # If we found a group match, assume there is a symmetry happening.
 408                                 # TODO think more about this
 409                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 410                                 unless( $matched ) {
 411                                         delete $transposed->{$rdg};
 412                                         warn "Found problem in evident symmetry with reading $rdg";
 413                                 }
 414                         }
 415                 }
 416         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 417         } else {
 418                 foreach my $dup ( @doubled ) {
 419                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 420                                 next if $thisrank{$rdg};
 421                                 next unless exists $groupings->{$rdg};
 422                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 423                                 delete $groupings->{$rdg};
 424                                 delete $transposed->{$rdg};
 425                         }
 426                 }
 427                 # and put any now-orphaned readings into an 'omitted' reading.
 428                 foreach my $wit ( keys %seen_wits ) {
 429                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 430                                 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
 431                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 432                         }
 433                 }
 434         }
 435 }
 436
 437 =head2 solve_variants( $graph, @groups )
 438
 439 Sends the set of groups to the external graph solver service and returns
 440 a cleaned-up answer, adding the rank IDs back where they belong.
 441
 442 The JSON has the form
 443   { "graph": [ stemmagraph DOT string without newlines ],
 444     "groupings": [ array of arrays of groups, one per rank ] }
 445
 446 The answer has the form
 447   { "variants" => [ array of variant location structures ],
 448     "variant_count" => total,
 449     "conflict_count" => number of conflicts detected,
 450     "genealogical_count" => number of solutions found }
 451
 452 =cut
 453
 454 sub solve_variants {
 455         my( $stemma, @groups ) = @_;
 456         my $aclabel = $stemma->collation->ac_label;
 457
 458         # Filter the groups down to distinct groups, and work out what graph
 459         # should be used in the calculation of each group. We want to send each
 460         # distinct problem to the solver only once.
 461         # We need a whole bunch of lookup tables for this.
 462         my $index_groupkeys = {};       # Save the order of readings
 463         my $group_indices = {};         # Save the indices that have a given grouping
 464         my $graph_problems = {};        # Save the groupings for the given graph
 465
 466         foreach my $idx ( 0..$#groups ) {
 467                 my $ghash = $groups[$idx];
 468                 my @grouping;
 469                 # Sort the groupings from big to little, and scan for a.c. witnesses
 470                 # that would need an extended graph.
 471                 my @acwits;   # note which AC witnesses crop up at this rank
 472                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 473                         keys %$ghash;
 474                 foreach my $rdg ( @idxkeys ) {
 475                         my @sg = sort @{$ghash->{$rdg}};
 476                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 477                         push( @grouping, \@sg );
 478                 }
 479                 # Save the reading order
 480                 $index_groupkeys->{$idx} = \@idxkeys;
 481
 482                 # Now associate the distinct group with this index
 483                 my $gstr = wit_stringify( \@grouping );
 484                 push( @{$group_indices->{$gstr}}, $idx );
 485
 486                 # Finally, add the group to the list to be calculated for this graph.
 487                 map { s/\Q$aclabel\E$// } @acwits;
 488                 my $graph;
 489                 try {
 490                         $graph = $stemma->extend_graph( \@acwits );
 491                 } catch {
 492                         die "Unable to extend graph with @acwits";
 493                 }
 494                 unless( exists $graph_problems->{"$graph"} ) {
 495                         $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
 496                 }
 497                 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
 498         }
 499
 500         ## For each distinct graph, send its groups to the solver.
 501         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 502         my $ua = LWP::UserAgent->new();
 503         ## Witness map is a HACK to get around limitations in node names from IDP
 504         my $witness_map = {};
 505         ## Variables to store answers as they come back
 506         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 507         my $genealogical = 0;
 508         foreach my $graphkey ( keys %$graph_problems ) {
 509                 my $graph = $graph_problems->{$graphkey}->{'object'};
 510                 my $groupings = $graph_problems->{$graphkey}->{'groups'};
 511                 my $req = _safe_wit_strings( $graph, $stemma->collation,
 512                         $groupings, $witness_map );
 513                 $req->{'command'} = 'findGroupings';
 514                 my $json = encode_json( $req );
 515                 # Send it off and get the result
 516                 #print STDERR "Sending request: $json\n";
 517                 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 518                                                           'Content' => $json );
 519                 my $answer;
 520                 my $used_idp;
 521                 if( $resp->is_success ) {
 522                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 523                         $used_idp = 1;
 524                 } else {
 525                         # Fall back to the old method.
 526                         warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 527                                 . "; falling back to perl method";
 528                         $answer = perl_solver( $graph, @$groupings );
 529                 }
 530                 ## The answer is the evaluated groupings, plus a boolean for whether
 531                 ## they were genealogical.  Reconstruct our original groups.
 532                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 533                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 534                         if( $result ) {
 535                                 $genealogical++;
 536                                 # Prune the calculated groups, in case the IDP solver failed to.
 537                                 if( $used_idp ) {
 538                                         my @pruned_groups;
 539                                         foreach my $cg ( @$calc_groups ) {
 540                                                 # This is a little wasteful but the path of least
 541                                                 # resistance. Send both the stemma, which knows what
 542                                                 # its hypotheticals are, and the actual graph used.
 543                                                 my @pg = _prune_group( $cg, $stemma, $graph );
 544                                                 push( @pruned_groups, \@pg );
 545                                         }
 546                                         $calc_groups = \@pruned_groups;
 547                                 }
 548                         }
 549                         # Retrieve the key for the original group that went to the solver
 550                         my $input_group = wit_stringify( $groupings->[$gidx] );
 551                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 552                                 my @readings = @{$index_groupkeys->{$oidx}};
 553                                 my $vstruct = {
 554                                         'genealogical' => $result,
 555                                         'readings' => [],
 556                                 };
 557                                 foreach my $ridx ( 0 .. $#readings ) {
 558                                         push( @{$vstruct->{'readings'}},
 559                                                 { 'readingid' => $readings[$ridx],
 560                                                   'group' => $calc_groups->[$ridx] } );
 561                                 }
 562                                 $variants->[$oidx] = $vstruct;
 563                         }
 564                 }
 565         }
 566
 567         return { 'variants' => $variants,
 568                          'variant_count' => scalar @$variants,
 569                          'genealogical_count' => $genealogical };
 570 }
 571
 572 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 573
 574 sub _safe_wit_strings {
 575         my( $graph, $c, $groupings, $witness_map ) = @_;
 576         # Parse the graph we were given into a stemma.
 577         my $safegraph = Graph->new();
 578         # Convert the graph to a safe representation and store the conversion.
 579         foreach my $n ( $graph->vertices ) {
 580                 my $sn = _safe_witstr( $n );
 581                 if( exists $witness_map->{$sn} ) {
 582                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 583                                 if $witness_map->{$sn} ne $n;
 584                 } else {
 585                         $witness_map->{$sn} = $n;
 586                 }
 587                 $safegraph->add_vertex( $sn );
 588                 $safegraph->set_vertex_attributes( $sn,
 589                         $graph->get_vertex_attributes( $n ) );
 590         }
 591         foreach my $e ( $graph->edges ) {
 592                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 593                 $safegraph->add_edge( @safe_e );
 594         }
 595         my $safe_stemma = Text::Tradition::Stemma->new(
 596                 'collation' => $c, 'graph' => $safegraph );
 597
 598         # Now convert the witness groupings to a safe representation.
 599         my $safe_groupings = [];
 600         foreach my $grouping ( @$groupings ) {
 601                 my $safe_grouping = [];
 602                 foreach my $group ( @$grouping ) {
 603                         my $safe_group = [];
 604                         foreach my $n ( @$group ) {
 605                                 my $sn = _safe_witstr( $n );
 606                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 607                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 608                                 $witness_map->{$sn} = $n;
 609                                 push( @$safe_group, $sn );
 610                         }
 611                         push( @$safe_grouping, $safe_group );
 612                 }
 613                 push( @$safe_groupings, $safe_grouping );
 614         }
 615
 616         # Return it all in the struct we expect.  We have stored the reductions
 617         # in the $witness_map that we were passed.
 618         return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
 619                          'groupings' => $safe_groupings };
 620 }
 621
 622 sub _safe_witstr {
 623         my $witstr = shift;
 624         $witstr =~ s/\s+/_/g;
 625         $witstr =~ s/[^\w\d-]//g;
 626         return $witstr;
 627 }
 628
 629 sub _desanitize_names {
 630         my( $jsonstruct, $witness_map ) = @_;
 631         my $result = [];
 632         foreach my $grouping ( @$jsonstruct ) {
 633                 my $real_grouping = [];
 634                 foreach my $element ( @$grouping ) {
 635                         if( ref( $element ) eq 'ARRAY' ) {
 636                                 # it's the groupset.
 637                                 my $real_groupset = [];
 638                                 foreach my $group ( @$element ) {
 639                                         my $real_group = [];
 640                                         foreach my $n ( @$group ) {
 641                                                 my $rn = $witness_map->{$n};
 642                                                 push( @$real_group, $rn );
 643                                         }
 644                                         push( @$real_groupset, $real_group );
 645                                 }
 646                                 push( @$real_grouping, $real_groupset );
 647                         } else {
 648                                 # It is the boolean, not actually a group.
 649                                 push( @$real_grouping, $element );
 650                         }
 651                 }
 652                 push( @$result, $real_grouping );
 653         }
 654         return $result;
 655 }
 656
 657 ### END HACKERY ###
 658
 659 =head2 analyze_location ( $tradition, $graph, $location_hash )
 660
 661 Given the tradition, its stemma graph, and the solution from the graph solver,
 662 work out the rest of the information we want.  For each reading we need missing,
 663 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 664
 665 =cut
 666
 667 sub analyze_location {
 668         my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
 669         my $c = $tradition->collation;
 670
 671         # Make a hash of all known node memberships, and make the subgraphs.
 672         my $contig = {};
 673         my $reading_roots = {};
 674         my $subgraph = {};
 675         my $acstr = $c->ac_label;
 676         my @acwits;
 677         # Note which witnesses positively belong to which group
 678     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 679         my $rid = $rdghash->{'readingid'};
 680         foreach my $wit ( @{$rdghash->{'group'}} ) {
 681                 $contig->{$wit} = $rid;
 682             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 683                 push( @acwits, $1 );
 684             }
 685         }
 686         }
 687         # Get the actual graph we should work with
 688         my $graph;
 689         try {
 690                 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
 691         } catch {
 692                 die "Could not extend graph with a.c. witnesses @acwits";
 693         }
 694
 695         # Now, armed with that knowledge, make a subgraph for each reading
 696         # and note the root(s) of each subgraph.
 697         foreach my $rdghash( @{$variant_row->{'readings'}} ) {
 698         my $rid = $rdghash->{'readingid'};
 699         my %rdgwits;
 700         # Make the subgraph.
 701         my $part = $graph->copy;
 702         my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
 703                 keys %$contig;
 704         $part->delete_vertices( @todelete );
 705         _prune_subtree( $part, $lacunose );
 706                 $subgraph->{$rid} = $part;
 707                 # Record the remaining lacunose nodes as part of this group, if
 708                 # we are dealing with a non-genealogical reading.
 709                 unless( $variant_row->{'genealogical'} ) {
 710                         map { $contig->{$_} = $rid } $part->vertices;
 711                 }
 712                 # Get the reading roots.
 713                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 714         }
 715
 716         # Now that we have all the node group memberships, calculate followed/
 717     # non-followed/unknown values for each reading.  Also figure out the
 718     # reading's evident parent(s).
 719     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 720         my $rid = $rdghash->{'readingid'};
 721         my $rdg = $c->reading( $rid );
 722         # Get the subgraph
 723         my $part = $subgraph->{$rid};
 724
 725         # Start figuring things out.
 726         my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
 727         $rdghash->{'independent_occurrence'} = \@roots;
 728         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 729         # Find the parent readings, if any, of this reading.
 730         my $rdgparents = {};
 731         foreach my $wit ( @roots ) {
 732                 # Look in the main stemma to find this witness's extant or known-reading
 733                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 734                         my @check = $graph->predecessors( $wit );
 735                         while( @check ) {
 736                                 my @next;
 737                                 foreach my $wparent( @check ) {
 738                                         my $preading = $contig->{$wparent};
 739                                         # IDP assigns all nodes, hypothetical included, to a reading
 740                                         # in the case of genealogical sets. We prune non-necessary
 741                                         # hypothetical readings, but they are still in $contig, so
 742                                         # we account for that here.
 743                                         if( $preading && $preading ne $rid ) {
 744                                                 $rdgparents->{$preading} = 1;
 745                                         } else {
 746                                                 push( @next, $graph->predecessors( $wparent ) );
 747                                         }
 748                                 }
 749                                 @check = @next;
 750                         }
 751                 }
 752                 foreach my $p ( keys %$rdgparents ) {
 753                         # Resolve the relationship of the parent to the reading, and
 754                         # save it in our hash.
 755                         my $pobj = $c->reading( $p );
 756                         my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 757                         my $phash = { 'label' => $prep };
 758                         if( $pobj ) {
 759                                 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 760                                 if( $rel ) {
 761                                         _add_to_hash( $rel, $phash );
 762                                 } elsif( $rdg ) {
 763                                         # First check for a transposed relationship
 764                                         if( $rdg->rank != $pobj->rank ) {
 765                                                 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 766                                                         next unless $ti->text eq $rdg->text;
 767                                                         $rel = $c->get_relationship( $ti, $pobj );
 768                                                         if( $rel ) {
 769                                                                 _add_to_hash( $rel, $phash, 1 );
 770                                                                 last;
 771                                                         }
 772                                                 }
 773                                                 unless( $rel ) {
 774                                                         foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 775                                                                 next unless $ti->text eq $pobj->text;
 776                                                                 $rel = $c->get_relationship( $ti, $rdg );
 777                                                                 if( $rel ) {
 778                                                                         _add_to_hash( $rel, $phash, 1 );
 779                                                                         last;
 780                                                                 }
 781                                                         }
 782                                                 }
 783                                         }
 784                                         unless( $rel ) {
 785                                                 # and then check for sheer word similarity.
 786                                                 my $rtext = $rdg->text;
 787                                                 my $ptext = $pobj->text;
 788                                                 if( similar( $rtext, $ptext ) ) {
 789                                                         # say STDERR "Words $rtext and $ptext judged similar";
 790                                                         $phash->{relation} = { type => 'wordsimilar' };
 791                                                 }
 792                                         }
 793                                 } else {
 794                                         $phash->{relation} = { type => 'deletion' };
 795                                 }
 796                                 # Get the attributes of the parent object while we are here
 797                                 $phash->{'text'} = $pobj->text if $pobj;
 798                                 $phash->{'is_nonsense'} = $pobj->is_nonsense;
 799                                 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 800                         } elsif( $p eq '(omitted)' ) {
 801                                 $phash->{relation} = { type => 'addition' };
 802                         }
 803                         # Save it
 804                         $rdgparents->{$p} = $phash;
 805                 }
 806
 807                 $rdghash->{'reading_parents'} = $rdgparents;
 808
 809                 # Find the number of times this reading was altered, and the number of
 810                 # times we're not sure.
 811                 my( %nofollow, %unknownfollow );
 812                 foreach my $wit ( $part->vertices ) {
 813                         foreach my $wchild ( $graph->successors( $wit ) ) {
 814                                 next if $part->has_vertex( $wchild );
 815                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 816                                         # It definitely changed here.
 817                                         $nofollow{$wchild} = 1;
 818                                 } elsif( !($contig->{$wchild}) ) {
 819                                         # The child is a hypothetical node not definitely in
 820                                         # any group. Answer is unknown.
 821                                         $unknownfollow{$wchild} = 1;
 822                                 } # else it's a non-root node in a known group, and therefore
 823                                   # is presumed to have its reading from its group, not this link.
 824                         }
 825                 }
 826                 $rdghash->{'not_followed'} = keys %nofollow;
 827                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 828
 829                 # Now say whether this reading represents a conflict.
 830                 unless( $variant_row->{'genealogical'} ) {
 831                         $rdghash->{'conflict'} = @roots != 1;
 832                 }
 833     }
 834 }
 835
 836 sub _add_to_hash {
 837         my( $rel, $phash, $is_transposed ) = @_;
 838         $phash->{relation} = { type => $rel->type };
 839         $phash->{relation}->{transposed} = 1 if $is_transposed;
 840         $phash->{relation}->{annotation} = $rel->annotation
 841                 if $rel->has_annotation;
 842 }
 843
 844 =head2 similar( $word1, $word2 )
 845
 846 Use Algorithm::Diff to get a sense of how close the words are to each other.
 847 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 848
 849 =cut
 850
 851 #!/usr/bin/env perl
 852
 853 sub similar {
 854         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 855         my @let1 = split( '', lc( $word1 ) );
 856         my @let2 = split( '', lc( $word2 ) );
 857         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 858         my $mag = 0;
 859         while( $diff->Next ) {
 860                 if( $diff->Same ) {
 861                         # Take off points for longer strings
 862                         my $cs = $diff->Range(1) - 2;
 863                         $cs = 0 if $cs < 0;
 864                         $mag -= $cs;
 865                 } elsif( !$diff->Items(1) ) {
 866                         $mag += $diff->Range(2);
 867                 } elsif( !$diff->Items(2) ) {
 868                         $mag += $diff->Range(1);
 869                 } else {
 870                         # Split the difference for substitutions
 871                         my $c1 = $diff->Range(1) || 1;
 872                         my $c2 = $diff->Range(2) || 1;
 873                         my $cd = ( $c1 + $c2 ) / 2;
 874                         $mag += $cd;
 875                 }
 876         }
 877         return ( $mag <= length( $word1 ) / 2 );
 878 }
 879
 880
 881
 882 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 883
 884 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 885 to be correct.  Serves as a backup for the real solver.
 886
 887 Runs an analysis of the given tradition, at the location given in $rank,
 888 against the graph of the stemma specified in $stemma_id.  The argument
 889 @merge_relationship_types is an optional list of relationship types for
 890 which readings so related should be treated as equivalent.
 891
 892 Returns a nested array data structure as follows:
 893
 894  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 895
 896 where the group list is the array of arrays passed in for each element of @groups,
 897 possibly with the addition of hypothetical readings.
 898
 899
 900 =cut
 901
 902 sub perl_solver {
 903         my( $graph, @groups ) = @_;
 904         my @answer;
 905         foreach my $g ( @groups ) {
 906                 push( @answer, _solve_variant_location( $graph, $g ) );
 907         }
 908         return \@answer;
 909 }
 910
 911 sub _solve_variant_location {
 912         my( $graph, $groups ) = @_;
 913         # Now do the work.
 914     my $contig = {};
 915     my $subgraph = {};
 916     my $is_conflicted;
 917     my $conflict = {};
 918
 919     # Mark each ms as in its own group, first.
 920     foreach my $g ( @$groups ) {
 921         my $gst = wit_stringify( $g );
 922         map { $contig->{$_} = $gst } @$g;
 923     }
 924
 925     # Now for each unmarked node in the graph, initialize an array
 926     # for possible group memberships.  We will use this later to
 927     # resolve potential conflicts.
 928     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 929     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 930         my $gst = wit_stringify( $g );  # This is the group name
 931         # Copy the graph, and delete all non-members from the new graph.
 932         my $part = $graph->copy;
 933         my @group_roots;
 934         $part->delete_vertices(
 935             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 936
 937         # Now look to see if our group is connected.
 938                 if( @$g > 1 ) {
 939                         # We have to take directionality into account.
 940                         # How many root nodes do we have?
 941                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 942                                 $part->predecessorless_vertices;
 943                         # Assuming that @$g > 1, find the first root node that has at
 944                         # least one successor belonging to our group. If this reading
 945                         # is genealogical, there should be only one, but we will check
 946                         # that implicitly later.
 947                         foreach my $root ( @roots ) {
 948                                 # Prune the tree to get rid of extraneous hypotheticals.
 949                                 $root = _prune_subtree_old( $part, $root, $contig );
 950                                 next unless $root;
 951                                 # Save this root for our group.
 952                                 push( @group_roots, $root );
 953                                 # Get all the successor nodes of our root.
 954                         }
 955                 } else {
 956                         # Dispense with the trivial case of one reading.
 957                         my $wit = $g->[0];
 958                         @group_roots = ( $wit );
 959                         foreach my $v ( $part->vertices ) {
 960                                 $part->delete_vertex( $v ) unless $v eq $wit;
 961                         }
 962         }
 963
 964         if( @group_roots > 1 ) {
 965                 $conflict->{$gst} = 1;
 966                 $is_conflicted = 1;
 967         }
 968         # Paint the 'hypotheticals' with our group.
 969                 foreach my $wit ( $part->vertices ) {
 970                         if( ref( $contig->{$wit} ) ) {
 971                                 push( @{$contig->{$wit}}, $gst );
 972                         } elsif( $contig->{$wit} ne $gst ) {
 973                                 warn "How did we get here?";
 974                         }
 975                 }
 976
 977
 978                 # Save the relevant subgraph.
 979                 $subgraph->{$gst} = $part;
 980     }
 981
 982         # For each of our hypothetical readings, flatten its 'contig' array if
 983         # the array contains zero or one group.  If we have any unflattened arrays,
 984         # we may need to run the resolution process. If the reading is already known
 985         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 986         # it.
 987         my @resolve;
 988         foreach my $wit ( keys %$contig ) {
 989                 next unless ref( $contig->{$wit} );
 990                 if( @{$contig->{$wit}} > 1 ) {
 991                         if( $is_conflicted ) {
 992                                 $contig->{$wit} = '';  # We aren't going to decide.
 993                         } else {
 994                                 push( @resolve, $wit );
 995                         }
 996                 } else {
 997                         my $gst = pop @{$contig->{$wit}};
 998                         $contig->{$wit} = $gst || '';
 999                 }
1000         }
1001
1002     if( @resolve ) {
1003         my $still_contig = {};
1004         foreach my $h ( @resolve ) {
1005             # For each of the hypothetical readings with more than one possibility,
1006             # try deleting it from each of its member subgraphs in turn, and see
1007             # if that breaks the contiguous grouping.
1008             # TODO This can still break in a corner case where group A can use
1009             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
1010             # Revisit this if necessary; it could get brute-force nasty.
1011             foreach my $gst ( @{$contig->{$h}} ) {
1012                 my $gpart = $subgraph->{$gst}->copy();
1013                 # If we have come this far, there is only one root and everything
1014                 # is reachable from it.
1015                 my( $root ) = $gpart->predecessorless_vertices;
1016                 my $reachable = {};
1017                 map { $reachable->{$_} = 1 } $gpart->vertices;
1018
1019                 # Try deleting the hypothetical node.
1020                 $gpart->delete_vertex( $h );
1021                 if( $h eq $root ) {
1022                         # See if we still have a single root.
1023                         my @roots = $gpart->predecessorless_vertices;
1024                         warn "This shouldn't have happened" unless @roots;
1025                         if( @roots > 1 ) {
1026                                 # $h is needed by this group.
1027                                 if( exists( $still_contig->{$h} ) ) {
1028                                         # Conflict!
1029                                         $conflict->{$gst} = 1;
1030                                         $still_contig->{$h} = '';
1031                                 } else {
1032                                         $still_contig->{$h} = $gst;
1033                                 }
1034                         }
1035                 } else {
1036                         # $h is somewhere in the middle. See if everything
1037                         # else can still be reached from the root.
1038                                         my %still_reachable = ( $root => 1 );
1039                                         map { $still_reachable{$_} = 1 }
1040                                                 $gpart->all_successors( $root );
1041                                         foreach my $v ( keys %$reachable ) {
1042                                                 next if $v eq $h;
1043                                                 if( !$still_reachable{$v}
1044                                                         && ( $contig->{$v} eq $gst
1045                                                                  || ( exists $still_contig->{$v}
1046                                                                           && $still_contig->{$v} eq $gst ) ) ) {
1047                                                         # We need $h.
1048                                                         if( exists $still_contig->{$h} ) {
1049                                                                 # Conflict!
1050                                                                 $conflict->{$gst} = 1;
1051                                                                 $still_contig->{$h} = '';
1052                                                         } else {
1053                                                                 $still_contig->{$h} = $gst;
1054                                                         }
1055                                                         last;
1056                                                 } # else we don't need $h in this group.
1057                                         } # end foreach $v
1058                                 } # endif $h eq $root
1059             } # end foreach $gst
1060         } # end foreach $h
1061
1062         # Now we have some hypothetical vertices in $still_contig that are the
1063         # "real" group memberships.  Replace these in $contig.
1064                 foreach my $v ( keys %$contig ) {
1065                         next unless ref $contig->{$v};
1066                         $contig->{$v} = $still_contig->{$v};
1067                 }
1068     } # end if @resolve
1069
1070     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
1071         my $variant_row = [ [], $is_genealogical ];
1072         # Fill in the groupings from $contig.
1073         foreach my $g ( @$groups ) {
1074         my $gst = wit_stringify( $g );
1075         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
1076         push( @{$variant_row->[0]}, \@realgroup );
1077     }
1078     return $variant_row;
1079 }
1080
1081 sub _prune_group {
1082         my( $group, $stemma, $graph ) = @_;
1083         my $lacunose = {};
1084         map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
1085         map { $lacunose->{$_} = 0 } @$group;
1086         # Make our subgraph
1087         my $subgraph = $graph->copy;
1088         map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
1089                 $subgraph->vertices;
1090         # ...and find the root.
1091         # Now prune and return the remaining vertices.
1092         _prune_subtree( $subgraph, $lacunose );
1093         return $subgraph->vertices;
1094 }
1095
1096 sub _prune_subtree {
1097         my( $tree, $lacunose ) = @_;
1098
1099         # Delete lacunose witnesses that have no successors
1100     my @orphan_hypotheticals;
1101     my $ctr = 0;
1102     do {
1103         die "Infinite loop on leaves" if $ctr > 100;
1104         @orphan_hypotheticals = grep { $lacunose->{$_} }
1105                 $tree->successorless_vertices;
1106         $tree->delete_vertices( @orphan_hypotheticals );
1107         $ctr++;
1108     } while( @orphan_hypotheticals );
1109
1110         # Delete lacunose roots that have a single successor
1111         my @redundant_root;
1112         $ctr = 0;
1113         do {
1114         die "Infinite loop on roots" if $ctr > 100;
1115                 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1116                         $tree->predecessorless_vertices;
1117                 $tree->delete_vertices( @redundant_root );
1118                 $ctr++;
1119         } while( @redundant_root );
1120 }
1121
1122 sub _prune_subtree_old {
1123     my( $tree, $root, $contighash ) = @_;
1124     # First, delete hypothetical leaves / orphans until there are none left.
1125     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1126         $tree->successorless_vertices;
1127     while( @orphan_hypotheticals ) {
1128         $tree->delete_vertices( @orphan_hypotheticals );
1129         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1130             $tree->successorless_vertices;
1131     }
1132     # Then delete a hypothetical root with only one successor, moving the
1133     # root to the first child that has no other predecessors.
1134     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1135         my @nextroot = $tree->successors( $root );
1136         $tree->delete_vertex( $root );
1137         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
1138     }
1139     # The tree has been modified in place, but we need to know the new root.
1140     $root = undef unless $root && $tree->has_vertex( $root );
1141     return $root;
1142 }
1143 # Add the variant, subject to a.c. representation logic.
1144 # This assumes that we will see the 'main' version before the a.c. version.
1145 sub add_variant_wit {
1146     my( $arr, $wit, $acstr ) = @_;
1147     my $skip;
1148     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1149         my $real = $1;
1150         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1151     }
1152     push( @$arr, $wit ) unless $skip;
1153 }
1154
1155 sub _useful_variant {
1156         my( $group_readings, $graph, $acstr ) = @_;
1157
1158         # TODO Decide what to do with AC witnesses
1159
1160         # Sort by group size and return
1161         my $is_useful = 0;
1162         my( @readings, @groups );   # The sorted groups for our answer.
1163         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1164                 keys %$group_readings ) {
1165                 push( @readings, $rdg );
1166                 push( @groups, $group_readings->{$rdg} );
1167                 if( @{$group_readings->{$rdg}} > 1 ) {
1168                         $is_useful++;
1169                 } else {
1170                         my( $wit ) = @{$group_readings->{$rdg}};
1171                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1172                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1173                 }
1174         }
1175         if( $is_useful > 1 ) {
1176                 return( \@readings, \@groups );
1177         } else {
1178                 return( [], [] );
1179         }
1180 }
1181
1182 =head2 wit_stringify( $groups )
1183
1184 Takes an array of witness groupings and produces a string like
1185 ['A','B'] / ['C','D','E'] / ['F']
1186
1187 =cut
1188
1189 sub wit_stringify {
1190     my $groups = shift;
1191     my @gst;
1192     # If we were passed an array of witnesses instead of an array of
1193     # groupings, then "group" the witnesses first.
1194     unless( ref( $groups->[0] ) ) {
1195         my $mkgrp = [ $groups ];
1196         $groups = $mkgrp;
1197     }
1198     foreach my $g ( @$groups ) {
1199         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1200     }
1201     return join( ' / ', @gst );
1202 }
1203
1204 sub _symmdiff {
1205         my( $lista, $listb ) = @_;
1206         my %union;
1207         my %scalars;
1208         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1209         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1210         my @set = grep { $union{$_} == 1 } keys %union;
1211         return map { $scalars{$_} } @set;
1212 }
1213
1214 1;
1215
1216 =head1 LICENSE
1217
1218 This package is free software and is provided "as is" without express
1219 or implied warranty.  You can redistribute it and/or modify it under
1220 the same terms as Perl itself.
1221
1222 =head1 AUTHOR
1223
1224 Tara L Andrews E<lt>aurum@cpan.orgE<gt>