lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13 use TryCatch;
  14
  15 use vars qw/ @EXPORT_OK /;
  16 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  17
  18 =head1 NAME
  19
  20 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  21
  22 =head1 SYNOPSIS
  23
  24   use Text::Tradition;
  25   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  26   my $t = Text::Tradition->new(
  27     'name' => 'this is a text',
  28     'input' => 'TEI',
  29     'file' => '/path/to/tei_parallel_seg_file.xml' );
  30   $t->add_stemma( 'dotfile' => $stemmafile );
  31
  32   my $variant_data = run_analysis( $tradition );
  33   # Recalculate rank $n treating all orthographic variants as equivalent
  34   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  35
  36 =head1 DESCRIPTION
  37
  38 Text::Tradition is a library for representation and analysis of collated
  39 texts, particularly medieval ones.  The Collation is the central feature of
  40 a Tradition, where the text, its sequence of readings, and its relationships
  41 between readings are actually kept.
  42
  43 =head1 SUBROUTINES
  44
  45 =head2 run_analysis( $tradition, %opts )
  46
  47 Runs the analysis described in analyze_variant_location on every location in the
  48 collation of the given tradition, with the given options. These include:
  49
  50 =over 4
  51
  52 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  53 is 0 (i.e. the first).
  54
  55 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  56
  57 =item * merge_types - Specify a list of relationship types, where related readings
  58 should be treated as identical for the purposes of analysis.
  59
  60 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  61
  62 =back
  63
  64 =begin testing
  65
  66 use Text::Tradition;
  67 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  68
  69 my $datafile = 't/data/florilegium_tei_ps.xml';
  70 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  71                                       'name' => 'test0',
  72                                       'file' => $datafile );
  73 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  74 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  75
  76 my %expected_genealogical = (
  77         1 => 0,
  78         2 => 1,
  79         3 =>  0,
  80         5 =>  0,
  81         7 =>  0,
  82         8 =>  0,
  83         10 => 0,
  84         13 => 1,
  85         33 => 0,
  86         34 => 0,
  87         37 => 0,
  88         60 => 0,
  89         81 => 1,
  90         84 => 0,
  91         87 => 0,
  92         101 => 0,
  93         102 => 0,
  94         122 => 1,
  95         157 => 0,
  96         166 => 1,
  97         169 => 1,
  98         200 => 0,
  99         216 => 1,
 100         217 => 1,
 101         219 => 1,
 102         241 => 1,
 103         242 => 1,
 104         243 => 1,
 105 );
 106
 107 my $data = run_analysis( $tradition );
 108 my $c = $tradition->collation;
 109 foreach my $row ( @{$data->{'variants'}} ) {
 110         # Account for rows that used to be "not useful"
 111         unless( exists $expected_genealogical{$row->{'id'}} ) {
 112                 $expected_genealogical{$row->{'id'}} = 1;
 113         }
 114         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 115         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 116                 "Got correct genealogical flag for row " . $row->{'id'} );
 117         # Check that we have the right row with the right groups
 118         my $rank = $row->{'id'};
 119         foreach my $rdghash ( @{$row->{'readings'}} ) {
 120                 # Skip 'readings' that aren't really
 121                 next unless $c->reading( $rdghash->{'readingid'} );
 122                 # Check the rank
 123                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 124                         "Got correct reading rank" );
 125                 # Check the witnesses
 126                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 127                 my @sgrp = sort @{$rdghash->{'group'}};
 128                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 129         }
 130 }
 131 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 132 # TODO Make something meaningful of conflict count, maybe test other bits
 133
 134 =end testing
 135
 136 =cut
 137
 138 sub run_analysis {
 139         my( $tradition, %opts ) = @_;
 140         my $c = $tradition->collation;
 141
 142         my $stemma_id = $opts{'stemma_id'} || 0;
 143         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 144         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 145
 146         # Get the stemma
 147         my $stemma = $tradition->stemma( $stemma_id );
 148
 149         # Figure out which witnesses we are working with - that is, the ones that
 150         # appear both in the stemma and in the tradition. All others are 'lacunose'
 151         # for our purposes.
 152         my @lacunose = $stemma->hypotheticals;
 153         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 154         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 155
 156         # Find and mark 'common' ranks for exclusion, unless they were
 157         # explicitly specified.
 158         unless( @ranks ) {
 159                 my %common_rank;
 160                 foreach my $rdg ( $c->common_readings ) {
 161                         $common_rank{$rdg->rank} = 1;
 162                 }
 163                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 164         }
 165
 166         # Group the variants to send to the solver
 167         my @groups;
 168         my @use_ranks;
 169         my %lacunae;
 170         my $moved = {};
 171         foreach my $rank ( @ranks ) {
 172                 my $missing = [ @lacunose ];
 173                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
 174                 # Filter out any empty rankgroups
 175                 # (e.g. from the later rank for a transposition)
 176                 next unless keys %$rankgroup;
 177                 if( $opts{'exclude_type1'} ) {
 178                         # Check to see whether this is a "useful" group.
 179                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 180                                 $stemma->graph, $c->ac_label );
 181                         next unless @$rdgs;
 182                 }
 183                 push( @use_ranks, $rank );
 184                 push( @groups, $rankgroup );
 185                 $lacunae{$rank} = $missing;
 186         }
 187         # Run the solver
 188         my $answer = solve_variants( $stemma, @groups );
 189
 190         # Do further analysis on the answer
 191         my $conflict_count = 0;
 192         my $aclabel = $c->ac_label;
 193         foreach my $idx ( 0 .. $#use_ranks ) {
 194                 my $location = $answer->{'variants'}->[$idx];
 195                 # Add the rank back in
 196                 my $rank = $use_ranks[$idx];
 197                 $location->{'id'} = $rank;
 198                 # Note what our lacunae are
 199                 my %lmiss;
 200                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 201                 $location->{'missing'} = [ keys %lmiss ];
 202
 203                 # Run the extra analysis we need.
 204                 analyze_location( $tradition, $stemma, $location, \%lmiss );
 205
 206                 my @layerwits;
 207                 # Do the final post-analysis tidying up of the data.
 208                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 209                         $conflict_count++
 210                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 211                         # Add the reading text back in, setting display value as needed
 212                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 213                         if( $rdg ) {
 214                                 $rdghash->{'text'} = $rdg->text .
 215                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 216                         }
 217                         # Remove lacunose witnesses from this reading's list now that the
 218                         # analysis is done
 219                         my @realgroup;
 220                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 221                         $rdghash->{'group'} = \@realgroup;
 222                         # Note any layered witnesses that appear in this group
 223                         foreach( @realgroup ) {
 224                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 225                                         push( @layerwits, $1 );
 226                                 }
 227                         }
 228                 }
 229                 $location->{'layerwits'} = \@layerwits if @layerwits;
 230         }
 231         $answer->{'conflict_count'} = $conflict_count;
 232
 233         return $answer;
 234 }
 235
 236 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 237
 238 Groups the variants at the given $rank of the collation, treating any
 239 relationships in @merge_relationship_types as equivalent.  $lacunose should
 240 be a reference to an array, to which the sigla of lacunose witnesses at this
 241 rank will be appended; $transposed should be a reference to a hash, wherein
 242 the identities of transposed readings and their relatives will be stored.
 243
 244 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 245 in $group_readings->{$rdg}.
 246
 247 =cut
 248
 249 # Return group_readings, groups, lacunose
 250 sub group_variants {
 251         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 252         my $c = $tradition->collation;
 253         my $aclabel = $c->ac_label;
 254         my $table = $c->alignment_table;
 255         # Get the alignment table readings
 256         my %readings_at_rank;
 257         my %is_lacunose; # lookup table for witnesses not in stemma
 258         map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
 259         my @check_for_gaps;
 260         my %moved_wits;
 261         my $has_transposition;
 262         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 263                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 264                 my $wit = $tablewit->{'witness'};
 265                 # Exclude the witness if it is "lacunose" which if we got here
 266                 # means "not in the stemma".
 267                 next if $is_lacunose{$wit};
 268                 # Note if the witness is actually in a lacuna
 269                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 270                         _add_to_witlist( $wit, $lacunose, $aclabel );
 271                 # Otherwise the witness either has a positive reading...
 272                 } elsif( $rdg ) {
 273                         # If the reading has been counted elsewhere as a transposition, ignore it.
 274                         if( $transposed->{$rdg->{'t'}->id} ) {
 275                                 # TODO Does this cope with three-way transpositions?
 276                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 277                                 next;
 278                         }
 279                         # Otherwise, record it...
 280                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 281                         # ...and grab any transpositions, and their relations.
 282                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 283                         foreach my $trdg ( @transp ) {
 284                                 next if exists $readings_at_rank{$trdg->id};
 285                                 $has_transposition = 1;
 286                                 my @affected_wits = _table_witnesses(
 287                                         $table, $trdg, \%is_lacunose, $aclabel );
 288                                 next unless @affected_wits;
 289                                 map { $moved_wits{$_} = 1 } @affected_wits;
 290                                 $transposed->{$trdg->id} =
 291                                         [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
 292                                 $readings_at_rank{$trdg->id} = $trdg;
 293                         }
 294                 # ...or it is empty, ergo a gap.
 295                 } else {
 296                         _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
 297                 }
 298         }
 299         my @gap_wits;
 300         map { _add_to_witlist( $_, \@gap_wits, $aclabel )
 301                 unless $moved_wits{$_} } @check_for_gaps;
 302         # Group the readings, collapsing groups by relationship if needed
 303         my $grouped_readings = {};
 304         foreach my $rdg ( values %readings_at_rank ) {
 305                 # Skip readings that have been collapsed into others.
 306                 next if exists $grouped_readings->{$rdg->id}
 307                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 308                 # Get the witness list, including from readings collapsed into this one.
 309                 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
 310                 if( $collapse && @$collapse ) {
 311                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 312                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 313                                 my @otherwits = _table_witnesses(
 314                                         $table, $other, \%is_lacunose, $aclabel );
 315                                 push( @wits, @otherwits );
 316                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 317                         }
 318                 }
 319                 $grouped_readings->{$rdg->id} = \@wits;
 320         }
 321         $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
 322         # Get rid of our collapsed readings
 323         map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
 324                 keys %$grouped_readings
 325                 if $collapse;
 326
 327         # If something was transposed, check the groups for doubled-up readings
 328         if( $has_transposition ) {
 329                 # print STDERR "Group for rank $rank:\n";
 330                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 331                 #       keys %$grouped_readings;
 332                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 333         }
 334
 335         # Return the result
 336         return $grouped_readings;
 337 }
 338
 339 # Helper function to query the alignment table for all witnesses (a.c. included)
 340 # that have a given reading at its rank.
 341 sub _table_witnesses {
 342         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 343         my $tableidx = $trdg->rank - 1;
 344         my @has_reading;
 345         foreach my $row ( @{$table->{'alignment'}} ) {
 346                 my $wit = $row->{'witness'};
 347                 next if $lacunose->{$wit};
 348                 my $rdg = $row->{'tokens'}->[$tableidx];
 349                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 350                 _add_to_witlist( $wit, \@has_reading, $aclabel )
 351                         if $rdg->{'t'}->id eq $trdg->id;
 352         }
 353         return @has_reading;
 354 }
 355
 356 # Helper function to ensure that X and X a.c. never appear in the same list.
 357 sub _add_to_witlist {
 358         my( $wit, $list, $acstr ) = @_;
 359         my %inlist;
 360         my $idx = 0;
 361         map { $inlist{$_} = $idx++ } @$list;
 362         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 363                 my $acwit = $1;
 364                 unless( exists $inlist{$acwit} ) {
 365                         push( @$list, $acwit.$acstr );
 366                 }
 367         } else {
 368                 if( exists( $inlist{$wit.$acstr} ) ) {
 369                         # Replace the a.c. version with the main witness
 370                         my $i = $inlist{$wit.$acstr};
 371                         $list->[$i] = $wit;
 372                 } else {
 373                         push( @$list, $wit );
 374                 }
 375         }
 376 }
 377
 378 sub _check_transposed_consistency {
 379         my( $c, $rank, $transposed, $groupings ) = @_;
 380         my %seen_wits;
 381         my %thisrank;
 382         # Note which readings are actually at this rank, and which witnesses
 383         # belong to which reading.
 384         foreach my $rdg ( keys %$groupings ) {
 385                 my $rdgobj = $c->reading( $rdg );
 386                 # Count '(omitted)' as a reading at this rank
 387                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 388                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 389         }
 390         # Our work is done if we have no witness belonging to more than one
 391         # reading.
 392         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 393         return unless @doubled;
 394         # If we have a symmetric related transposition, drop the non-rank readings.
 395         if( @doubled == scalar keys %seen_wits ) {
 396                 foreach my $rdg ( keys %$groupings ) {
 397                         if( !$thisrank{$rdg} ) {
 398                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 399                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 400                                         keys %thisrank;
 401                                 delete $groupings->{$rdg};
 402                                 # If we found a group match, assume there is a symmetry happening.
 403                                 # TODO think more about this
 404                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 405                                 unless( $matched ) {
 406                                         delete $transposed->{$rdg};
 407                                         warn "Found problem in evident symmetry with reading $rdg";
 408                                 }
 409                         }
 410                 }
 411         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 412         } else {
 413                 foreach my $dup ( @doubled ) {
 414                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 415                                 next if $thisrank{$rdg};
 416                                 next unless exists $groupings->{$rdg};
 417                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 418                                 delete $groupings->{$rdg};
 419                                 delete $transposed->{$rdg};
 420                         }
 421                 }
 422                 # and put any now-orphaned readings into an 'omitted' reading.
 423                 foreach my $wit ( keys %seen_wits ) {
 424                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 425                                 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
 426                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 427                         }
 428                 }
 429         }
 430 }
 431
 432 =head2 solve_variants( $graph, @groups )
 433
 434 Sends the set of groups to the external graph solver service and returns
 435 a cleaned-up answer, adding the rank IDs back where they belong.
 436
 437 The JSON has the form
 438   { "graph": [ stemmagraph DOT string without newlines ],
 439     "groupings": [ array of arrays of groups, one per rank ] }
 440
 441 The answer has the form
 442   { "variants" => [ array of variant location structures ],
 443     "variant_count" => total,
 444     "conflict_count" => number of conflicts detected,
 445     "genealogical_count" => number of solutions found }
 446
 447 =cut
 448
 449 sub solve_variants {
 450         my( $stemma, @groups ) = @_;
 451         my $aclabel = $stemma->collation->ac_label;
 452
 453         # Filter the groups down to distinct groups, and work out what graph
 454         # should be used in the calculation of each group. We want to send each
 455         # distinct problem to the solver only once.
 456         # We need a whole bunch of lookup tables for this.
 457         my $index_groupkeys = {};       # Save the order of readings
 458         my $group_indices = {};         # Save the indices that have a given grouping
 459         my $graph_problems = {};        # Save the groupings for the given graph
 460
 461         foreach my $idx ( 0..$#groups ) {
 462                 my $ghash = $groups[$idx];
 463                 my @grouping;
 464                 # Sort the groupings from big to little, and scan for a.c. witnesses
 465                 # that would need an extended graph.
 466                 my @acwits;   # note which AC witnesses crop up at this rank
 467                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 468                         keys %$ghash;
 469                 foreach my $rdg ( @idxkeys ) {
 470                         my @sg = sort @{$ghash->{$rdg}};
 471                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 472                         push( @grouping, \@sg );
 473                 }
 474                 # Save the reading order
 475                 $index_groupkeys->{$idx} = \@idxkeys;
 476
 477                 # Now associate the distinct group with this index
 478                 my $gstr = wit_stringify( \@grouping );
 479                 push( @{$group_indices->{$gstr}}, $idx );
 480
 481                 # Finally, add the group to the list to be calculated for this graph.
 482                 map { s/\Q$aclabel\E$// } @acwits;
 483                 my $graph;
 484                 try {
 485                         $graph = $stemma->extend_graph( \@acwits );
 486                 } catch {
 487                         die "Unable to extend graph with @acwits";
 488                 }
 489                 unless( exists $graph_problems->{"$graph"} ) {
 490                         $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
 491                 }
 492                 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
 493         }
 494
 495         ## For each distinct graph, send its groups to the solver.
 496         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 497         my $ua = LWP::UserAgent->new();
 498         ## Witness map is a HACK to get around limitations in node names from IDP
 499         my $witness_map = {};
 500         ## Variables to store answers as they come back
 501         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 502         my $genealogical = 0;
 503         foreach my $graphkey ( keys %$graph_problems ) {
 504                 my $graph = $graph_problems->{$graphkey}->{'object'};
 505                 my $groupings = $graph_problems->{$graphkey}->{'groups'};
 506                 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
 507                         $groupings, $witness_map ) );
 508                 # Send it off and get the result
 509                 #print STDERR "Sending request: $json\n";
 510                 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 511                                                           'Content' => $json );
 512                 my $answer;
 513                 my $used_idp;
 514                 if( $resp->is_success ) {
 515                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 516                         $used_idp = 1;
 517                 } else {
 518                         # Fall back to the old method.
 519                         warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 520                                 . "; falling back to perl method";
 521                         $answer = perl_solver( $graph, @$groupings );
 522                 }
 523                 ## The answer is the evaluated groupings, plus a boolean for whether
 524                 ## they were genealogical.  Reconstruct our original groups.
 525                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 526                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 527                         if( $result ) {
 528                                 $genealogical++;
 529                                 # Prune the calculated groups, in case the IDP solver failed to.
 530                                 if( $used_idp ) {
 531                                         my @pruned_groups;
 532                                         foreach my $cg ( @$calc_groups ) {
 533                                                 # This is a little wasteful but the path of least
 534                                                 # resistance. Send both the stemma, which knows what
 535                                                 # its hypotheticals are, and the actual graph used.
 536                                                 my @pg = _prune_group( $cg, $stemma, $graph );
 537                                                 push( @pruned_groups, \@pg );
 538                                         }
 539                                         $calc_groups = \@pruned_groups;
 540                                 }
 541                         }
 542                         # Retrieve the key for the original group that went to the solver
 543                         my $input_group = wit_stringify( $groupings->[$gidx] );
 544                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 545                                 my @readings = @{$index_groupkeys->{$oidx}};
 546                                 my $vstruct = {
 547                                         'genealogical' => $result,
 548                                         'readings' => [],
 549                                 };
 550                                 foreach my $ridx ( 0 .. $#readings ) {
 551                                         push( @{$vstruct->{'readings'}},
 552                                                 { 'readingid' => $readings[$ridx],
 553                                                   'group' => $calc_groups->[$ridx] } );
 554                                 }
 555                                 $variants->[$oidx] = $vstruct;
 556                         }
 557                 }
 558         }
 559
 560         return { 'variants' => $variants,
 561                          'variant_count' => scalar @$variants,
 562                          'genealogical_count' => $genealogical };
 563 }
 564
 565 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 566
 567 sub _safe_wit_strings {
 568         my( $graph, $c, $groupings, $witness_map ) = @_;
 569         # Parse the graph we were given into a stemma.
 570         my $safegraph = Graph->new();
 571         # Convert the graph to a safe representation and store the conversion.
 572         foreach my $n ( $graph->vertices ) {
 573                 my $sn = _safe_witstr( $n );
 574                 if( exists $witness_map->{$sn} ) {
 575                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 576                                 if $witness_map->{$sn} ne $n;
 577                 } else {
 578                         $witness_map->{$sn} = $n;
 579                 }
 580                 $safegraph->add_vertex( $sn );
 581                 $safegraph->set_vertex_attributes( $sn,
 582                         $graph->get_vertex_attributes( $n ) );
 583         }
 584         foreach my $e ( $graph->edges ) {
 585                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 586                 $safegraph->add_edge( @safe_e );
 587         }
 588         my $safe_stemma = Text::Tradition::Stemma->new(
 589                 'collation' => $c, 'graph' => $safegraph );
 590
 591         # Now convert the witness groupings to a safe representation.
 592         my $safe_groupings = [];
 593         foreach my $grouping ( @$groupings ) {
 594                 my $safe_grouping = [];
 595                 foreach my $group ( @$grouping ) {
 596                         my $safe_group = [];
 597                         foreach my $n ( @$group ) {
 598                                 my $sn = _safe_witstr( $n );
 599                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 600                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 601                                 $witness_map->{$sn} = $n;
 602                                 push( @$safe_group, $sn );
 603                         }
 604                         push( @$safe_grouping, $safe_group );
 605                 }
 606                 push( @$safe_groupings, $safe_grouping );
 607         }
 608
 609         # Return it all in the struct we expect.  We have stored the reductions
 610         # in the $witness_map that we were passed.
 611         return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
 612                          'groupings' => $safe_groupings };
 613 }
 614
 615 sub _safe_witstr {
 616         my $witstr = shift;
 617         $witstr =~ s/\s+/_/g;
 618         $witstr =~ s/[^\w\d-]//g;
 619         return $witstr;
 620 }
 621
 622 sub _desanitize_names {
 623         my( $jsonstruct, $witness_map ) = @_;
 624         my $result = [];
 625         foreach my $grouping ( @$jsonstruct ) {
 626                 my $real_grouping = [];
 627                 foreach my $element ( @$grouping ) {
 628                         if( ref( $element ) eq 'ARRAY' ) {
 629                                 # it's the groupset.
 630                                 my $real_groupset = [];
 631                                 foreach my $group ( @$element ) {
 632                                         my $real_group = [];
 633                                         foreach my $n ( @$group ) {
 634                                                 my $rn = $witness_map->{$n};
 635                                                 push( @$real_group, $rn );
 636                                         }
 637                                         push( @$real_groupset, $real_group );
 638                                 }
 639                                 push( @$real_grouping, $real_groupset );
 640                         } else {
 641                                 # It is the boolean, not actually a group.
 642                                 push( @$real_grouping, $element );
 643                         }
 644                 }
 645                 push( @$result, $real_grouping );
 646         }
 647         return $result;
 648 }
 649
 650 ### END HACKERY ###
 651
 652 =head2 analyze_location ( $tradition, $graph, $location_hash )
 653
 654 Given the tradition, its stemma graph, and the solution from the graph solver,
 655 work out the rest of the information we want.  For each reading we need missing,
 656 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 657
 658 =cut
 659
 660 sub analyze_location {
 661         my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
 662         my $c = $tradition->collation;
 663
 664         # Make a hash of all known node memberships, and make the subgraphs.
 665         my $contig = {};
 666         my $reading_roots = {};
 667         my $subgraph = {};
 668         my $acstr = $c->ac_label;
 669         my @acwits;
 670         $DB::single = 1 if $variant_row->{id} == 87;
 671         # Note which witnesses positively belong to which group
 672     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 673         my $rid = $rdghash->{'readingid'};
 674         foreach my $wit ( @{$rdghash->{'group'}} ) {
 675                 $contig->{$wit} = $rid;
 676             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 677                 push( @acwits, $1 );
 678             }
 679         }
 680         }
 681
 682         # Get the actual graph we should work with
 683         my $graph;
 684         try {
 685                 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
 686         } catch {
 687                 die "Could not extend graph with a.c. witnesses @acwits";
 688         }
 689
 690         # Now, armed with that knowledge, make a subgraph for each reading
 691         # and note the root(s) of each subgraph.
 692         foreach my $rdghash( @{$variant_row->{'readings'}} ) {
 693         my $rid = $rdghash->{'readingid'};
 694         my %rdgwits;
 695         # Make the subgraph.
 696         my $part = $graph->copy;
 697         my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
 698                 keys %$contig;
 699         $part->delete_vertices( @todelete );
 700         _prune_subtree( $part, $lacunose );
 701                 $subgraph->{$rid} = $part;
 702                 # Record the remaining lacunose nodes as part of this group, if
 703                 # we are dealing with a non-genealogical reading.
 704                 unless( $variant_row->{'genealogical'} ) {
 705                         map { $contig->{$_} = $rid } $part->vertices;
 706                 }
 707                 # Get the reading roots.
 708                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 709         }
 710
 711         # Now that we have all the node group memberships, calculate followed/
 712     # non-followed/unknown values for each reading.  Also figure out the
 713     # reading's evident parent(s).
 714     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 715         my $rid = $rdghash->{'readingid'};
 716         # Get the subgraph
 717         my $part = $subgraph->{$rid};
 718
 719         # Start figuring things out.
 720         my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
 721         $rdghash->{'independent_occurrence'} = \@roots;
 722         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 723         # Find the parent readings, if any, of this reading.
 724         my $rdgparents = {};
 725         foreach my $wit ( @roots ) {
 726                 # Look in the main stemma to find this witness's extant or known-reading
 727                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 728                         my @check = $graph->predecessors( $wit );
 729                         while( @check ) {
 730                                 my @next;
 731                                 foreach my $wparent( @check ) {
 732                                         my $preading = $contig->{$wparent};
 733                                         if( $preading ) {
 734                                                 $rdgparents->{$preading} = 1;
 735                                         } else {
 736                                                 push( @next, $graph->predecessors( $wparent ) );
 737                                         }
 738                                 }
 739                                 @check = @next;
 740                         }
 741                 }
 742                 foreach my $p ( keys %$rdgparents ) {
 743                         # Resolve the relationship of the parent to the reading, and
 744                         # save it in our hash.
 745                         my $pobj = $c->reading( $p );
 746                         my $relation;
 747                         my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 748                         if( $pobj ) {
 749                                 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 750                                 if( $rel ) {
 751                                         $relation = { type => $rel->type };
 752                                         if( $rel->has_annotation ) {
 753                                                 $relation->{'annotation'} = $rel->annotation;
 754                                         }
 755                                 }
 756                         }
 757                         $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
 758                 }
 759
 760                 $rdghash->{'reading_parents'} = $rdgparents;
 761
 762                 # Find the number of times this reading was altered, and the number of
 763                 # times we're not sure.
 764                 my( %nofollow, %unknownfollow );
 765                 foreach my $wit ( $part->vertices ) {
 766                         foreach my $wchild ( $graph->successors( $wit ) ) {
 767                                 next if $part->has_vertex( $wchild );
 768                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 769                                         # It definitely changed here.
 770                                         $nofollow{$wchild} = 1;
 771                                 } elsif( !($contig->{$wchild}) ) {
 772                                         # The child is a hypothetical node not definitely in
 773                                         # any group. Answer is unknown.
 774                                         $unknownfollow{$wchild} = 1;
 775                                 } # else it's a non-root node in a known group, and therefore
 776                                   # is presumed to have its reading from its group, not this link.
 777                         }
 778                 }
 779                 $rdghash->{'not_followed'} = keys %nofollow;
 780                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 781
 782                 # Now say whether this reading represents a conflict.
 783                 unless( $variant_row->{'genealogical'} ) {
 784                         $rdghash->{'conflict'} = @roots != 1;
 785                 }
 786     }
 787 }
 788
 789
 790 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 791
 792 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 793 to be correct.  Serves as a backup for the real solver.
 794
 795 Runs an analysis of the given tradition, at the location given in $rank,
 796 against the graph of the stemma specified in $stemma_id.  The argument
 797 @merge_relationship_types is an optional list of relationship types for
 798 which readings so related should be treated as equivalent.
 799
 800 Returns a nested array data structure as follows:
 801
 802  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 803
 804 where the group list is the array of arrays passed in for each element of @groups,
 805 possibly with the addition of hypothetical readings.
 806
 807
 808 =cut
 809
 810 sub perl_solver {
 811         my( $graph, @groups ) = @_;
 812         my @answer;
 813         foreach my $g ( @groups ) {
 814                 push( @answer, _solve_variant_location( $graph, $g ) );
 815         }
 816         return \@answer;
 817 }
 818
 819 sub _solve_variant_location {
 820         my( $graph, $groups ) = @_;
 821         # Now do the work.
 822     my $contig = {};
 823     my $subgraph = {};
 824     my $is_conflicted;
 825     my $conflict = {};
 826
 827     # Mark each ms as in its own group, first.
 828     foreach my $g ( @$groups ) {
 829         my $gst = wit_stringify( $g );
 830         map { $contig->{$_} = $gst } @$g;
 831     }
 832
 833     # Now for each unmarked node in the graph, initialize an array
 834     # for possible group memberships.  We will use this later to
 835     # resolve potential conflicts.
 836     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 837     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 838         my $gst = wit_stringify( $g );  # This is the group name
 839         # Copy the graph, and delete all non-members from the new graph.
 840         my $part = $graph->copy;
 841         my @group_roots;
 842         $part->delete_vertices(
 843             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 844
 845         # Now look to see if our group is connected.
 846                 if( @$g > 1 ) {
 847                         # We have to take directionality into account.
 848                         # How many root nodes do we have?
 849                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 850                                 $part->predecessorless_vertices;
 851                         # Assuming that @$g > 1, find the first root node that has at
 852                         # least one successor belonging to our group. If this reading
 853                         # is genealogical, there should be only one, but we will check
 854                         # that implicitly later.
 855                         foreach my $root ( @roots ) {
 856                                 # Prune the tree to get rid of extraneous hypotheticals.
 857                                 $root = _prune_subtree_old( $part, $root, $contig );
 858                                 next unless $root;
 859                                 # Save this root for our group.
 860                                 push( @group_roots, $root );
 861                                 # Get all the successor nodes of our root.
 862                         }
 863                 } else {
 864                         # Dispense with the trivial case of one reading.
 865                         my $wit = $g->[0];
 866                         @group_roots = ( $wit );
 867                         foreach my $v ( $part->vertices ) {
 868                                 $part->delete_vertex( $v ) unless $v eq $wit;
 869                         }
 870         }
 871
 872         if( @group_roots > 1 ) {
 873                 $conflict->{$gst} = 1;
 874                 $is_conflicted = 1;
 875         }
 876         # Paint the 'hypotheticals' with our group.
 877                 foreach my $wit ( $part->vertices ) {
 878                         if( ref( $contig->{$wit} ) ) {
 879                                 push( @{$contig->{$wit}}, $gst );
 880                         } elsif( $contig->{$wit} ne $gst ) {
 881                                 warn "How did we get here?";
 882                         }
 883                 }
 884
 885
 886                 # Save the relevant subgraph.
 887                 $subgraph->{$gst} = $part;
 888     }
 889
 890         # For each of our hypothetical readings, flatten its 'contig' array if
 891         # the array contains zero or one group.  If we have any unflattened arrays,
 892         # we may need to run the resolution process. If the reading is already known
 893         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 894         # it.
 895         my @resolve;
 896         foreach my $wit ( keys %$contig ) {
 897                 next unless ref( $contig->{$wit} );
 898                 if( @{$contig->{$wit}} > 1 ) {
 899                         if( $is_conflicted ) {
 900                                 $contig->{$wit} = '';  # We aren't going to decide.
 901                         } else {
 902                                 push( @resolve, $wit );
 903                         }
 904                 } else {
 905                         my $gst = pop @{$contig->{$wit}};
 906                         $contig->{$wit} = $gst || '';
 907                 }
 908         }
 909
 910     if( @resolve ) {
 911         my $still_contig = {};
 912         foreach my $h ( @resolve ) {
 913             # For each of the hypothetical readings with more than one possibility,
 914             # try deleting it from each of its member subgraphs in turn, and see
 915             # if that breaks the contiguous grouping.
 916             # TODO This can still break in a corner case where group A can use
 917             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 918             # Revisit this if necessary; it could get brute-force nasty.
 919             foreach my $gst ( @{$contig->{$h}} ) {
 920                 my $gpart = $subgraph->{$gst}->copy();
 921                 # If we have come this far, there is only one root and everything
 922                 # is reachable from it.
 923                 my( $root ) = $gpart->predecessorless_vertices;
 924                 my $reachable = {};
 925                 map { $reachable->{$_} = 1 } $gpart->vertices;
 926
 927                 # Try deleting the hypothetical node.
 928                 $gpart->delete_vertex( $h );
 929                 if( $h eq $root ) {
 930                         # See if we still have a single root.
 931                         my @roots = $gpart->predecessorless_vertices;
 932                         warn "This shouldn't have happened" unless @roots;
 933                         if( @roots > 1 ) {
 934                                 # $h is needed by this group.
 935                                 if( exists( $still_contig->{$h} ) ) {
 936                                         # Conflict!
 937                                         $conflict->{$gst} = 1;
 938                                         $still_contig->{$h} = '';
 939                                 } else {
 940                                         $still_contig->{$h} = $gst;
 941                                 }
 942                         }
 943                 } else {
 944                         # $h is somewhere in the middle. See if everything
 945                         # else can still be reached from the root.
 946                                         my %still_reachable = ( $root => 1 );
 947                                         map { $still_reachable{$_} = 1 }
 948                                                 $gpart->all_successors( $root );
 949                                         foreach my $v ( keys %$reachable ) {
 950                                                 next if $v eq $h;
 951                                                 if( !$still_reachable{$v}
 952                                                         && ( $contig->{$v} eq $gst
 953                                                                  || ( exists $still_contig->{$v}
 954                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 955                                                         # We need $h.
 956                                                         if( exists $still_contig->{$h} ) {
 957                                                                 # Conflict!
 958                                                                 $conflict->{$gst} = 1;
 959                                                                 $still_contig->{$h} = '';
 960                                                         } else {
 961                                                                 $still_contig->{$h} = $gst;
 962                                                         }
 963                                                         last;
 964                                                 } # else we don't need $h in this group.
 965                                         } # end foreach $v
 966                                 } # endif $h eq $root
 967             } # end foreach $gst
 968         } # end foreach $h
 969
 970         # Now we have some hypothetical vertices in $still_contig that are the
 971         # "real" group memberships.  Replace these in $contig.
 972                 foreach my $v ( keys %$contig ) {
 973                         next unless ref $contig->{$v};
 974                         $contig->{$v} = $still_contig->{$v};
 975                 }
 976     } # end if @resolve
 977
 978     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 979         my $variant_row = [ [], $is_genealogical ];
 980         # Fill in the groupings from $contig.
 981         foreach my $g ( @$groups ) {
 982         my $gst = wit_stringify( $g );
 983         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 984         push( @{$variant_row->[0]}, \@realgroup );
 985     }
 986     return $variant_row;
 987 }
 988
 989 sub _prune_group {
 990         my( $group, $stemma, $graph ) = @_;
 991         my $lacunose = {};
 992         map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
 993         map { $lacunose->{$_} = 0 } @$group;
 994         # Make our subgraph
 995         my $subgraph = $graph->copy;
 996         map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
 997                 $subgraph->vertices;
 998         # ...and find the root.
 999         # Now prune and return the remaining vertices.
1000         _prune_subtree( $subgraph, $lacunose );
1001         return $subgraph->vertices;
1002 }
1003
1004 sub _prune_subtree {
1005         my( $tree, $lacunose ) = @_;
1006
1007         # Delete lacunose witnesses that have no successors
1008     my @orphan_hypotheticals;
1009     my $ctr = 0;
1010     do {
1011         die "Infinite loop on leaves" if $ctr > 100;
1012         @orphan_hypotheticals = grep { $lacunose->{$_} }
1013                 $tree->successorless_vertices;
1014         $tree->delete_vertices( @orphan_hypotheticals );
1015         $ctr++;
1016     } while( @orphan_hypotheticals );
1017
1018         # Delete lacunose roots that have a single successor
1019         my @redundant_root;
1020         $ctr = 0;
1021         do {
1022         die "Infinite loop on roots" if $ctr > 100;
1023                 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1024                         $tree->predecessorless_vertices;
1025                 $tree->delete_vertices( @redundant_root );
1026                 $ctr++;
1027         } while( @redundant_root );
1028 }
1029
1030 sub _prune_subtree_old {
1031     my( $tree, $root, $contighash ) = @_;
1032     # First, delete hypothetical leaves / orphans until there are none left.
1033     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1034         $tree->successorless_vertices;
1035     while( @orphan_hypotheticals ) {
1036         $tree->delete_vertices( @orphan_hypotheticals );
1037         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1038             $tree->successorless_vertices;
1039     }
1040     # Then delete a hypothetical root with only one successor, moving the
1041     # root to the first child that has no other predecessors.
1042     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1043         my @nextroot = $tree->successors( $root );
1044         $tree->delete_vertex( $root );
1045         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
1046     }
1047     # The tree has been modified in place, but we need to know the new root.
1048     $root = undef unless $root && $tree->has_vertex( $root );
1049     return $root;
1050 }
1051 # Add the variant, subject to a.c. representation logic.
1052 # This assumes that we will see the 'main' version before the a.c. version.
1053 sub add_variant_wit {
1054     my( $arr, $wit, $acstr ) = @_;
1055     my $skip;
1056     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1057         my $real = $1;
1058         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1059     }
1060     push( @$arr, $wit ) unless $skip;
1061 }
1062
1063 sub _useful_variant {
1064         my( $group_readings, $graph, $acstr ) = @_;
1065
1066         # TODO Decide what to do with AC witnesses
1067
1068         # Sort by group size and return
1069         my $is_useful = 0;
1070         my( @readings, @groups );   # The sorted groups for our answer.
1071         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1072                 keys %$group_readings ) {
1073                 push( @readings, $rdg );
1074                 push( @groups, $group_readings->{$rdg} );
1075                 if( @{$group_readings->{$rdg}} > 1 ) {
1076                         $is_useful++;
1077                 } else {
1078                         my( $wit ) = @{$group_readings->{$rdg}};
1079                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1080                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1081                 }
1082         }
1083         if( $is_useful > 1 ) {
1084                 return( \@readings, \@groups );
1085         } else {
1086                 return( [], [] );
1087         }
1088 }
1089
1090 =head2 wit_stringify( $groups )
1091
1092 Takes an array of witness groupings and produces a string like
1093 ['A','B'] / ['C','D','E'] / ['F']
1094
1095 =cut
1096
1097 sub wit_stringify {
1098     my $groups = shift;
1099     my @gst;
1100     # If we were passed an array of witnesses instead of an array of
1101     # groupings, then "group" the witnesses first.
1102     unless( ref( $groups->[0] ) ) {
1103         my $mkgrp = [ $groups ];
1104         $groups = $mkgrp;
1105     }
1106     foreach my $g ( @$groups ) {
1107         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1108     }
1109     return join( ' / ', @gst );
1110 }
1111
1112 sub _symmdiff {
1113         my( $lista, $listb ) = @_;
1114         my %union;
1115         my %scalars;
1116         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1117         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1118         my @set = grep { $union{$_} == 1 } keys %union;
1119         return map { $scalars{$_} } @set;
1120 }
1121
1122 1;
1123
1124 =head1 LICENSE
1125
1126 This package is free software and is provided "as is" without express
1127 or implied warranty.  You can redistribute it and/or modify it under
1128 the same terms as Perl itself.
1129
1130 =head1 AUTHOR
1131
1132 Tara L Andrews E<lt>aurum@cpan.orgE<gt>