lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 my $c = $tradition->collation;
 108 foreach my $row ( @{$data->{'variants'}} ) {
 109         # Account for rows that used to be "not useful"
 110         unless( exists $expected_genealogical{$row->{'id'}} ) {
 111                 $expected_genealogical{$row->{'id'}} = 1;
 112         }
 113         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 114         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 115                 "Got correct genealogical flag for row " . $row->{'id'} );
 116         # Check that we have the right row with the right groups
 117         my $rank = $row->{'id'};
 118         foreach my $rdghash ( @{$row->{'readings'}} ) {
 119                 # Skip 'readings' that aren't really
 120                 next unless $c->reading( $rdghash->{'readingid'} );
 121                 # Check the rank
 122                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 123                         "Got correct reading rank" );
 124                 # Check the witnesses
 125                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 126                 my @sgrp = sort @{$rdghash->{'group'}};
 127                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 128         }
 129 }
 130 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 131 # TODO Make something meaningful of conflict count, maybe test other bits
 132
 133 =end testing
 134
 135 =cut
 136
 137 sub run_analysis {
 138         my( $tradition, %opts ) = @_;
 139         my $c = $tradition->collation;
 140
 141         my $stemma_id = $opts{'stemma_id'} || 0;
 142         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 143         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 144
 145         # Get the stemma
 146         my $stemma = $tradition->stemma( $stemma_id );
 147
 148         # Figure out which witnesses we are working with - that is, the ones that
 149         # appear both in the stemma and in the tradition. All others are 'lacunose'
 150         # for our purposes.
 151         my @lacunose = $stemma->hypotheticals;
 152         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 153         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 154
 155         # Find and mark 'common' ranks for exclusion, unless they were
 156         # explicitly specified.
 157         unless( @ranks ) {
 158                 my %common_rank;
 159                 foreach my $rdg ( $c->common_readings ) {
 160                         $common_rank{$rdg->rank} = 1;
 161                 }
 162                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 163         }
 164
 165         # Group the variants to send to the solver
 166         my @groups;
 167         my @use_ranks;
 168         my %lacunae;
 169         foreach my $rank ( @ranks ) {
 170                 my $missing = [ @lacunose ];
 171                 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
 172                 if( $opts{'exclude_type1'} ) {
 173                         # Check to see whether this is a "useful" group.
 174                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 175                                 $stemma->graph, $c->ac_label );
 176                         next unless @$rdgs;
 177                 }
 178                 push( @use_ranks, $rank );
 179                 push( @groups, $rankgroup );
 180                 $lacunae{$rank} = $missing;
 181         }
 182         # Run the solver
 183         my $answer = solve_variants( $stemma, @groups );
 184
 185         # Do further analysis on the answer
 186         my $conflict_count = 0;
 187         my $aclabel = $c->ac_label;
 188         foreach my $idx ( 0 .. $#use_ranks ) {
 189                 my $location = $answer->{'variants'}->[$idx];
 190                 # Add the rank back in
 191                 $location->{'id'} = $use_ranks[$idx];
 192                 # Note what our lacunae are
 193                 my %lmiss;
 194                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 195                 # Run through the reading groups and add as 'lacunae' any redundant
 196                 # a.c. witnesses (yes, we have to do this before the analysis, thus
 197                 # identical loops before and after. Boo.)
 198                 # TODO Consider making these callbacks to analyze_location
 199                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 200                         my %rwits;
 201                         map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
 202                         foreach my $rw ( keys %rwits ) {
 203                                 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
 204                                         if( exists $rwits{$1} ) {
 205                                                 $lmiss{$rw} = 1;
 206                                                 delete $rwits{$rw};
 207                                         }
 208                                 }
 209                         }
 210                         $rdghash->{'group'} = [ keys %rwits ];
 211                 }
 212                 $location->{'missing'} = [ keys %lmiss ];
 213
 214                 # Run the extra analysis we need.
 215                 analyze_location( $tradition, $stemma->graph, $location );
 216
 217                 # Do the final post-analysis tidying up of the data.
 218                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 219                         $conflict_count++
 220                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 221                         # Add the reading text back in
 222                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 223                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 224                         # Remove lacunose witnesses from this reading's list now that the
 225                         # analysis is done
 226                         my @realgroup;
 227                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 228                         $rdghash->{'group'} = \@realgroup;
 229                         # TODO Record hypotheticals used to create group, if we end up
 230                         # needing it
 231                 }
 232         }
 233         $answer->{'conflict_count'} = $conflict_count;
 234
 235         return $answer;
 236 }
 237
 238 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 239
 240 Groups the variants at the given $rank of the collation, treating any
 241 relationships in @merge_relationship_types as equivalent.  $lacunose should
 242 be a reference to an array, to which the sigla of lacunose witnesses at this
 243 rank will be appended.
 244
 245 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 246 in $group_readings->{$rdg}.
 247
 248 =cut
 249
 250 # Return group_readings, groups, lacunose
 251 sub group_variants {
 252         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 253         my $c = $tradition->collation;
 254         my $aclabel = $c->ac_label;
 255
 256         # Get the alignment table readings
 257         my %readings_at_rank;
 258         my %is_lacunose; # lookup table for $lacunose
 259         map { $is_lacunose{$_} = 1 } @$lacunose;
 260         my @gap_wits;
 261         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 262                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 263                 my $wit = $tablewit->{'witness'};
 264                 # Exclude the witness if it is "lacunose" which if we got here
 265                 # means "not in the stemma".
 266                 next if $is_lacunose{$wit};
 267                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 268                         _add_to_witlist( $wit, $lacunose, $aclabel );
 269                 } elsif( $rdg ) {
 270                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 271                 } else {
 272                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 273                 }
 274         }
 275
 276         # Group the readings, collapsing groups by relationship if needed
 277         my %grouped_readings;
 278         foreach my $rdg ( values %readings_at_rank ) {
 279                 # Skip readings that have been collapsed into others.
 280                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 281                 # Get the witness list, including from readings collapsed into this one.
 282                 my @wits = $rdg->witnesses;
 283                 if( $collapse ) {
 284                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 285                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 286                                 my @otherwits = $other->witnesses;
 287                                 push( @wits, @otherwits );
 288                                 $grouped_readings{$other->id} = 0;
 289                         }
 290                 }
 291                 # Filter the group to those witnesses in the stemma
 292                 my @use_wits;
 293                 foreach my $wit ( @wits ) {
 294                         next if $is_lacunose{$wit};
 295                         push( @use_wits, $wit );
 296                 }
 297                 $grouped_readings{$rdg->id} = \@use_wits;
 298         }
 299         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 300         # Get rid of our collapsed readings
 301         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 302                 keys %grouped_readings
 303                 if $collapse;
 304
 305         # Return the result
 306         return \%grouped_readings;
 307 }
 308
 309 # Helper function to ensure that X and X a.c. never appear in the same list.
 310 sub _add_to_witlist {
 311         my( $wit, $list, $acstr ) = @_;
 312         my %inlist;
 313         my $idx = 0;
 314         map { $inlist{$_} = $idx++ } @$list;
 315         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 316                 my $acwit = $1;
 317                 unless( exists $inlist{$acwit} ) {
 318                         push( @$list, $acwit.$acstr );
 319                 }
 320         } else {
 321                 if( exists( $inlist{$wit.$acstr} ) ) {
 322                         # Replace the a.c. version with the main witness
 323                         my $i = $inlist{$wit.$acstr};
 324                         $list->[$i] = $wit;
 325                 } else {
 326                         push( @$list, $wit );
 327                 }
 328         }
 329 }
 330
 331 =head2 solve_variants( $graph, @groups )
 332
 333 Sends the set of groups to the external graph solver service and returns
 334 a cleaned-up answer, adding the rank IDs back where they belong.
 335
 336 The JSON has the form
 337   { "graph": [ stemmagraph DOT string without newlines ],
 338     "groupings": [ array of arrays of groups, one per rank ] }
 339
 340 The answer has the form
 341   { "variants" => [ array of variant location structures ],
 342     "variant_count" => total,
 343     "conflict_count" => number of conflicts detected,
 344     "genealogical_count" => number of solutions found }
 345
 346 =cut
 347
 348 sub solve_variants {
 349         my( $stemma, @groups ) = @_;
 350         my $aclabel = $stemma->collation->ac_label;
 351
 352         # Filter the groups down to distinct groups, and work out what graph
 353         # should be used in the calculation of each group. We want to send each
 354         # distinct problem to the solver only once.
 355         # We need a whole bunch of lookup tables for this.
 356         my $index_groupkeys = {};       # Save the order of readings
 357         my $group_indices = {};         # Save the indices that have a given grouping
 358         my $graph_problems = {};        # Save the groupings for the given graph
 359
 360         foreach my $idx ( 0..$#groups ) {
 361                 my $ghash = $groups[$idx];
 362                 my @grouping;
 363                 # Sort the groupings from big to little, and scan for a.c. witnesses
 364                 # that would need an extended graph.
 365                 my @acwits;   # note which AC witnesses crop up at this rank
 366                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 367                         keys %$ghash;
 368                 foreach my $rdg ( @idxkeys ) {
 369                         my @sg = sort @{$ghash->{$rdg}};
 370                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 371                         push( @grouping, \@sg );
 372                 }
 373                 # Save the reading order
 374                 $index_groupkeys->{$idx} = \@idxkeys;
 375
 376                 # Now associate the distinct group with this index
 377                 my $gstr = wit_stringify( \@grouping );
 378                 push( @{$group_indices->{$gstr}}, $idx );
 379
 380                 # Finally, add the group to the list to be calculated for this graph.
 381                 map { s/\Q$aclabel\E$// } @acwits;
 382                 my $graph = $stemma->extend_graph( \@acwits );
 383                 unless( exists $graph_problems->{"$graph"} ) {
 384                         $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
 385                 }
 386                 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
 387         }
 388
 389         ## For each distinct graph, send its groups to the solver.
 390         $DB::single = 1;
 391         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 392         my $ua = LWP::UserAgent->new();
 393         ## Witness map is a HACK to get around limitations in node names from IDP
 394         my $witness_map = {};
 395         ## Variables to store answers as they come back
 396         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 397         my $genealogical = 0;
 398         foreach my $graphkey ( keys %$graph_problems ) {
 399                 my $graph = $graph_problems->{$graphkey}->{'object'};
 400                 my $groupings = $graph_problems->{$graphkey}->{'groups'};
 401                 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
 402                         $groupings, $witness_map ) );
 403                 # Send it off and get the result
 404                 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 405                                                           'Content' => $json );
 406                 my $answer;
 407                 my $used_idp;
 408                 if( $resp->is_success ) {
 409                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 410                         $used_idp = 1;
 411                 } else {
 412                         # Fall back to the old method.
 413                         warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 414                                 . "; falling back to perl method";
 415                         $answer = perl_solver( $graph, @$groupings );
 416                 }
 417                 ## The answer is the evaluated groupings, plus a boolean for whether
 418                 ## they were genealogical.  Reconstruct our original groups.
 419                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 420                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 421                         if( $result ) {
 422                                 $genealogical++;
 423                                 # Prune the calculated groups, in case the IDP solver failed to.
 424                                 if( $used_idp ) {
 425                                         my @pruned_groups;
 426                                         foreach my $cg ( @$calc_groups ) {
 427                                                 # This is a little wasteful but the path of least
 428                                                 # resistance. Send both the stemma, which knows what
 429                                                 # its hypotheticals are, and the actual graph used.
 430                                                 my @pg = _prune_group( $cg, $stemma, $graph );
 431                                                 push( @pruned_groups, \@pg );
 432                                         }
 433                                         $calc_groups = \@pruned_groups;
 434                                 }
 435                         }
 436                         # Retrieve the key for the original group that went to the solver
 437                         my $input_group = wit_stringify( $groupings->[$gidx] );
 438                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 439                                 my @readings = @{$index_groupkeys->{$oidx}};
 440                                 my $vstruct = {
 441                                         'genealogical' => $result,
 442                                         'readings' => [],
 443                                 };
 444                                 foreach my $ridx ( 0 .. $#readings ) {
 445                                         push( @{$vstruct->{'readings'}},
 446                                                 { 'readingid' => $readings[$ridx],
 447                                                   'group' => $calc_groups->[$ridx] } );
 448                                 }
 449                                 $variants->[$oidx] = $vstruct;
 450                         }
 451                 }
 452         }
 453
 454         return { 'variants' => $variants,
 455                          'variant_count' => scalar @$variants,
 456                          'genealogical_count' => $genealogical };
 457 }
 458
 459 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 460
 461 sub _safe_wit_strings {
 462         my( $graph, $c, $groupings, $witness_map ) = @_;
 463         # Parse the graph we were given into a stemma.
 464         my $safegraph = Graph->new();
 465         # Convert the graph to a safe representation and store the conversion.
 466         foreach my $n ( $graph->vertices ) {
 467                 my $sn = _safe_witstr( $n );
 468                 if( exists $witness_map->{$sn} ) {
 469                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 470                                 if $witness_map->{$sn} ne $n;
 471                 } else {
 472                         $witness_map->{$sn} = $n;
 473                 }
 474                 $safegraph->add_vertex( $sn );
 475                 $safegraph->set_vertex_attributes( $sn,
 476                         $graph->get_vertex_attributes( $n ) );
 477         }
 478         foreach my $e ( $graph->edges ) {
 479                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 480                 $safegraph->add_edge( @safe_e );
 481         }
 482         my $safe_stemma = Text::Tradition::Stemma->new(
 483                 'collation' => $c, 'graph' => $safegraph );
 484
 485         # Now convert the witness groupings to a safe representation.
 486         my $safe_groupings = [];
 487         foreach my $grouping ( @$groupings ) {
 488                 my $safe_grouping = [];
 489                 foreach my $group ( @$grouping ) {
 490                         my $safe_group = [];
 491                         foreach my $n ( @$group ) {
 492                                 my $sn = _safe_witstr( $n );
 493                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 494                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 495                                 $witness_map->{$sn} = $n;
 496                                 push( @$safe_group, $sn );
 497                         }
 498                         push( @$safe_grouping, $safe_group );
 499                 }
 500                 push( @$safe_groupings, $safe_grouping );
 501         }
 502
 503         # Return it all in the struct we expect.  We have stored the reductions
 504         # in the $witness_map that we were passed.
 505         return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
 506                          'groupings' => $safe_groupings };
 507 }
 508
 509 sub _safe_witstr {
 510         my $witstr = shift;
 511         $witstr =~ s/\s+/_/g;
 512         $witstr =~ s/[^\w\d-]//g;
 513         return $witstr;
 514 }
 515
 516 sub _desanitize_names {
 517         my( $jsonstruct, $witness_map ) = @_;
 518         my $result = [];
 519         foreach my $grouping ( @$jsonstruct ) {
 520                 my $real_grouping = [];
 521                 foreach my $element ( @$grouping ) {
 522                         if( ref( $element ) eq 'ARRAY' ) {
 523                                 # it's the groupset.
 524                                 my $real_groupset = [];
 525                                 foreach my $group ( @$element ) {
 526                                         my $real_group = [];
 527                                         foreach my $n ( @$group ) {
 528                                                 my $rn = $witness_map->{$n};
 529                                                 push( @$real_group, $rn );
 530                                         }
 531                                         push( @$real_groupset, $real_group );
 532                                 }
 533                                 push( @$real_grouping, $real_groupset );
 534                         } else {
 535                                 # It is the boolean, not actually a group.
 536                                 push( @$real_grouping, $element );
 537                         }
 538                 }
 539                 push( @$result, $real_grouping );
 540         }
 541         return $result;
 542 }
 543
 544 ### END HACKERY ###
 545
 546 =head2 analyze_location ( $tradition, $graph, $location_hash )
 547
 548 Given the tradition, its stemma graph, and the solution from the graph solver,
 549 work out the rest of the information we want.  For each reading we need missing,
 550 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 551
 552 =cut
 553
 554 sub analyze_location {
 555         my ( $tradition, $graph, $variant_row ) = @_;
 556
 557         # Make a hash of all known node memberships, and make the subgraphs.
 558         my $contig = {};
 559         my $reading_roots = {};
 560         my $subgraph = {};
 561     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 562         my $rid = $rdghash->{'readingid'};
 563                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 564
 565         # Make the subgraph.
 566         my $part = $graph->copy;
 567                 my %these_vertices;
 568                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 569                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 570                 $subgraph->{$rid} = $part;
 571                 # Get the reading roots.
 572                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 573         }
 574
 575         # Now that we have all the node group memberships, calculate followed/
 576     # non-followed/unknown values for each reading.  Also figure out the
 577     # reading's evident parent(s).
 578     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 579         # Group string key - TODO do we need this?
 580         my $gst = wit_stringify( $rdghash->{'group'} );
 581         my $rid = $rdghash->{'readingid'};
 582         # Get the subgraph
 583         my $part = $subgraph->{$rid};
 584
 585         # Start figuring things out.
 586         my @roots = $part->predecessorless_vertices;
 587         $rdghash->{'independent_occurrence'} = scalar @roots;
 588         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 589         # Find the parent readings, if any, of this reading.
 590         my %rdgparents;
 591         foreach my $wit ( @roots ) {
 592                 # Look in the main stemma to find this witness's extant or known-reading
 593                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 594                         my @check = $graph->predecessors( $wit );
 595                         while( @check ) {
 596                                 my @next;
 597                                 foreach my $wparent( @check ) {
 598                                         my $preading = $contig->{$wparent};
 599                                         if( $preading ) {
 600                                                 $rdgparents{$preading} = 1;
 601                                         } else {
 602                                                 push( @next, $graph->predecessors( $wparent ) );
 603                                         }
 604                                 }
 605                                 @check = @next;
 606                         }
 607                 }
 608                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 609
 610                 # Find the number of times this reading was altered, and the number of
 611                 # times we're not sure.
 612                 my( %nofollow, %unknownfollow );
 613                 foreach my $wit ( $part->vertices ) {
 614                         foreach my $wchild ( $graph->successors( $wit ) ) {
 615                                 next if $part->has_vertex( $wchild );
 616                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 617                                         # It definitely changed here.
 618                                         $nofollow{$wchild} = 1;
 619                                 } elsif( !($contig->{$wchild}) ) {
 620                                         # The child is a hypothetical node not definitely in
 621                                         # any group. Answer is unknown.
 622                                         $unknownfollow{$wchild} = 1;
 623                                 } # else it's a non-root node in a known group, and therefore
 624                                   # is presumed to have its reading from its group, not this link.
 625                         }
 626                 }
 627                 $rdghash->{'not_followed'} = keys %nofollow;
 628                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 629
 630                 # Now say whether this reading represents a conflict.
 631                 unless( $variant_row->{'genealogical'} ) {
 632                         $rdghash->{'conflict'} = @roots != 1;
 633                 }
 634     }
 635 }
 636
 637
 638 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 639
 640 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 641 to be correct.  Serves as a backup for the real solver.
 642
 643 Runs an analysis of the given tradition, at the location given in $rank,
 644 against the graph of the stemma specified in $stemma_id.  The argument
 645 @merge_relationship_types is an optional list of relationship types for
 646 which readings so related should be treated as equivalent.
 647
 648 Returns a nested array data structure as follows:
 649
 650  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 651
 652 where the group list is the array of arrays passed in for each element of @groups,
 653 possibly with the addition of hypothetical readings.
 654
 655
 656 =cut
 657
 658 sub perl_solver {
 659         my( $graph, @groups ) = @_;
 660         my @answer;
 661         foreach my $g ( @groups ) {
 662                 push( @answer, _solve_variant_location( $graph, $g ) );
 663         }
 664         return \@answer;
 665 }
 666
 667 sub _solve_variant_location {
 668         my( $graph, $groups ) = @_;
 669         # Now do the work.
 670     my $contig = {};
 671     my $subgraph = {};
 672     my $is_conflicted;
 673     my $conflict = {};
 674
 675     # Mark each ms as in its own group, first.
 676     foreach my $g ( @$groups ) {
 677         my $gst = wit_stringify( $g );
 678         map { $contig->{$_} = $gst } @$g;
 679     }
 680
 681     # Now for each unmarked node in the graph, initialize an array
 682     # for possible group memberships.  We will use this later to
 683     # resolve potential conflicts.
 684     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 685     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 686         my $gst = wit_stringify( $g );  # This is the group name
 687         # Copy the graph, and delete all non-members from the new graph.
 688         my $part = $graph->copy;
 689         my @group_roots;
 690         $part->delete_vertices(
 691             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 692
 693         # Now look to see if our group is connected.
 694                 if( @$g > 1 ) {
 695                         # We have to take directionality into account.
 696                         # How many root nodes do we have?
 697                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 698                                 $part->predecessorless_vertices;
 699                         # Assuming that @$g > 1, find the first root node that has at
 700                         # least one successor belonging to our group. If this reading
 701                         # is genealogical, there should be only one, but we will check
 702                         # that implicitly later.
 703                         foreach my $root ( @roots ) {
 704                                 # Prune the tree to get rid of extraneous hypotheticals.
 705                                 $root = _prune_subtree( $part, $root, $contig );
 706                                 next unless $root;
 707                                 # Save this root for our group.
 708                                 push( @group_roots, $root );
 709                                 # Get all the successor nodes of our root.
 710                         }
 711                 } else {
 712                         # Dispense with the trivial case of one reading.
 713                         my $wit = $g->[0];
 714                         @group_roots = ( $wit );
 715                         foreach my $v ( $part->vertices ) {
 716                                 $part->delete_vertex( $v ) unless $v eq $wit;
 717                         }
 718         }
 719
 720         if( @group_roots > 1 ) {
 721                 $conflict->{$gst} = 1;
 722                 $is_conflicted = 1;
 723         }
 724         # Paint the 'hypotheticals' with our group.
 725                 foreach my $wit ( $part->vertices ) {
 726                         if( ref( $contig->{$wit} ) ) {
 727                                 push( @{$contig->{$wit}}, $gst );
 728                         } elsif( $contig->{$wit} ne $gst ) {
 729                                 warn "How did we get here?";
 730                         }
 731                 }
 732
 733
 734                 # Save the relevant subgraph.
 735                 $subgraph->{$gst} = $part;
 736     }
 737
 738         # For each of our hypothetical readings, flatten its 'contig' array if
 739         # the array contains zero or one group.  If we have any unflattened arrays,
 740         # we may need to run the resolution process. If the reading is already known
 741         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 742         # it.
 743         my @resolve;
 744         foreach my $wit ( keys %$contig ) {
 745                 next unless ref( $contig->{$wit} );
 746                 if( @{$contig->{$wit}} > 1 ) {
 747                         if( $is_conflicted ) {
 748                                 $contig->{$wit} = '';  # We aren't going to decide.
 749                         } else {
 750                                 push( @resolve, $wit );
 751                         }
 752                 } else {
 753                         my $gst = pop @{$contig->{$wit}};
 754                         $contig->{$wit} = $gst || '';
 755                 }
 756         }
 757
 758     if( @resolve ) {
 759         my $still_contig = {};
 760         foreach my $h ( @resolve ) {
 761             # For each of the hypothetical readings with more than one possibility,
 762             # try deleting it from each of its member subgraphs in turn, and see
 763             # if that breaks the contiguous grouping.
 764             # TODO This can still break in a corner case where group A can use
 765             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 766             # Revisit this if necessary; it could get brute-force nasty.
 767             foreach my $gst ( @{$contig->{$h}} ) {
 768                 my $gpart = $subgraph->{$gst}->copy();
 769                 # If we have come this far, there is only one root and everything
 770                 # is reachable from it.
 771                 my( $root ) = $gpart->predecessorless_vertices;
 772                 my $reachable = {};
 773                 map { $reachable->{$_} = 1 } $gpart->vertices;
 774
 775                 # Try deleting the hypothetical node.
 776                 $gpart->delete_vertex( $h );
 777                 if( $h eq $root ) {
 778                         # See if we still have a single root.
 779                         my @roots = $gpart->predecessorless_vertices;
 780                         warn "This shouldn't have happened" unless @roots;
 781                         if( @roots > 1 ) {
 782                                 # $h is needed by this group.
 783                                 if( exists( $still_contig->{$h} ) ) {
 784                                         # Conflict!
 785                                         $conflict->{$gst} = 1;
 786                                         $still_contig->{$h} = '';
 787                                 } else {
 788                                         $still_contig->{$h} = $gst;
 789                                 }
 790                         }
 791                 } else {
 792                         # $h is somewhere in the middle. See if everything
 793                         # else can still be reached from the root.
 794                                         my %still_reachable = ( $root => 1 );
 795                                         map { $still_reachable{$_} = 1 }
 796                                                 $gpart->all_successors( $root );
 797                                         foreach my $v ( keys %$reachable ) {
 798                                                 next if $v eq $h;
 799                                                 if( !$still_reachable{$v}
 800                                                         && ( $contig->{$v} eq $gst
 801                                                                  || ( exists $still_contig->{$v}
 802                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 803                                                         # We need $h.
 804                                                         if( exists $still_contig->{$h} ) {
 805                                                                 # Conflict!
 806                                                                 $conflict->{$gst} = 1;
 807                                                                 $still_contig->{$h} = '';
 808                                                         } else {
 809                                                                 $still_contig->{$h} = $gst;
 810                                                         }
 811                                                         last;
 812                                                 } # else we don't need $h in this group.
 813                                         } # end foreach $v
 814                                 } # endif $h eq $root
 815             } # end foreach $gst
 816         } # end foreach $h
 817
 818         # Now we have some hypothetical vertices in $still_contig that are the
 819         # "real" group memberships.  Replace these in $contig.
 820                 foreach my $v ( keys %$contig ) {
 821                         next unless ref $contig->{$v};
 822                         $contig->{$v} = $still_contig->{$v};
 823                 }
 824     } # end if @resolve
 825
 826     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 827         my $variant_row = [ [], $is_genealogical ];
 828         # Fill in the groupings from $contig.
 829         foreach my $g ( @$groups ) {
 830         my $gst = wit_stringify( $g );
 831         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 832         push( @{$variant_row->[0]}, \@realgroup );
 833     }
 834     return $variant_row;
 835 }
 836
 837 sub _prune_group {
 838         my( $group, $stemma, $graph ) = @_;
 839         # Get these into a form prune_subtree will recognize. Make a "contighash"
 840         my $hypohash = {};
 841         map { $hypohash->{$_} = 1 } @$group;
 842         # ...with reference values for hypotheticals.
 843         map { $hypohash->{$_} = [] } $stemma->hypotheticals;
 844         # Make our subgraph
 845         my $subgraph = $graph->copy;
 846         map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
 847                 $subgraph->vertices;
 848         # ...and find the root.
 849         my( $root ) = $subgraph->predecessorless_vertices;
 850         # Now prune and return the remaining vertices.
 851         _prune_subtree( $subgraph, $root, $hypohash );
 852         return $subgraph->vertices;
 853 }
 854
 855 sub _prune_subtree {
 856     my( $tree, $root, $contighash ) = @_;
 857     # First, delete hypothetical leaves / orphans until there are none left.
 858     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 859         $tree->successorless_vertices;
 860     while( @orphan_hypotheticals ) {
 861         $tree->delete_vertices( @orphan_hypotheticals );
 862         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 863             $tree->successorless_vertices;
 864     }
 865     # Then delete a hypothetical root with only one successor, moving the
 866     # root to the first child that has no other predecessors.
 867     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 868         my @nextroot = $tree->successors( $root );
 869         $tree->delete_vertex( $root );
 870         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 871     }
 872     # The tree has been modified in place, but we need to know the new root.
 873     $root = undef unless $root && $tree->has_vertex( $root );
 874     return $root;
 875 }
 876 # Add the variant, subject to a.c. representation logic.
 877 # This assumes that we will see the 'main' version before the a.c. version.
 878 sub add_variant_wit {
 879     my( $arr, $wit, $acstr ) = @_;
 880     my $skip;
 881     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 882         my $real = $1;
 883         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 884     }
 885     push( @$arr, $wit ) unless $skip;
 886 }
 887
 888 sub _useful_variant {
 889         my( $group_readings, $graph, $acstr ) = @_;
 890
 891         # TODO Decide what to do with AC witnesses
 892
 893         # Sort by group size and return
 894         my $is_useful = 0;
 895         my( @readings, @groups );   # The sorted groups for our answer.
 896         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 897                 keys %$group_readings ) {
 898                 push( @readings, $rdg );
 899                 push( @groups, $group_readings->{$rdg} );
 900                 if( @{$group_readings->{$rdg}} > 1 ) {
 901                         $is_useful++;
 902                 } else {
 903                         my( $wit ) = @{$group_readings->{$rdg}};
 904                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 905                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 906                 }
 907         }
 908         if( $is_useful > 1 ) {
 909                 return( \@readings, \@groups );
 910         } else {
 911                 return( [], [] );
 912         }
 913 }
 914
 915 =head2 wit_stringify( $groups )
 916
 917 Takes an array of witness groupings and produces a string like
 918 ['A','B'] / ['C','D','E'] / ['F']
 919
 920 =cut
 921
 922 sub wit_stringify {
 923     my $groups = shift;
 924     my @gst;
 925     # If we were passed an array of witnesses instead of an array of
 926     # groupings, then "group" the witnesses first.
 927     unless( ref( $groups->[0] ) ) {
 928         my $mkgrp = [ $groups ];
 929         $groups = $mkgrp;
 930     }
 931     foreach my $g ( @$groups ) {
 932         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 933     }
 934     return join( ' / ', @gst );
 935 }
 936
 937 sub _symmdiff {
 938         my( $lista, $listb ) = @_;
 939         my %union;
 940         my %scalars;
 941         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 942         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 943         my @set = grep { $union{$_} == 1 } keys %union;
 944         return map { $scalars{$_} } @set;
 945 }
 946
 947 1;
 948
 949 =head1 LICENSE
 950
 951 This package is free software and is provided "as is" without express
 952 or implied warranty.  You can redistribute it and/or modify it under
 953 the same terms as Perl itself.
 954
 955 =head1 AUTHOR
 956
 957 Tara L Andrews E<lt>aurum@cpan.orgE<gt>