lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 my $c = $tradition->collation;
 108 foreach my $row ( @{$data->{'variants'}} ) {
 109         # Account for rows that used to be "not useful"
 110         unless( exists $expected_genealogical{$row->{'id'}} ) {
 111                 $expected_genealogical{$row->{'id'}} = 1;
 112         }
 113         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 114                 "Got correct genealogical flag for row " . $row->{'id'} );
 115         # Check that we have the right row with the right groups
 116         my $rank = $row->{'id'};
 117         foreach my $rdghash ( @{$row->{'readings'}} ) {
 118                 # Skip 'readings' that aren't really
 119                 next unless $c->reading( $rdghash->{'readingid'} );
 120                 # Check the rank
 121                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 122                         "Got correct reading rank" );
 123                 # Check the witnesses
 124                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 125                 my @sgrp = sort @{$rdghash->{'group'}};
 126                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 127         }
 128 }
 129 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 130 # TODO Make something meaningful of conflict count, maybe test other bits
 131
 132 =end testing
 133
 134 =cut
 135
 136 sub run_analysis {
 137         my( $tradition, %opts ) = @_;
 138         my $c = $tradition->collation;
 139
 140         my $stemma_id = $opts{'stemma_id'} || 0;
 141         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 142         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 143
 144         # Get the stemma
 145         my $stemma = $tradition->stemma( $stemma_id );
 146
 147         # Figure out which witnesses we are working with - that is, the ones that
 148         # appear both in the stemma and in the tradition. All others are 'lacunose'
 149         # for our purposes.
 150         my @lacunose = $stemma->hypotheticals;
 151         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 152         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 153
 154         # Find and mark 'common' ranks for exclusion, unless they were
 155         # explicitly specified.
 156         unless( @ranks ) {
 157                 my %common_rank;
 158                 foreach my $rdg ( $c->common_readings ) {
 159                         $common_rank{$rdg->rank} = 1;
 160                 }
 161                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 162         }
 163
 164         # Group the variants to send to the solver
 165         my @groups;
 166         my @use_ranks;
 167         my %lacunae;
 168         foreach my $rank ( @ranks ) {
 169                 my $missing = [ @lacunose ];
 170                 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
 171                 if( $opts{'exclude_type1'} ) {
 172                         # Check to see whether this is a "useful" group.
 173                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 174                                 $stemma->graph, $c->ac_label );
 175                         next unless @$rdgs;
 176                 }
 177                 push( @use_ranks, $rank );
 178                 push( @groups, $rankgroup );
 179                 $lacunae{$rank} = $missing;
 180         }
 181         # Run the solver
 182         my $answer = solve_variants( $stemma, @groups );
 183
 184         # Do further analysis on the answer
 185         my $conflict_count = 0;
 186         my $aclabel = $c->ac_label;
 187         foreach my $idx ( 0 .. $#use_ranks ) {
 188                 my $location = $answer->{'variants'}->[$idx];
 189                 # Add the rank back in
 190                 $location->{'id'} = $use_ranks[$idx];
 191                 # Note what our lacunae are
 192                 my %lmiss;
 193                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 194                 # Run through the reading groups and add as 'lacunae' any redundant
 195                 # a.c. witnesses (yes, we have to do this before the analysis, thus
 196                 # identical loops before and after. Boo.)
 197                 # TODO Consider making these callbacks to analyze_location
 198                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 199                         my %rwits;
 200                         map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
 201                         foreach my $rw ( keys %rwits ) {
 202                                 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
 203                                         if( exists $rwits{$1} ) {
 204                                                 $lmiss{$rw} = 1;
 205                                                 delete $rwits{$rw};
 206                                         }
 207                                 }
 208                         }
 209                         $rdghash->{'group'} = [ keys %rwits ];
 210                 }
 211                 $location->{'missing'} = [ keys %lmiss ];
 212
 213                 # Run the extra analysis we need.
 214                 analyze_location( $tradition, $stemma->graph, $location );
 215
 216                 # Do the final post-analysis tidying up of the data.
 217                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 218                         $conflict_count++
 219                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 220                         # Add the reading text back in
 221                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 222                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 223                         # Remove lacunose witnesses from this reading's list now that the
 224                         # analysis is done
 225                         my @realgroup;
 226                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 227                         $rdghash->{'group'} = \@realgroup;
 228                         # TODO Record hypotheticals used to create group, if we end up
 229                         # needing it
 230                 }
 231         }
 232         $answer->{'conflict_count'} = $conflict_count;
 233
 234         return $answer;
 235 }
 236
 237 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 238
 239 Groups the variants at the given $rank of the collation, treating any
 240 relationships in @merge_relationship_types as equivalent.  $lacunose should
 241 be a reference to an array, to which the sigla of lacunose witnesses at this
 242 rank will be appended.
 243
 244 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 245 in $group_readings->{$rdg}.
 246
 247 =cut
 248
 249 # Return group_readings, groups, lacunose
 250 sub group_variants {
 251         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 252         my $c = $tradition->collation;
 253         my $aclabel = $c->ac_label;
 254
 255         # Get the alignment table readings
 256         my %readings_at_rank;
 257         my %is_lacunose; # lookup table for $lacunose
 258         map { $is_lacunose{$_} = 1 } @$lacunose;
 259         my @gap_wits;
 260         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 261                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 262                 my $wit = $tablewit->{'witness'};
 263                 # Exclude the witness if it is "lacunose" which if we got here
 264                 # means "not in the stemma".
 265                 next if $is_lacunose{$wit};
 266                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 267                         _add_to_witlist( $wit, $lacunose, $aclabel );
 268                 } elsif( $rdg ) {
 269                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 270                 } else {
 271                         _add_to_witlist( $wit, \@gap_wits, $aclabel );
 272                 }
 273         }
 274
 275         # Group the readings, collapsing groups by relationship if needed
 276         my %grouped_readings;
 277         foreach my $rdg ( values %readings_at_rank ) {
 278                 # Skip readings that have been collapsed into others.
 279                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 280                 # Get the witness list, including from readings collapsed into this one.
 281                 my @wits = $rdg->witnesses;
 282                 if( $collapse ) {
 283                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 284                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 285                                 my @otherwits = $other->witnesses;
 286                                 push( @wits, @otherwits );
 287                                 $grouped_readings{$other->id} = 0;
 288                         }
 289                 }
 290                 # Filter the group to those witnesses in the stemma
 291                 my @use_wits;
 292                 foreach my $wit ( @wits ) {
 293                         next if $is_lacunose{$wit};
 294                         push( @use_wits, $wit );
 295                 }
 296                 $grouped_readings{$rdg->id} = \@use_wits;
 297         }
 298         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 299         # Get rid of our collapsed readings
 300         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 301                 keys %grouped_readings
 302                 if $collapse;
 303
 304         # Return the result
 305         return \%grouped_readings;
 306 }
 307
 308 # Helper function to ensure that X and X a.c. never appear in the same list.
 309 sub _add_to_witlist {
 310         my( $wit, $list, $acstr ) = @_;
 311         my %inlist;
 312         my $idx = 0;
 313         map { $inlist{$_} = $idx++ } @$list;
 314         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 315                 my $acwit = $1;
 316                 unless( exists $inlist{$acwit} ) {
 317                         push( @$list, $acwit.$acstr );
 318                 }
 319         } else {
 320                 if( exists( $inlist{$wit.$acstr} ) ) {
 321                         # Replace the a.c. version with the main witness
 322                         my $i = $inlist{$wit.$acstr};
 323                         $list->[$i] = $wit;
 324                 } else {
 325                         push( @$list, $wit );
 326                 }
 327         }
 328 }
 329
 330 =head2 solve_variants( $graph, @groups )
 331
 332 Sends the set of groups to the external graph solver service and returns
 333 a cleaned-up answer, adding the rank IDs back where they belong.
 334
 335 The JSON has the form
 336   { "graph": [ stemmagraph DOT string without newlines ],
 337     "groupings": [ array of arrays of groups, one per rank ] }
 338
 339 The answer has the form
 340   { "variants" => [ array of variant location structures ],
 341     "variant_count" => total,
 342     "conflict_count" => number of conflicts detected,
 343     "genealogical_count" => number of solutions found }
 344
 345 =cut
 346
 347 sub solve_variants {
 348         my( $stemma, @groups ) = @_;
 349         my $aclabel = $stemma->collation->ac_label;
 350
 351         # Filter the groups down to distinct groups, and work out what graph
 352         # should be used in the calculation of each group. We want to send each
 353         # distinct problem to the solver only once.
 354         # We need a whole bunch of lookup tables for this.
 355         my $index_groupkeys = {};       # Save the order of readings
 356         my $group_indices = {};         # Save the indices that have a given grouping
 357         my $graph_problems = {};        # Save the groupings for the given graph
 358
 359         foreach my $idx ( 0..$#groups ) {
 360                 my $ghash = $groups[$idx];
 361                 my @grouping;
 362                 # Sort the groupings from big to little, and scan for a.c. witnesses
 363                 # that would need an extended graph.
 364                 my @acwits;   # note which AC witnesses crop up at this rank
 365                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 366                         keys %$ghash;
 367                 foreach my $rdg ( @idxkeys ) {
 368                         my @sg = sort @{$ghash->{$rdg}};
 369                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 370                         push( @grouping, \@sg );
 371                 }
 372                 # Save the reading order
 373                 $index_groupkeys->{$idx} = \@idxkeys;
 374
 375                 # Now associate the distinct group with this index
 376                 my $gstr = wit_stringify( \@grouping );
 377                 push( @{$group_indices->{$gstr}}, $idx );
 378
 379                 # Finally, add the group to the list to be calculated for this graph.
 380                 map { s/\Q$aclabel\E$// } @acwits;
 381                 my $graph = $stemma->extend_graph( \@acwits );
 382                 unless( exists $graph_problems->{"$graph"} ) {
 383                         $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
 384                 }
 385                 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
 386         }
 387
 388         ## For each distinct graph, send its groups to the solver.
 389         $DB::single = 1;
 390         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 391         my $ua = LWP::UserAgent->new();
 392         ## Witness map is a HACK to get around limitations in node names from IDP
 393         my $witness_map = {};
 394         ## Variables to store answers as they come back
 395         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 396         my $genealogical = 0;
 397         foreach my $graphkey ( keys %$graph_problems ) {
 398                 my $graph = $graph_problems->{$graphkey}->{'object'};
 399                 my $groupings = $graph_problems->{$graphkey}->{'groups'};
 400                 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
 401                         $groupings, $witness_map ) );
 402                 # Send it off and get the result
 403                 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 404                                                           'Content' => $json );
 405                 my $answer;
 406                 my $used_idp;
 407                 if( $resp->is_success ) {
 408                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 409                         $used_idp = 1;
 410                 } else {
 411                         # Fall back to the old method.
 412                         warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 413                                 . "; falling back to perl method";
 414                         $answer = perl_solver( $graph, @$groupings );
 415                 }
 416                 ## The answer is the evaluated groupings, plus a boolean for whether
 417                 ## they were genealogical.  Reconstruct our original groups.
 418                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 419                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 420                         if( $result ) {
 421                                 $genealogical++;
 422                                 # Prune the calculated groups, in case the IDP solver failed to.
 423                                 if( $used_idp ) {
 424                                         my @pruned_groups;
 425                                         foreach my $cg ( @$calc_groups ) {
 426                                                 # This is a little wasteful but the path of least
 427                                                 # resistance. Send both the stemma, which knows what
 428                                                 # its hypotheticals are, and the actual graph used.
 429                                                 my @pg = _prune_group( $cg, $stemma, $graph );
 430                                                 push( @pruned_groups, \@pg );
 431                                         }
 432                                         $calc_groups = \@pruned_groups;
 433                                 }
 434                         }
 435                         # Retrieve the key for the original group that went to the solver
 436                         my $input_group = wit_stringify( $groupings->[$gidx] );
 437                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 438                                 my @readings = @{$index_groupkeys->{$oidx}};
 439                                 my $vstruct = {
 440                                         'genealogical' => $result,
 441                                         'readings' => [],
 442                                 };
 443                                 foreach my $ridx ( 0 .. $#readings ) {
 444                                         push( @{$vstruct->{'readings'}},
 445                                                 { 'readingid' => $readings[$ridx],
 446                                                   'group' => $calc_groups->[$ridx] } );
 447                                 }
 448                                 $variants->[$oidx] = $vstruct;
 449                         }
 450                 }
 451         }
 452
 453         return { 'variants' => $variants,
 454                          'variant_count' => scalar @$variants,
 455                          'genealogical_count' => $genealogical };
 456 }
 457
 458 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 459
 460 sub _safe_wit_strings {
 461         my( $graph, $c, $groupings, $witness_map ) = @_;
 462         # Parse the graph we were given into a stemma.
 463         my $safegraph = Graph->new();
 464         # Convert the graph to a safe representation and store the conversion.
 465         foreach my $n ( $graph->vertices ) {
 466                 my $sn = _safe_witstr( $n );
 467                 if( exists $witness_map->{$sn} ) {
 468                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 469                                 if $witness_map->{$sn} ne $n;
 470                 } else {
 471                         $witness_map->{$sn} = $n;
 472                 }
 473                 $safegraph->add_vertex( $sn );
 474                 $safegraph->set_vertex_attributes( $sn,
 475                         $graph->get_vertex_attributes( $n ) );
 476         }
 477         foreach my $e ( $graph->edges ) {
 478                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 479                 $safegraph->add_edge( @safe_e );
 480         }
 481         my $safe_stemma = Text::Tradition::Stemma->new(
 482                 'collation' => $c, 'graph' => $safegraph );
 483
 484         # Now convert the witness groupings to a safe representation.
 485         my $safe_groupings = [];
 486         foreach my $grouping ( @$groupings ) {
 487                 my $safe_grouping = [];
 488                 foreach my $group ( @$grouping ) {
 489                         my $safe_group = [];
 490                         foreach my $n ( @$group ) {
 491                                 my $sn = _safe_witstr( $n );
 492                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 493                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 494                                 $witness_map->{$sn} = $n;
 495                                 push( @$safe_group, $sn );
 496                         }
 497                         push( @$safe_grouping, $safe_group );
 498                 }
 499                 push( @$safe_groupings, $safe_grouping );
 500         }
 501
 502         # Return it all in the struct we expect.  We have stored the reductions
 503         # in the $witness_map that we were passed.
 504         return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
 505                          'groupings' => $safe_groupings };
 506 }
 507
 508 sub _safe_witstr {
 509         my $witstr = shift;
 510         $witstr =~ s/\s+/_/g;
 511         $witstr =~ s/[^\w\d-]//g;
 512         return $witstr;
 513 }
 514
 515 sub _desanitize_names {
 516         my( $jsonstruct, $witness_map ) = @_;
 517         my $result = [];
 518         foreach my $grouping ( @$jsonstruct ) {
 519                 my $real_grouping = [];
 520                 foreach my $element ( @$grouping ) {
 521                         if( ref( $element ) eq 'ARRAY' ) {
 522                                 # it's the groupset.
 523                                 my $real_groupset = [];
 524                                 foreach my $group ( @$element ) {
 525                                         my $real_group = [];
 526                                         foreach my $n ( @$group ) {
 527                                                 my $rn = $witness_map->{$n};
 528                                                 push( @$real_group, $rn );
 529                                         }
 530                                         push( @$real_groupset, $real_group );
 531                                 }
 532                                 push( @$real_grouping, $real_groupset );
 533                         } else {
 534                                 # It is the boolean, not actually a group.
 535                                 push( @$real_grouping, $element );
 536                         }
 537                 }
 538                 push( @$result, $real_grouping );
 539         }
 540         return $result;
 541 }
 542
 543 ### END HACKERY ###
 544
 545 =head2 analyze_location ( $tradition, $graph, $location_hash )
 546
 547 Given the tradition, its stemma graph, and the solution from the graph solver,
 548 work out the rest of the information we want.  For each reading we need missing,
 549 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 550
 551 =cut
 552
 553 sub analyze_location {
 554         my ( $tradition, $graph, $variant_row ) = @_;
 555
 556         # Make a hash of all known node memberships, and make the subgraphs.
 557         my $contig = {};
 558         my $reading_roots = {};
 559         my $subgraph = {};
 560     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 561         my $rid = $rdghash->{'readingid'};
 562                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 563
 564         # Make the subgraph.
 565         my $part = $graph->copy;
 566                 my %these_vertices;
 567                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 568                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 569                 $subgraph->{$rid} = $part;
 570                 # Get the reading roots.
 571                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 572         }
 573
 574         # Now that we have all the node group memberships, calculate followed/
 575     # non-followed/unknown values for each reading.  Also figure out the
 576     # reading's evident parent(s).
 577     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 578         # Group string key - TODO do we need this?
 579         my $gst = wit_stringify( $rdghash->{'group'} );
 580         my $rid = $rdghash->{'readingid'};
 581         # Get the subgraph
 582         my $part = $subgraph->{$rid};
 583
 584         # Start figuring things out.
 585         my @roots = $part->predecessorless_vertices;
 586         $rdghash->{'independent_occurrence'} = scalar @roots;
 587         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 588         # Find the parent readings, if any, of this reading.
 589         my %rdgparents;
 590         foreach my $wit ( @roots ) {
 591                 # Look in the main stemma to find this witness's extant or known-reading
 592                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 593                         my @check = $graph->predecessors( $wit );
 594                         while( @check ) {
 595                                 my @next;
 596                                 foreach my $wparent( @check ) {
 597                                         my $preading = $contig->{$wparent};
 598                                         if( $preading ) {
 599                                                 $rdgparents{$preading} = 1;
 600                                         } else {
 601                                                 push( @next, $graph->predecessors( $wparent ) );
 602                                         }
 603                                 }
 604                                 @check = @next;
 605                         }
 606                 }
 607                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 608
 609                 # Find the number of times this reading was altered, and the number of
 610                 # times we're not sure.
 611                 my( %nofollow, %unknownfollow );
 612                 foreach my $wit ( $part->vertices ) {
 613                         foreach my $wchild ( $graph->successors( $wit ) ) {
 614                                 next if $part->has_vertex( $wchild );
 615                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 616                                         # It definitely changed here.
 617                                         $nofollow{$wchild} = 1;
 618                                 } elsif( !($contig->{$wchild}) ) {
 619                                         # The child is a hypothetical node not definitely in
 620                                         # any group. Answer is unknown.
 621                                         $unknownfollow{$wchild} = 1;
 622                                 } # else it's a non-root node in a known group, and therefore
 623                                   # is presumed to have its reading from its group, not this link.
 624                         }
 625                 }
 626                 $rdghash->{'not_followed'} = keys %nofollow;
 627                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 628
 629                 # Now say whether this reading represents a conflict.
 630                 unless( $variant_row->{'genealogical'} ) {
 631                         $rdghash->{'conflict'} = @roots != 1;
 632                 }
 633     }
 634 }
 635
 636
 637 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 638
 639 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 640 to be correct.  Serves as a backup for the real solver.
 641
 642 Runs an analysis of the given tradition, at the location given in $rank,
 643 against the graph of the stemma specified in $stemma_id.  The argument
 644 @merge_relationship_types is an optional list of relationship types for
 645 which readings so related should be treated as equivalent.
 646
 647 Returns a nested array data structure as follows:
 648
 649  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 650
 651 where the group list is the array of arrays passed in for each element of @groups,
 652 possibly with the addition of hypothetical readings.
 653
 654
 655 =cut
 656
 657 sub perl_solver {
 658         my( $graph, @groups ) = @_;
 659         my @answer;
 660         foreach my $g ( @groups ) {
 661                 push( @answer, _solve_variant_location( $graph, $g ) );
 662         }
 663         return \@answer;
 664 }
 665
 666 sub _solve_variant_location {
 667         my( $graph, $groups ) = @_;
 668         # Now do the work.
 669     my $contig = {};
 670     my $subgraph = {};
 671     my $is_conflicted;
 672     my $conflict = {};
 673
 674     # Mark each ms as in its own group, first.
 675     foreach my $g ( @$groups ) {
 676         my $gst = wit_stringify( $g );
 677         map { $contig->{$_} = $gst } @$g;
 678     }
 679
 680     # Now for each unmarked node in the graph, initialize an array
 681     # for possible group memberships.  We will use this later to
 682     # resolve potential conflicts.
 683     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 684     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 685         my $gst = wit_stringify( $g );  # This is the group name
 686         # Copy the graph, and delete all non-members from the new graph.
 687         my $part = $graph->copy;
 688         my @group_roots;
 689         $part->delete_vertices(
 690             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 691
 692         # Now look to see if our group is connected.
 693                 if( @$g > 1 ) {
 694                         # We have to take directionality into account.
 695                         # How many root nodes do we have?
 696                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 697                                 $part->predecessorless_vertices;
 698                         # Assuming that @$g > 1, find the first root node that has at
 699                         # least one successor belonging to our group. If this reading
 700                         # is genealogical, there should be only one, but we will check
 701                         # that implicitly later.
 702                         foreach my $root ( @roots ) {
 703                                 # Prune the tree to get rid of extraneous hypotheticals.
 704                                 $root = _prune_subtree( $part, $root, $contig );
 705                                 next unless $root;
 706                                 # Save this root for our group.
 707                                 push( @group_roots, $root );
 708                                 # Get all the successor nodes of our root.
 709                         }
 710                 } else {
 711                         # Dispense with the trivial case of one reading.
 712                         my $wit = $g->[0];
 713                         @group_roots = ( $wit );
 714                         foreach my $v ( $part->vertices ) {
 715                                 $part->delete_vertex( $v ) unless $v eq $wit;
 716                         }
 717         }
 718
 719         if( @group_roots > 1 ) {
 720                 $conflict->{$gst} = 1;
 721                 $is_conflicted = 1;
 722         }
 723         # Paint the 'hypotheticals' with our group.
 724                 foreach my $wit ( $part->vertices ) {
 725                         if( ref( $contig->{$wit} ) ) {
 726                                 push( @{$contig->{$wit}}, $gst );
 727                         } elsif( $contig->{$wit} ne $gst ) {
 728                                 warn "How did we get here?";
 729                         }
 730                 }
 731
 732
 733                 # Save the relevant subgraph.
 734                 $subgraph->{$gst} = $part;
 735     }
 736
 737         # For each of our hypothetical readings, flatten its 'contig' array if
 738         # the array contains zero or one group.  If we have any unflattened arrays,
 739         # we may need to run the resolution process. If the reading is already known
 740         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 741         # it.
 742         my @resolve;
 743         foreach my $wit ( keys %$contig ) {
 744                 next unless ref( $contig->{$wit} );
 745                 if( @{$contig->{$wit}} > 1 ) {
 746                         if( $is_conflicted ) {
 747                                 $contig->{$wit} = '';  # We aren't going to decide.
 748                         } else {
 749                                 push( @resolve, $wit );
 750                         }
 751                 } else {
 752                         my $gst = pop @{$contig->{$wit}};
 753                         $contig->{$wit} = $gst || '';
 754                 }
 755         }
 756
 757     if( @resolve ) {
 758         my $still_contig = {};
 759         foreach my $h ( @resolve ) {
 760             # For each of the hypothetical readings with more than one possibility,
 761             # try deleting it from each of its member subgraphs in turn, and see
 762             # if that breaks the contiguous grouping.
 763             # TODO This can still break in a corner case where group A can use
 764             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 765             # Revisit this if necessary; it could get brute-force nasty.
 766             foreach my $gst ( @{$contig->{$h}} ) {
 767                 my $gpart = $subgraph->{$gst}->copy();
 768                 # If we have come this far, there is only one root and everything
 769                 # is reachable from it.
 770                 my( $root ) = $gpart->predecessorless_vertices;
 771                 my $reachable = {};
 772                 map { $reachable->{$_} = 1 } $gpart->vertices;
 773
 774                 # Try deleting the hypothetical node.
 775                 $gpart->delete_vertex( $h );
 776                 if( $h eq $root ) {
 777                         # See if we still have a single root.
 778                         my @roots = $gpart->predecessorless_vertices;
 779                         warn "This shouldn't have happened" unless @roots;
 780                         if( @roots > 1 ) {
 781                                 # $h is needed by this group.
 782                                 if( exists( $still_contig->{$h} ) ) {
 783                                         # Conflict!
 784                                         $conflict->{$gst} = 1;
 785                                         $still_contig->{$h} = '';
 786                                 } else {
 787                                         $still_contig->{$h} = $gst;
 788                                 }
 789                         }
 790                 } else {
 791                         # $h is somewhere in the middle. See if everything
 792                         # else can still be reached from the root.
 793                                         my %still_reachable = ( $root => 1 );
 794                                         map { $still_reachable{$_} = 1 }
 795                                                 $gpart->all_successors( $root );
 796                                         foreach my $v ( keys %$reachable ) {
 797                                                 next if $v eq $h;
 798                                                 if( !$still_reachable{$v}
 799                                                         && ( $contig->{$v} eq $gst
 800                                                                  || ( exists $still_contig->{$v}
 801                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 802                                                         # We need $h.
 803                                                         if( exists $still_contig->{$h} ) {
 804                                                                 # Conflict!
 805                                                                 $conflict->{$gst} = 1;
 806                                                                 $still_contig->{$h} = '';
 807                                                         } else {
 808                                                                 $still_contig->{$h} = $gst;
 809                                                         }
 810                                                         last;
 811                                                 } # else we don't need $h in this group.
 812                                         } # end foreach $v
 813                                 } # endif $h eq $root
 814             } # end foreach $gst
 815         } # end foreach $h
 816
 817         # Now we have some hypothetical vertices in $still_contig that are the
 818         # "real" group memberships.  Replace these in $contig.
 819                 foreach my $v ( keys %$contig ) {
 820                         next unless ref $contig->{$v};
 821                         $contig->{$v} = $still_contig->{$v};
 822                 }
 823     } # end if @resolve
 824
 825     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 826         my $variant_row = [ [], $is_genealogical ];
 827         # Fill in the groupings from $contig.
 828         foreach my $g ( @$groups ) {
 829         my $gst = wit_stringify( $g );
 830         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 831         push( @{$variant_row->[0]}, \@realgroup );
 832     }
 833     return $variant_row;
 834 }
 835
 836 sub _prune_group {
 837         my( $group, $stemma, $graph ) = @_;
 838         # Get these into a form prune_subtree will recognize. Make a "contighash"
 839         my $hypohash = {};
 840         map { $hypohash->{$_} = 1 } @$group;
 841         # ...with reference values for hypotheticals.
 842         map { $hypohash->{$_} = [] } $stemma->hypotheticals;
 843         # Make our subgraph
 844         my $subgraph = $graph->copy;
 845         map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
 846                 $subgraph->vertices;
 847         # ...and find the root.
 848         my( $root ) = $subgraph->predecessorless_vertices;
 849         # Now prune and return the remaining vertices.
 850         _prune_subtree( $subgraph, $root, $hypohash );
 851         return $subgraph->vertices;
 852 }
 853
 854 sub _prune_subtree {
 855     my( $tree, $root, $contighash ) = @_;
 856     # First, delete hypothetical leaves / orphans until there are none left.
 857     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 858         $tree->successorless_vertices;
 859     while( @orphan_hypotheticals ) {
 860         $tree->delete_vertices( @orphan_hypotheticals );
 861         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 862             $tree->successorless_vertices;
 863     }
 864     # Then delete a hypothetical root with only one successor, moving the
 865     # root to the first child that has no other predecessors.
 866     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 867         my @nextroot = $tree->successors( $root );
 868         $tree->delete_vertex( $root );
 869         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 870     }
 871     # The tree has been modified in place, but we need to know the new root.
 872     $root = undef unless $root && $tree->has_vertex( $root );
 873     return $root;
 874 }
 875 # Add the variant, subject to a.c. representation logic.
 876 # This assumes that we will see the 'main' version before the a.c. version.
 877 sub add_variant_wit {
 878     my( $arr, $wit, $acstr ) = @_;
 879     my $skip;
 880     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 881         my $real = $1;
 882         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 883     }
 884     push( @$arr, $wit ) unless $skip;
 885 }
 886
 887 sub _useful_variant {
 888         my( $group_readings, $graph, $acstr ) = @_;
 889
 890         # TODO Decide what to do with AC witnesses
 891
 892         # Sort by group size and return
 893         my $is_useful = 0;
 894         my( @readings, @groups );   # The sorted groups for our answer.
 895         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 896                 keys %$group_readings ) {
 897                 push( @readings, $rdg );
 898                 push( @groups, $group_readings->{$rdg} );
 899                 if( @{$group_readings->{$rdg}} > 1 ) {
 900                         $is_useful++;
 901                 } else {
 902                         my( $wit ) = @{$group_readings->{$rdg}};
 903                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 904                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 905                 }
 906         }
 907         if( $is_useful > 1 ) {
 908                 return( \@readings, \@groups );
 909         } else {
 910                 return( [], [] );
 911         }
 912 }
 913
 914 =head2 wit_stringify( $groups )
 915
 916 Takes an array of witness groupings and produces a string like
 917 ['A','B'] / ['C','D','E'] / ['F']
 918
 919 =cut
 920
 921 sub wit_stringify {
 922     my $groups = shift;
 923     my @gst;
 924     # If we were passed an array of witnesses instead of an array of
 925     # groupings, then "group" the witnesses first.
 926     unless( ref( $groups->[0] ) ) {
 927         my $mkgrp = [ $groups ];
 928         $groups = $mkgrp;
 929     }
 930     foreach my $g ( @$groups ) {
 931         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 932     }
 933     return join( ' / ', @gst );
 934 }
 935
 936 sub _symmdiff {
 937         my( $lista, $listb ) = @_;
 938         my %union;
 939         my %scalars;
 940         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 941         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 942         my @set = grep { $union{$_} == 1 } keys %union;
 943         return map { $scalars{$_} } @set;
 944 }
 945
 946 1;
 947
 948 =head1 LICENSE
 949
 950 This package is free software and is provided "as is" without express
 951 or implied warranty.  You can redistribute it and/or modify it under
 952 the same terms as Perl itself.
 953
 954 =head1 AUTHOR
 955
 956 Tara L Andrews E<lt>aurum@cpan.orgE<gt>