lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 my $c = $tradition->collation;
 108 foreach my $row ( @{$data->{'variants'}} ) {
 109         # Account for rows that used to be "not useful"
 110         unless( exists $expected_genealogical{$row->{'id'}} ) {
 111                 $expected_genealogical{$row->{'id'}} = 1;
 112         }
 113         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 114         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 115                 "Got correct genealogical flag for row " . $row->{'id'} );
 116         # Check that we have the right row with the right groups
 117         my $rank = $row->{'id'};
 118         foreach my $rdghash ( @{$row->{'readings'}} ) {
 119                 # Skip 'readings' that aren't really
 120                 next unless $c->reading( $rdghash->{'readingid'} );
 121                 # Check the rank
 122                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 123                         "Got correct reading rank" );
 124                 # Check the witnesses
 125                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 126                 my @sgrp = sort @{$rdghash->{'group'}};
 127                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 128         }
 129 }
 130 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 131 # TODO Make something meaningful of conflict count, maybe test other bits
 132
 133 =end testing
 134
 135 =cut
 136
 137 sub run_analysis {
 138         my( $tradition, %opts ) = @_;
 139         my $c = $tradition->collation;
 140
 141         my $stemma_id = $opts{'stemma_id'} || 0;
 142         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 143         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 144
 145         # Get the stemma
 146         my $stemma = $tradition->stemma( $stemma_id );
 147
 148         # Figure out which witnesses we are working with - that is, the ones that
 149         # appear both in the stemma and in the tradition. All others are 'lacunose'
 150         # for our purposes.
 151         my @lacunose = $stemma->hypotheticals;
 152         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 153         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 154
 155         # Find and mark 'common' ranks for exclusion, unless they were
 156         # explicitly specified.
 157         unless( @ranks ) {
 158                 my %common_rank;
 159                 foreach my $rdg ( $c->common_readings ) {
 160                         $common_rank{$rdg->rank} = 1;
 161                 }
 162                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 163         }
 164
 165         # Group the variants to send to the solver
 166         my @groups;
 167         my @use_ranks;
 168         my %lacunae;
 169         my $moved = {};
 170         foreach my $rank ( @ranks ) {
 171                 my $missing = [ @lacunose ];
 172                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
 173                 # Filter out any empty rankgroups
 174                 # (e.g. from the later rank for a transposition)
 175                 next unless keys %$rankgroup;
 176                 if( $opts{'exclude_type1'} ) {
 177                         # Check to see whether this is a "useful" group.
 178                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 179                                 $stemma->graph, $c->ac_label );
 180                         next unless @$rdgs;
 181                 }
 182                 push( @use_ranks, $rank );
 183                 push( @groups, $rankgroup );
 184                 $lacunae{$rank} = $missing;
 185         }
 186         # Run the solver
 187         my $answer = solve_variants( $stemma, @groups );
 188
 189         # Do further analysis on the answer
 190         my $conflict_count = 0;
 191         my $aclabel = $c->ac_label;
 192         foreach my $idx ( 0 .. $#use_ranks ) {
 193                 my $location = $answer->{'variants'}->[$idx];
 194                 # Add the rank back in
 195                 my $rank = $use_ranks[$idx];
 196                 $location->{'id'} = $rank;
 197                 # Note what our lacunae are
 198                 my %lmiss;
 199                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 200                 $location->{'missing'} = [ keys %lmiss ];
 201
 202                 # Run the extra analysis we need.
 203                 analyze_location( $tradition, $stemma, $location, \%lmiss );
 204
 205                 my @layerwits;
 206                 # Do the final post-analysis tidying up of the data.
 207                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 208                         $conflict_count++
 209                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 210                         # Add the reading text back in, setting display value as needed
 211                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 212                         if( $rdg ) {
 213                                 $rdghash->{'text'} = $rdg->text .
 214                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 215                         }
 216                         # Remove lacunose witnesses from this reading's list now that the
 217                         # analysis is done
 218                         my @realgroup;
 219                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 220                         $rdghash->{'group'} = \@realgroup;
 221                         # Note any layered witnesses that appear in this group
 222                         foreach( @realgroup ) {
 223                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 224                                         push( @layerwits, $1 );
 225                                 }
 226                         }
 227                 }
 228                 $location->{'layerwits'} = \@layerwits if @layerwits;
 229         }
 230         $answer->{'conflict_count'} = $conflict_count;
 231
 232         return $answer;
 233 }
 234
 235 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 236
 237 Groups the variants at the given $rank of the collation, treating any
 238 relationships in @merge_relationship_types as equivalent.  $lacunose should
 239 be a reference to an array, to which the sigla of lacunose witnesses at this
 240 rank will be appended; $transposed should be a reference to a hash, wherein
 241 the identities of transposed readings and their relatives will be stored.
 242
 243 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 244 in $group_readings->{$rdg}.
 245
 246 =cut
 247
 248 # Return group_readings, groups, lacunose
 249 sub group_variants {
 250         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 251         my $c = $tradition->collation;
 252         my $aclabel = $c->ac_label;
 253         # Get the alignment table readings
 254         my %readings_at_rank;
 255         my %is_lacunose; # lookup table for $lacunose
 256         map { $is_lacunose{$_} = 1 } @$lacunose;
 257         my @check_for_gaps;
 258         my %moved_wits;
 259         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 260                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 261                 my $wit = $tablewit->{'witness'};
 262                 # Exclude the witness if it is "lacunose" which if we got here
 263                 # means "not in the stemma".
 264                 next if $is_lacunose{$wit};
 265                 # Note if the witness is actually in a lacuna
 266                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 267                         _add_to_witlist( $wit, $lacunose, $aclabel );
 268                 # Otherwise the witness either has a positive reading...
 269                 } elsif( $rdg ) {
 270                         # If the reading has been counted elsewhere as a transposition, ignore it.
 271                         if( $transposed->{$rdg->{'t'}->id} ) {
 272                                 # TODO This doesn't cope with three-way transpositions
 273                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 274                                 next;
 275                         }
 276                         # Otherwise, record it...
 277                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 278                         # ...and grab any transpositions, and their relations.
 279                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 280                         foreach my $trdg ( @transp ) {
 281                                 map { $moved_wits{$_} = 1 } $trdg->witnesses;
 282                                 $transposed->{$trdg->id} = [ $rdg->{'t'}->witnesses ];
 283                                 $readings_at_rank{$trdg->id} = $trdg;
 284                         }
 285                 # ...or it is empty, ergo a gap.
 286                 } else {
 287                         push( @check_for_gaps, $wit );
 288                 }
 289         }
 290         my @gap_wits;
 291         map { _add_to_witlist( $_, \@gap_wits, $aclabel )
 292                 unless $moved_wits{$_} } @check_for_gaps;
 293         # TODO check for, and break into a new row, any doubled-up witness readings
 294         #    after transposition...
 295         # Group the readings, collapsing groups by relationship if needed
 296         my %grouped_readings;
 297         foreach my $rdg ( values %readings_at_rank ) {
 298                 # Skip readings that have been collapsed into others.
 299                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 300                 # Get the witness list, including from readings collapsed into this one.
 301                 my @wits = $rdg->witnesses;
 302                 if( $collapse ) {
 303                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 304                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 305                                 my @otherwits = $other->witnesses;
 306                                 push( @wits, @otherwits );
 307                                 $grouped_readings{$other->id} = 0;
 308                         }
 309                 }
 310                 # Filter the group to those witnesses in the stemma
 311                 my @use_wits;
 312                 foreach my $wit ( @wits ) {
 313                         next if $is_lacunose{$wit};
 314                         push( @use_wits, $wit );
 315                 }
 316                 $grouped_readings{$rdg->id} = \@use_wits;
 317         }
 318         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 319         # Get rid of our collapsed readings
 320         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 321                 keys %grouped_readings
 322                 if $collapse;
 323
 324         # Return the result
 325         return \%grouped_readings;
 326 }
 327
 328 # Helper function to ensure that X and X a.c. never appear in the same list.
 329 sub _add_to_witlist {
 330         my( $wit, $list, $acstr ) = @_;
 331         my %inlist;
 332         my $idx = 0;
 333         map { $inlist{$_} = $idx++ } @$list;
 334         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 335                 my $acwit = $1;
 336                 unless( exists $inlist{$acwit} ) {
 337                         push( @$list, $acwit.$acstr );
 338                 }
 339         } else {
 340                 if( exists( $inlist{$wit.$acstr} ) ) {
 341                         # Replace the a.c. version with the main witness
 342                         my $i = $inlist{$wit.$acstr};
 343                         $list->[$i] = $wit;
 344                 } else {
 345                         push( @$list, $wit );
 346                 }
 347         }
 348 }
 349
 350 =head2 solve_variants( $graph, @groups )
 351
 352 Sends the set of groups to the external graph solver service and returns
 353 a cleaned-up answer, adding the rank IDs back where they belong.
 354
 355 The JSON has the form
 356   { "graph": [ stemmagraph DOT string without newlines ],
 357     "groupings": [ array of arrays of groups, one per rank ] }
 358
 359 The answer has the form
 360   { "variants" => [ array of variant location structures ],
 361     "variant_count" => total,
 362     "conflict_count" => number of conflicts detected,
 363     "genealogical_count" => number of solutions found }
 364
 365 =cut
 366
 367 sub solve_variants {
 368         my( $stemma, @groups ) = @_;
 369         my $aclabel = $stemma->collation->ac_label;
 370
 371         # Filter the groups down to distinct groups, and work out what graph
 372         # should be used in the calculation of each group. We want to send each
 373         # distinct problem to the solver only once.
 374         # We need a whole bunch of lookup tables for this.
 375         my $index_groupkeys = {};       # Save the order of readings
 376         my $group_indices = {};         # Save the indices that have a given grouping
 377         my $graph_problems = {};        # Save the groupings for the given graph
 378
 379         foreach my $idx ( 0..$#groups ) {
 380                 my $ghash = $groups[$idx];
 381                 my @grouping;
 382                 # Sort the groupings from big to little, and scan for a.c. witnesses
 383                 # that would need an extended graph.
 384                 my @acwits;   # note which AC witnesses crop up at this rank
 385                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 386                         keys %$ghash;
 387                 foreach my $rdg ( @idxkeys ) {
 388                         my @sg = sort @{$ghash->{$rdg}};
 389                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 390                         push( @grouping, \@sg );
 391                 }
 392                 # Save the reading order
 393                 $index_groupkeys->{$idx} = \@idxkeys;
 394
 395                 # Now associate the distinct group with this index
 396                 my $gstr = wit_stringify( \@grouping );
 397                 push( @{$group_indices->{$gstr}}, $idx );
 398
 399                 # Finally, add the group to the list to be calculated for this graph.
 400                 map { s/\Q$aclabel\E$// } @acwits;
 401                 my $graph = $stemma->extend_graph( \@acwits );
 402                 unless( exists $graph_problems->{"$graph"} ) {
 403                         $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
 404                 }
 405                 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
 406         }
 407
 408         ## For each distinct graph, send its groups to the solver.
 409         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 410         my $ua = LWP::UserAgent->new();
 411         ## Witness map is a HACK to get around limitations in node names from IDP
 412         my $witness_map = {};
 413         ## Variables to store answers as they come back
 414         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 415         my $genealogical = 0;
 416         foreach my $graphkey ( keys %$graph_problems ) {
 417                 my $graph = $graph_problems->{$graphkey}->{'object'};
 418                 my $groupings = $graph_problems->{$graphkey}->{'groups'};
 419                 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
 420                         $groupings, $witness_map ) );
 421                 # Send it off and get the result
 422                 #print STDERR "Sending request: $json\n";
 423                 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 424                                                           'Content' => $json );
 425                 my $answer;
 426                 my $used_idp;
 427                 if( $resp->is_success ) {
 428                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 429                         $used_idp = 1;
 430                 } else {
 431                         # Fall back to the old method.
 432                         warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 433                                 . "; falling back to perl method";
 434                         $answer = perl_solver( $graph, @$groupings );
 435                 }
 436                 ## The answer is the evaluated groupings, plus a boolean for whether
 437                 ## they were genealogical.  Reconstruct our original groups.
 438                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 439                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 440                         if( $result ) {
 441                                 $genealogical++;
 442                                 # Prune the calculated groups, in case the IDP solver failed to.
 443                                 if( $used_idp ) {
 444                                         my @pruned_groups;
 445                                         foreach my $cg ( @$calc_groups ) {
 446                                                 # This is a little wasteful but the path of least
 447                                                 # resistance. Send both the stemma, which knows what
 448                                                 # its hypotheticals are, and the actual graph used.
 449                                                 my @pg = _prune_group( $cg, $stemma, $graph );
 450                                                 push( @pruned_groups, \@pg );
 451                                         }
 452                                         $calc_groups = \@pruned_groups;
 453                                 }
 454                         }
 455                         # Retrieve the key for the original group that went to the solver
 456                         my $input_group = wit_stringify( $groupings->[$gidx] );
 457                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 458                                 my @readings = @{$index_groupkeys->{$oidx}};
 459                                 my $vstruct = {
 460                                         'genealogical' => $result,
 461                                         'readings' => [],
 462                                 };
 463                                 foreach my $ridx ( 0 .. $#readings ) {
 464                                         push( @{$vstruct->{'readings'}},
 465                                                 { 'readingid' => $readings[$ridx],
 466                                                   'group' => $calc_groups->[$ridx] } );
 467                                 }
 468                                 $variants->[$oidx] = $vstruct;
 469                         }
 470                 }
 471         }
 472
 473         return { 'variants' => $variants,
 474                          'variant_count' => scalar @$variants,
 475                          'genealogical_count' => $genealogical };
 476 }
 477
 478 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 479
 480 sub _safe_wit_strings {
 481         my( $graph, $c, $groupings, $witness_map ) = @_;
 482         # Parse the graph we were given into a stemma.
 483         my $safegraph = Graph->new();
 484         # Convert the graph to a safe representation and store the conversion.
 485         foreach my $n ( $graph->vertices ) {
 486                 my $sn = _safe_witstr( $n );
 487                 if( exists $witness_map->{$sn} ) {
 488                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 489                                 if $witness_map->{$sn} ne $n;
 490                 } else {
 491                         $witness_map->{$sn} = $n;
 492                 }
 493                 $safegraph->add_vertex( $sn );
 494                 $safegraph->set_vertex_attributes( $sn,
 495                         $graph->get_vertex_attributes( $n ) );
 496         }
 497         foreach my $e ( $graph->edges ) {
 498                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 499                 $safegraph->add_edge( @safe_e );
 500         }
 501         my $safe_stemma = Text::Tradition::Stemma->new(
 502                 'collation' => $c, 'graph' => $safegraph );
 503
 504         # Now convert the witness groupings to a safe representation.
 505         my $safe_groupings = [];
 506         foreach my $grouping ( @$groupings ) {
 507                 my $safe_grouping = [];
 508                 foreach my $group ( @$grouping ) {
 509                         my $safe_group = [];
 510                         foreach my $n ( @$group ) {
 511                                 my $sn = _safe_witstr( $n );
 512                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 513                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 514                                 $witness_map->{$sn} = $n;
 515                                 push( @$safe_group, $sn );
 516                         }
 517                         push( @$safe_grouping, $safe_group );
 518                 }
 519                 push( @$safe_groupings, $safe_grouping );
 520         }
 521
 522         # Return it all in the struct we expect.  We have stored the reductions
 523         # in the $witness_map that we were passed.
 524         return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
 525                          'groupings' => $safe_groupings };
 526 }
 527
 528 sub _safe_witstr {
 529         my $witstr = shift;
 530         $witstr =~ s/\s+/_/g;
 531         $witstr =~ s/[^\w\d-]//g;
 532         return $witstr;
 533 }
 534
 535 sub _desanitize_names {
 536         my( $jsonstruct, $witness_map ) = @_;
 537         my $result = [];
 538         foreach my $grouping ( @$jsonstruct ) {
 539                 my $real_grouping = [];
 540                 foreach my $element ( @$grouping ) {
 541                         if( ref( $element ) eq 'ARRAY' ) {
 542                                 # it's the groupset.
 543                                 my $real_groupset = [];
 544                                 foreach my $group ( @$element ) {
 545                                         my $real_group = [];
 546                                         foreach my $n ( @$group ) {
 547                                                 my $rn = $witness_map->{$n};
 548                                                 push( @$real_group, $rn );
 549                                         }
 550                                         push( @$real_groupset, $real_group );
 551                                 }
 552                                 push( @$real_grouping, $real_groupset );
 553                         } else {
 554                                 # It is the boolean, not actually a group.
 555                                 push( @$real_grouping, $element );
 556                         }
 557                 }
 558                 push( @$result, $real_grouping );
 559         }
 560         return $result;
 561 }
 562
 563 ### END HACKERY ###
 564
 565 =head2 analyze_location ( $tradition, $graph, $location_hash )
 566
 567 Given the tradition, its stemma graph, and the solution from the graph solver,
 568 work out the rest of the information we want.  For each reading we need missing,
 569 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 570
 571 =cut
 572
 573 sub analyze_location {
 574         my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
 575         my $c = $tradition->collation;
 576
 577         # Make a hash of all known node memberships, and make the subgraphs.
 578         my $contig = {};
 579         my $reading_roots = {};
 580         my $subgraph = {};
 581         my $acstr = $c->ac_label;
 582         my @acwits;
 583         $DB::single = 1 if $variant_row->{id} == 87;
 584         # Note which witnesses positively belong to which group
 585     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 586         my $rid = $rdghash->{'readingid'};
 587         foreach my $wit ( @{$rdghash->{'group'}} ) {
 588                 $contig->{$wit} = $rid;
 589             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 590                 push( @acwits, $1 );
 591             }
 592         }
 593         }
 594
 595         # Get the actual graph we should work with
 596         my $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
 597
 598         # Now, armed with that knowledge, make a subgraph for each reading
 599         # and note the root(s) of each subgraph.
 600         foreach my $rdghash( @{$variant_row->{'readings'}} ) {
 601         my $rid = $rdghash->{'readingid'};
 602         my %rdgwits;
 603         # Make the subgraph.
 604         my $part = $graph->copy;
 605         my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
 606                 keys %$contig;
 607         $part->delete_vertices( @todelete );
 608         _prune_subtree( $part, $lacunose );
 609                 $subgraph->{$rid} = $part;
 610                 # Record the remaining lacunose nodes as part of this group, if
 611                 # we are dealing with a non-genealogical reading.
 612                 unless( $variant_row->{'genealogical'} ) {
 613                         map { $contig->{$_} = $rid } $part->vertices;
 614                 }
 615                 # Get the reading roots.
 616                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 617         }
 618
 619         # Now that we have all the node group memberships, calculate followed/
 620     # non-followed/unknown values for each reading.  Also figure out the
 621     # reading's evident parent(s).
 622     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 623         my $rid = $rdghash->{'readingid'};
 624         # Get the subgraph
 625         my $part = $subgraph->{$rid};
 626
 627         # Start figuring things out.
 628         my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
 629         $rdghash->{'independent_occurrence'} = \@roots;
 630         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 631         # Find the parent readings, if any, of this reading.
 632         my $rdgparents = {};
 633         foreach my $wit ( @roots ) {
 634                 # Look in the main stemma to find this witness's extant or known-reading
 635                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 636                         my @check = $graph->predecessors( $wit );
 637                         while( @check ) {
 638                                 my @next;
 639                                 foreach my $wparent( @check ) {
 640                                         my $preading = $contig->{$wparent};
 641                                         if( $preading ) {
 642                                                 $rdgparents->{$preading} = 1;
 643                                         } else {
 644                                                 push( @next, $graph->predecessors( $wparent ) );
 645                                         }
 646                                 }
 647                                 @check = @next;
 648                         }
 649                 }
 650                 foreach my $p ( keys %$rdgparents ) {
 651                         # Resolve the relationship of the parent to the reading, and
 652                         # save it in our hash.
 653                         my $pobj = $c->reading( $p );
 654                         my $relation;
 655                         my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 656                         if( $pobj ) {
 657                                 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 658                                 if( $rel ) {
 659                                         $relation = { type => $rel->type };
 660                                         if( $rel->has_annotation ) {
 661                                                 $relation->{'annotation'} = $rel->annotation;
 662                                         }
 663                                 }
 664                         }
 665                         $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
 666                 }
 667
 668                 $rdghash->{'reading_parents'} = $rdgparents;
 669
 670                 # Find the number of times this reading was altered, and the number of
 671                 # times we're not sure.
 672                 my( %nofollow, %unknownfollow );
 673                 foreach my $wit ( $part->vertices ) {
 674                         foreach my $wchild ( $graph->successors( $wit ) ) {
 675                                 next if $part->has_vertex( $wchild );
 676                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 677                                         # It definitely changed here.
 678                                         $nofollow{$wchild} = 1;
 679                                 } elsif( !($contig->{$wchild}) ) {
 680                                         # The child is a hypothetical node not definitely in
 681                                         # any group. Answer is unknown.
 682                                         $unknownfollow{$wchild} = 1;
 683                                 } # else it's a non-root node in a known group, and therefore
 684                                   # is presumed to have its reading from its group, not this link.
 685                         }
 686                 }
 687                 $rdghash->{'not_followed'} = keys %nofollow;
 688                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 689
 690                 # Now say whether this reading represents a conflict.
 691                 unless( $variant_row->{'genealogical'} ) {
 692                         $rdghash->{'conflict'} = @roots != 1;
 693                 }
 694     }
 695 }
 696
 697
 698 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 699
 700 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 701 to be correct.  Serves as a backup for the real solver.
 702
 703 Runs an analysis of the given tradition, at the location given in $rank,
 704 against the graph of the stemma specified in $stemma_id.  The argument
 705 @merge_relationship_types is an optional list of relationship types for
 706 which readings so related should be treated as equivalent.
 707
 708 Returns a nested array data structure as follows:
 709
 710  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 711
 712 where the group list is the array of arrays passed in for each element of @groups,
 713 possibly with the addition of hypothetical readings.
 714
 715
 716 =cut
 717
 718 sub perl_solver {
 719         my( $graph, @groups ) = @_;
 720         my @answer;
 721         foreach my $g ( @groups ) {
 722                 push( @answer, _solve_variant_location( $graph, $g ) );
 723         }
 724         return \@answer;
 725 }
 726
 727 sub _solve_variant_location {
 728         my( $graph, $groups ) = @_;
 729         # Now do the work.
 730     my $contig = {};
 731     my $subgraph = {};
 732     my $is_conflicted;
 733     my $conflict = {};
 734
 735     # Mark each ms as in its own group, first.
 736     foreach my $g ( @$groups ) {
 737         my $gst = wit_stringify( $g );
 738         map { $contig->{$_} = $gst } @$g;
 739     }
 740
 741     # Now for each unmarked node in the graph, initialize an array
 742     # for possible group memberships.  We will use this later to
 743     # resolve potential conflicts.
 744     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 745     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 746         my $gst = wit_stringify( $g );  # This is the group name
 747         # Copy the graph, and delete all non-members from the new graph.
 748         my $part = $graph->copy;
 749         my @group_roots;
 750         $part->delete_vertices(
 751             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 752
 753         # Now look to see if our group is connected.
 754                 if( @$g > 1 ) {
 755                         # We have to take directionality into account.
 756                         # How many root nodes do we have?
 757                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 758                                 $part->predecessorless_vertices;
 759                         # Assuming that @$g > 1, find the first root node that has at
 760                         # least one successor belonging to our group. If this reading
 761                         # is genealogical, there should be only one, but we will check
 762                         # that implicitly later.
 763                         foreach my $root ( @roots ) {
 764                                 # Prune the tree to get rid of extraneous hypotheticals.
 765                                 $root = _prune_subtree_old( $part, $root, $contig );
 766                                 next unless $root;
 767                                 # Save this root for our group.
 768                                 push( @group_roots, $root );
 769                                 # Get all the successor nodes of our root.
 770                         }
 771                 } else {
 772                         # Dispense with the trivial case of one reading.
 773                         my $wit = $g->[0];
 774                         @group_roots = ( $wit );
 775                         foreach my $v ( $part->vertices ) {
 776                                 $part->delete_vertex( $v ) unless $v eq $wit;
 777                         }
 778         }
 779
 780         if( @group_roots > 1 ) {
 781                 $conflict->{$gst} = 1;
 782                 $is_conflicted = 1;
 783         }
 784         # Paint the 'hypotheticals' with our group.
 785                 foreach my $wit ( $part->vertices ) {
 786                         if( ref( $contig->{$wit} ) ) {
 787                                 push( @{$contig->{$wit}}, $gst );
 788                         } elsif( $contig->{$wit} ne $gst ) {
 789                                 warn "How did we get here?";
 790                         }
 791                 }
 792
 793
 794                 # Save the relevant subgraph.
 795                 $subgraph->{$gst} = $part;
 796     }
 797
 798         # For each of our hypothetical readings, flatten its 'contig' array if
 799         # the array contains zero or one group.  If we have any unflattened arrays,
 800         # we may need to run the resolution process. If the reading is already known
 801         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 802         # it.
 803         my @resolve;
 804         foreach my $wit ( keys %$contig ) {
 805                 next unless ref( $contig->{$wit} );
 806                 if( @{$contig->{$wit}} > 1 ) {
 807                         if( $is_conflicted ) {
 808                                 $contig->{$wit} = '';  # We aren't going to decide.
 809                         } else {
 810                                 push( @resolve, $wit );
 811                         }
 812                 } else {
 813                         my $gst = pop @{$contig->{$wit}};
 814                         $contig->{$wit} = $gst || '';
 815                 }
 816         }
 817
 818     if( @resolve ) {
 819         my $still_contig = {};
 820         foreach my $h ( @resolve ) {
 821             # For each of the hypothetical readings with more than one possibility,
 822             # try deleting it from each of its member subgraphs in turn, and see
 823             # if that breaks the contiguous grouping.
 824             # TODO This can still break in a corner case where group A can use
 825             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 826             # Revisit this if necessary; it could get brute-force nasty.
 827             foreach my $gst ( @{$contig->{$h}} ) {
 828                 my $gpart = $subgraph->{$gst}->copy();
 829                 # If we have come this far, there is only one root and everything
 830                 # is reachable from it.
 831                 my( $root ) = $gpart->predecessorless_vertices;
 832                 my $reachable = {};
 833                 map { $reachable->{$_} = 1 } $gpart->vertices;
 834
 835                 # Try deleting the hypothetical node.
 836                 $gpart->delete_vertex( $h );
 837                 if( $h eq $root ) {
 838                         # See if we still have a single root.
 839                         my @roots = $gpart->predecessorless_vertices;
 840                         warn "This shouldn't have happened" unless @roots;
 841                         if( @roots > 1 ) {
 842                                 # $h is needed by this group.
 843                                 if( exists( $still_contig->{$h} ) ) {
 844                                         # Conflict!
 845                                         $conflict->{$gst} = 1;
 846                                         $still_contig->{$h} = '';
 847                                 } else {
 848                                         $still_contig->{$h} = $gst;
 849                                 }
 850                         }
 851                 } else {
 852                         # $h is somewhere in the middle. See if everything
 853                         # else can still be reached from the root.
 854                                         my %still_reachable = ( $root => 1 );
 855                                         map { $still_reachable{$_} = 1 }
 856                                                 $gpart->all_successors( $root );
 857                                         foreach my $v ( keys %$reachable ) {
 858                                                 next if $v eq $h;
 859                                                 if( !$still_reachable{$v}
 860                                                         && ( $contig->{$v} eq $gst
 861                                                                  || ( exists $still_contig->{$v}
 862                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 863                                                         # We need $h.
 864                                                         if( exists $still_contig->{$h} ) {
 865                                                                 # Conflict!
 866                                                                 $conflict->{$gst} = 1;
 867                                                                 $still_contig->{$h} = '';
 868                                                         } else {
 869                                                                 $still_contig->{$h} = $gst;
 870                                                         }
 871                                                         last;
 872                                                 } # else we don't need $h in this group.
 873                                         } # end foreach $v
 874                                 } # endif $h eq $root
 875             } # end foreach $gst
 876         } # end foreach $h
 877
 878         # Now we have some hypothetical vertices in $still_contig that are the
 879         # "real" group memberships.  Replace these in $contig.
 880                 foreach my $v ( keys %$contig ) {
 881                         next unless ref $contig->{$v};
 882                         $contig->{$v} = $still_contig->{$v};
 883                 }
 884     } # end if @resolve
 885
 886     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 887         my $variant_row = [ [], $is_genealogical ];
 888         # Fill in the groupings from $contig.
 889         foreach my $g ( @$groups ) {
 890         my $gst = wit_stringify( $g );
 891         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 892         push( @{$variant_row->[0]}, \@realgroup );
 893     }
 894     return $variant_row;
 895 }
 896
 897 sub _prune_group {
 898         my( $group, $stemma, $graph ) = @_;
 899         my $lacunose = {};
 900         map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
 901         map { $lacunose->{$_} = 0 } @$group;
 902         # Make our subgraph
 903         my $subgraph = $graph->copy;
 904         map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
 905                 $subgraph->vertices;
 906         # ...and find the root.
 907         # Now prune and return the remaining vertices.
 908         _prune_subtree( $subgraph, $lacunose );
 909         return $subgraph->vertices;
 910 }
 911
 912 sub _prune_subtree {
 913         my( $tree, $lacunose ) = @_;
 914
 915         # Delete lacunose witnesses that have no successors
 916     my @orphan_hypotheticals;
 917     my $ctr = 0;
 918     do {
 919         die "Infinite loop on leaves" if $ctr > 100;
 920         @orphan_hypotheticals = grep { $lacunose->{$_} }
 921                 $tree->successorless_vertices;
 922         $tree->delete_vertices( @orphan_hypotheticals );
 923         $ctr++;
 924     } while( @orphan_hypotheticals );
 925
 926         # Delete lacunose roots that have a single successor
 927         my @redundant_root;
 928         $ctr = 0;
 929         do {
 930         die "Infinite loop on roots" if $ctr > 100;
 931                 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
 932                         $tree->predecessorless_vertices;
 933                 $tree->delete_vertices( @redundant_root );
 934                 $ctr++;
 935         } while( @redundant_root );
 936 }
 937
 938 sub _prune_subtree_old {
 939     my( $tree, $root, $contighash ) = @_;
 940     # First, delete hypothetical leaves / orphans until there are none left.
 941     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 942         $tree->successorless_vertices;
 943     while( @orphan_hypotheticals ) {
 944         $tree->delete_vertices( @orphan_hypotheticals );
 945         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 946             $tree->successorless_vertices;
 947     }
 948     # Then delete a hypothetical root with only one successor, moving the
 949     # root to the first child that has no other predecessors.
 950     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 951         my @nextroot = $tree->successors( $root );
 952         $tree->delete_vertex( $root );
 953         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 954     }
 955     # The tree has been modified in place, but we need to know the new root.
 956     $root = undef unless $root && $tree->has_vertex( $root );
 957     return $root;
 958 }
 959 # Add the variant, subject to a.c. representation logic.
 960 # This assumes that we will see the 'main' version before the a.c. version.
 961 sub add_variant_wit {
 962     my( $arr, $wit, $acstr ) = @_;
 963     my $skip;
 964     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 965         my $real = $1;
 966         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 967     }
 968     push( @$arr, $wit ) unless $skip;
 969 }
 970
 971 sub _useful_variant {
 972         my( $group_readings, $graph, $acstr ) = @_;
 973
 974         # TODO Decide what to do with AC witnesses
 975
 976         # Sort by group size and return
 977         my $is_useful = 0;
 978         my( @readings, @groups );   # The sorted groups for our answer.
 979         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 980                 keys %$group_readings ) {
 981                 push( @readings, $rdg );
 982                 push( @groups, $group_readings->{$rdg} );
 983                 if( @{$group_readings->{$rdg}} > 1 ) {
 984                         $is_useful++;
 985                 } else {
 986                         my( $wit ) = @{$group_readings->{$rdg}};
 987                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 988                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 989                 }
 990         }
 991         if( $is_useful > 1 ) {
 992                 return( \@readings, \@groups );
 993         } else {
 994                 return( [], [] );
 995         }
 996 }
 997
 998 =head2 wit_stringify( $groups )
 999
1000 Takes an array of witness groupings and produces a string like
1001 ['A','B'] / ['C','D','E'] / ['F']
1002
1003 =cut
1004
1005 sub wit_stringify {
1006     my $groups = shift;
1007     my @gst;
1008     # If we were passed an array of witnesses instead of an array of
1009     # groupings, then "group" the witnesses first.
1010     unless( ref( $groups->[0] ) ) {
1011         my $mkgrp = [ $groups ];
1012         $groups = $mkgrp;
1013     }
1014     foreach my $g ( @$groups ) {
1015         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1016     }
1017     return join( ' / ', @gst );
1018 }
1019
1020 sub _symmdiff {
1021         my( $lista, $listb ) = @_;
1022         my %union;
1023         my %scalars;
1024         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1025         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1026         my @set = grep { $union{$_} == 1 } keys %union;
1027         return map { $scalars{$_} } @set;
1028 }
1029
1030 1;
1031
1032 =head1 LICENSE
1033
1034 This package is free software and is provided "as is" without express
1035 or implied warranty.  You can redistribute it and/or modify it under
1036 the same terms as Perl itself.
1037
1038 =head1 AUTHOR
1039
1040 Tara L Andrews E<lt>aurum@cpan.orgE<gt>