lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 my $c = $tradition->collation;
 108 foreach my $row ( @{$data->{'variants'}} ) {
 109         # Account for rows that used to be "not useful"
 110         unless( exists $expected_genealogical{$row->{'id'}} ) {
 111                 $expected_genealogical{$row->{'id'}} = 1;
 112         }
 113         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 114         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 115                 "Got correct genealogical flag for row " . $row->{'id'} );
 116         # Check that we have the right row with the right groups
 117         my $rank = $row->{'id'};
 118         foreach my $rdghash ( @{$row->{'readings'}} ) {
 119                 # Skip 'readings' that aren't really
 120                 next unless $c->reading( $rdghash->{'readingid'} );
 121                 # Check the rank
 122                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 123                         "Got correct reading rank" );
 124                 # Check the witnesses
 125                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 126                 my @sgrp = sort @{$rdghash->{'group'}};
 127                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 128         }
 129 }
 130 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 131 # TODO Make something meaningful of conflict count, maybe test other bits
 132
 133 =end testing
 134
 135 =cut
 136
 137 sub run_analysis {
 138         my( $tradition, %opts ) = @_;
 139         my $c = $tradition->collation;
 140
 141         my $stemma_id = $opts{'stemma_id'} || 0;
 142         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 143         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 144
 145         # Get the stemma
 146         my $stemma = $tradition->stemma( $stemma_id );
 147
 148         # Figure out which witnesses we are working with - that is, the ones that
 149         # appear both in the stemma and in the tradition. All others are 'lacunose'
 150         # for our purposes.
 151         my @lacunose = $stemma->hypotheticals;
 152         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 153         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 154
 155         # Find and mark 'common' ranks for exclusion, unless they were
 156         # explicitly specified.
 157         unless( @ranks ) {
 158                 my %common_rank;
 159                 foreach my $rdg ( $c->common_readings ) {
 160                         $common_rank{$rdg->rank} = 1;
 161                 }
 162                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 163         }
 164
 165         # Group the variants to send to the solver
 166         my @groups;
 167         my @use_ranks;
 168         my %lacunae;
 169         my $moved = {};
 170         foreach my $rank ( @ranks ) {
 171                 my $missing = [ @lacunose ];
 172                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
 173                 # Filter out any empty rankgroups
 174                 # (e.g. from the later rank for a transposition)
 175                 next unless keys %$rankgroup;
 176                 if( $opts{'exclude_type1'} ) {
 177                         # Check to see whether this is a "useful" group.
 178                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 179                                 $stemma->graph, $c->ac_label );
 180                         next unless @$rdgs;
 181                 }
 182                 push( @use_ranks, $rank );
 183                 push( @groups, $rankgroup );
 184                 $lacunae{$rank} = $missing;
 185         }
 186         # Run the solver
 187         my $answer = solve_variants( $stemma, @groups );
 188
 189         # Do further analysis on the answer
 190         my $conflict_count = 0;
 191         my $aclabel = $c->ac_label;
 192         foreach my $idx ( 0 .. $#use_ranks ) {
 193                 my $location = $answer->{'variants'}->[$idx];
 194                 # Add the rank back in
 195                 my $rank = $use_ranks[$idx];
 196                 $location->{'id'} = $rank;
 197                 # Note what our lacunae are
 198                 my %lmiss;
 199                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 200                 $location->{'missing'} = [ keys %lmiss ];
 201
 202                 # Run the extra analysis we need.
 203                 analyze_location( $tradition, $stemma->graph, $location, \%lmiss );
 204
 205                 # Do the final post-analysis tidying up of the data.
 206                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 207                         $conflict_count++
 208                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 209                         # Add the reading text back in, setting display value as needed
 210                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 211                         if( $rdg ) {
 212                                 $rdghash->{'text'} = $rdg->text .
 213                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 214                         }
 215                         # Remove lacunose witnesses from this reading's list now that the
 216                         # analysis is done
 217                         my @realgroup;
 218                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 219                         $rdghash->{'group'} = \@realgroup;
 220                         # TODO Record hypotheticals used to create group, if we end up
 221                         # needing it
 222                 }
 223         }
 224         $answer->{'conflict_count'} = $conflict_count;
 225
 226         return $answer;
 227 }
 228
 229 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 230
 231 Groups the variants at the given $rank of the collation, treating any
 232 relationships in @merge_relationship_types as equivalent.  $lacunose should
 233 be a reference to an array, to which the sigla of lacunose witnesses at this
 234 rank will be appended; $transposed should be a reference to a hash, wherein
 235 the identities of transposed readings and their relatives will be stored.
 236
 237 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 238 in $group_readings->{$rdg}.
 239
 240 =cut
 241
 242 # Return group_readings, groups, lacunose
 243 sub group_variants {
 244         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 245         my $c = $tradition->collation;
 246         my $aclabel = $c->ac_label;
 247         # Get the alignment table readings
 248         my %readings_at_rank;
 249         my %is_lacunose; # lookup table for $lacunose
 250         map { $is_lacunose{$_} = 1 } @$lacunose;
 251         my @check_for_gaps;
 252         my %moved_wits;
 253         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 254                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 255                 my $wit = $tablewit->{'witness'};
 256                 # Exclude the witness if it is "lacunose" which if we got here
 257                 # means "not in the stemma".
 258                 next if $is_lacunose{$wit};
 259                 # Note if the witness is actually in a lacuna
 260                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 261                         _add_to_witlist( $wit, $lacunose, $aclabel );
 262                 # Otherwise the witness either has a positive reading...
 263                 } elsif( $rdg ) {
 264                         # If the reading has been counted elsewhere as a transposition, ignore it.
 265                         if( $transposed->{$rdg->{'t'}->id} ) {
 266                                 # TODO This doesn't cope with three-way transpositions
 267                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 268                                 next;
 269                         }
 270                         # Otherwise, record it...
 271                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 272                         # ...and grab any transpositions, and their relations.
 273                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 274                         foreach my $trdg ( @transp ) {
 275                                 map { $moved_wits{$_} = 1 } $trdg->witnesses;
 276                                 $transposed->{$trdg->id} = [ $rdg->{'t'}->witnesses ];
 277                                 $readings_at_rank{$trdg->id} = $trdg;
 278                         }
 279                 # ...or it is empty, ergo a gap.
 280                 } else {
 281                         push( @check_for_gaps, $wit );
 282                 }
 283         }
 284         my @gap_wits;
 285         map { _add_to_witlist( $_, \@gap_wits, $aclabel )
 286                 unless $moved_wits{$_} } @check_for_gaps;
 287         # TODO check for, and break into a new row, any doubled-up witness readings
 288         #    after transposition...
 289         # Group the readings, collapsing groups by relationship if needed
 290         my %grouped_readings;
 291         foreach my $rdg ( values %readings_at_rank ) {
 292                 # Skip readings that have been collapsed into others.
 293                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 294                 # Get the witness list, including from readings collapsed into this one.
 295                 my @wits = $rdg->witnesses;
 296                 if( $collapse ) {
 297                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 298                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 299                                 my @otherwits = $other->witnesses;
 300                                 push( @wits, @otherwits );
 301                                 $grouped_readings{$other->id} = 0;
 302                         }
 303                 }
 304                 # Filter the group to those witnesses in the stemma
 305                 my @use_wits;
 306                 foreach my $wit ( @wits ) {
 307                         next if $is_lacunose{$wit};
 308                         push( @use_wits, $wit );
 309                 }
 310                 $grouped_readings{$rdg->id} = \@use_wits;
 311         }
 312         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 313         # Get rid of our collapsed readings
 314         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 315                 keys %grouped_readings
 316                 if $collapse;
 317
 318         # Return the result
 319         return \%grouped_readings;
 320 }
 321
 322 # Helper function to ensure that X and X a.c. never appear in the same list.
 323 sub _add_to_witlist {
 324         my( $wit, $list, $acstr ) = @_;
 325         my %inlist;
 326         my $idx = 0;
 327         map { $inlist{$_} = $idx++ } @$list;
 328         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 329                 my $acwit = $1;
 330                 unless( exists $inlist{$acwit} ) {
 331                         push( @$list, $acwit.$acstr );
 332                 }
 333         } else {
 334                 if( exists( $inlist{$wit.$acstr} ) ) {
 335                         # Replace the a.c. version with the main witness
 336                         my $i = $inlist{$wit.$acstr};
 337                         $list->[$i] = $wit;
 338                 } else {
 339                         push( @$list, $wit );
 340                 }
 341         }
 342 }
 343
 344 =head2 solve_variants( $graph, @groups )
 345
 346 Sends the set of groups to the external graph solver service and returns
 347 a cleaned-up answer, adding the rank IDs back where they belong.
 348
 349 The JSON has the form
 350   { "graph": [ stemmagraph DOT string without newlines ],
 351     "groupings": [ array of arrays of groups, one per rank ] }
 352
 353 The answer has the form
 354   { "variants" => [ array of variant location structures ],
 355     "variant_count" => total,
 356     "conflict_count" => number of conflicts detected,
 357     "genealogical_count" => number of solutions found }
 358
 359 =cut
 360
 361 sub solve_variants {
 362         my( $stemma, @groups ) = @_;
 363         my $aclabel = $stemma->collation->ac_label;
 364
 365         # Filter the groups down to distinct groups, and work out what graph
 366         # should be used in the calculation of each group. We want to send each
 367         # distinct problem to the solver only once.
 368         # We need a whole bunch of lookup tables for this.
 369         my $index_groupkeys = {};       # Save the order of readings
 370         my $group_indices = {};         # Save the indices that have a given grouping
 371         my $graph_problems = {};        # Save the groupings for the given graph
 372
 373         foreach my $idx ( 0..$#groups ) {
 374                 my $ghash = $groups[$idx];
 375                 my @grouping;
 376                 # Sort the groupings from big to little, and scan for a.c. witnesses
 377                 # that would need an extended graph.
 378                 my @acwits;   # note which AC witnesses crop up at this rank
 379                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 380                         keys %$ghash;
 381                 foreach my $rdg ( @idxkeys ) {
 382                         my @sg = sort @{$ghash->{$rdg}};
 383                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 384                         push( @grouping, \@sg );
 385                 }
 386                 # Save the reading order
 387                 $index_groupkeys->{$idx} = \@idxkeys;
 388
 389                 # Now associate the distinct group with this index
 390                 my $gstr = wit_stringify( \@grouping );
 391                 push( @{$group_indices->{$gstr}}, $idx );
 392
 393                 # Finally, add the group to the list to be calculated for this graph.
 394                 map { s/\Q$aclabel\E$// } @acwits;
 395                 my $graph = $stemma->extend_graph( \@acwits );
 396                 unless( exists $graph_problems->{"$graph"} ) {
 397                         $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
 398                 }
 399                 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
 400         }
 401
 402         ## For each distinct graph, send its groups to the solver.
 403         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 404         my $ua = LWP::UserAgent->new();
 405         ## Witness map is a HACK to get around limitations in node names from IDP
 406         my $witness_map = {};
 407         ## Variables to store answers as they come back
 408         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 409         my $genealogical = 0;
 410         foreach my $graphkey ( keys %$graph_problems ) {
 411                 my $graph = $graph_problems->{$graphkey}->{'object'};
 412                 my $groupings = $graph_problems->{$graphkey}->{'groups'};
 413                 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
 414                         $groupings, $witness_map ) );
 415                 # Send it off and get the result
 416                 #print STDERR "Sending request: $json\n";
 417                 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 418                                                           'Content' => $json );
 419                 my $answer;
 420                 my $used_idp;
 421                 if( $resp->is_success ) {
 422                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 423                         $used_idp = 1;
 424                 } else {
 425                         # Fall back to the old method.
 426                         warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 427                                 . "; falling back to perl method";
 428                         $answer = perl_solver( $graph, @$groupings );
 429                 }
 430                 ## The answer is the evaluated groupings, plus a boolean for whether
 431                 ## they were genealogical.  Reconstruct our original groups.
 432                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 433                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 434                         if( $result ) {
 435                                 $genealogical++;
 436                                 # Prune the calculated groups, in case the IDP solver failed to.
 437                                 if( $used_idp ) {
 438                                         my @pruned_groups;
 439                                         foreach my $cg ( @$calc_groups ) {
 440                                                 # This is a little wasteful but the path of least
 441                                                 # resistance. Send both the stemma, which knows what
 442                                                 # its hypotheticals are, and the actual graph used.
 443                                                 my @pg = _prune_group( $cg, $stemma, $graph );
 444                                                 push( @pruned_groups, \@pg );
 445                                         }
 446                                         $calc_groups = \@pruned_groups;
 447                                 }
 448                         }
 449                         # Retrieve the key for the original group that went to the solver
 450                         my $input_group = wit_stringify( $groupings->[$gidx] );
 451                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 452                                 my @readings = @{$index_groupkeys->{$oidx}};
 453                                 my $vstruct = {
 454                                         'genealogical' => $result,
 455                                         'readings' => [],
 456                                 };
 457                                 foreach my $ridx ( 0 .. $#readings ) {
 458                                         push( @{$vstruct->{'readings'}},
 459                                                 { 'readingid' => $readings[$ridx],
 460                                                   'group' => $calc_groups->[$ridx] } );
 461                                 }
 462                                 $variants->[$oidx] = $vstruct;
 463                         }
 464                 }
 465         }
 466
 467         return { 'variants' => $variants,
 468                          'variant_count' => scalar @$variants,
 469                          'genealogical_count' => $genealogical };
 470 }
 471
 472 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 473
 474 sub _safe_wit_strings {
 475         my( $graph, $c, $groupings, $witness_map ) = @_;
 476         # Parse the graph we were given into a stemma.
 477         my $safegraph = Graph->new();
 478         # Convert the graph to a safe representation and store the conversion.
 479         foreach my $n ( $graph->vertices ) {
 480                 my $sn = _safe_witstr( $n );
 481                 if( exists $witness_map->{$sn} ) {
 482                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 483                                 if $witness_map->{$sn} ne $n;
 484                 } else {
 485                         $witness_map->{$sn} = $n;
 486                 }
 487                 $safegraph->add_vertex( $sn );
 488                 $safegraph->set_vertex_attributes( $sn,
 489                         $graph->get_vertex_attributes( $n ) );
 490         }
 491         foreach my $e ( $graph->edges ) {
 492                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 493                 $safegraph->add_edge( @safe_e );
 494         }
 495         my $safe_stemma = Text::Tradition::Stemma->new(
 496                 'collation' => $c, 'graph' => $safegraph );
 497
 498         # Now convert the witness groupings to a safe representation.
 499         my $safe_groupings = [];
 500         foreach my $grouping ( @$groupings ) {
 501                 my $safe_grouping = [];
 502                 foreach my $group ( @$grouping ) {
 503                         my $safe_group = [];
 504                         foreach my $n ( @$group ) {
 505                                 my $sn = _safe_witstr( $n );
 506                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 507                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 508                                 $witness_map->{$sn} = $n;
 509                                 push( @$safe_group, $sn );
 510                         }
 511                         push( @$safe_grouping, $safe_group );
 512                 }
 513                 push( @$safe_groupings, $safe_grouping );
 514         }
 515
 516         # Return it all in the struct we expect.  We have stored the reductions
 517         # in the $witness_map that we were passed.
 518         return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
 519                          'groupings' => $safe_groupings };
 520 }
 521
 522 sub _safe_witstr {
 523         my $witstr = shift;
 524         $witstr =~ s/\s+/_/g;
 525         $witstr =~ s/[^\w\d-]//g;
 526         return $witstr;
 527 }
 528
 529 sub _desanitize_names {
 530         my( $jsonstruct, $witness_map ) = @_;
 531         my $result = [];
 532         foreach my $grouping ( @$jsonstruct ) {
 533                 my $real_grouping = [];
 534                 foreach my $element ( @$grouping ) {
 535                         if( ref( $element ) eq 'ARRAY' ) {
 536                                 # it's the groupset.
 537                                 my $real_groupset = [];
 538                                 foreach my $group ( @$element ) {
 539                                         my $real_group = [];
 540                                         foreach my $n ( @$group ) {
 541                                                 my $rn = $witness_map->{$n};
 542                                                 push( @$real_group, $rn );
 543                                         }
 544                                         push( @$real_groupset, $real_group );
 545                                 }
 546                                 push( @$real_grouping, $real_groupset );
 547                         } else {
 548                                 # It is the boolean, not actually a group.
 549                                 push( @$real_grouping, $element );
 550                         }
 551                 }
 552                 push( @$result, $real_grouping );
 553         }
 554         return $result;
 555 }
 556
 557 ### END HACKERY ###
 558
 559 =head2 analyze_location ( $tradition, $graph, $location_hash )
 560
 561 Given the tradition, its stemma graph, and the solution from the graph solver,
 562 work out the rest of the information we want.  For each reading we need missing,
 563 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 564
 565 =cut
 566
 567 sub analyze_location {
 568         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 569         my $c = $tradition->collation;
 570
 571         # Make a hash of all known node memberships, and make the subgraphs.
 572         my $contig = {};
 573         my $reading_roots = {};
 574         my $subgraph = {};
 575         $DB::single = 1 if $variant_row->{id} == 6;
 576         # Note which witnesses positively belong to which group
 577     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 578         my $rid = $rdghash->{'readingid'};
 579                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 580         }
 581
 582         # Now, armed with that knowledge, make a subgraph for each reading
 583         # and note the root(s) of each subgraph.
 584         foreach my $rdghash( @{$variant_row->{'readings'}} ) {
 585         my $rid = $rdghash->{'readingid'};
 586         my %rdgwits;
 587         # Make the subgraph.
 588         my $part = $graph->copy;
 589         my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
 590                 keys %$contig;
 591         $part->delete_vertices( @todelete );
 592         _prune_subtree( $part, $lacunose );
 593                 $subgraph->{$rid} = $part;
 594                 # Record the remaining lacunose nodes as part of this group, if
 595                 # we are dealing with a non-genealogical reading.
 596                 unless( $variant_row->{'genealogical'} ) {
 597                         map { $contig->{$_} = $rid } $part->vertices;
 598                 }
 599                 # Get the reading roots.
 600                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 601         }
 602
 603         # Now that we have all the node group memberships, calculate followed/
 604     # non-followed/unknown values for each reading.  Also figure out the
 605     # reading's evident parent(s).
 606     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 607         my $rid = $rdghash->{'readingid'};
 608         # Get the subgraph
 609         my $part = $subgraph->{$rid};
 610
 611         # Start figuring things out.
 612         my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
 613         $rdghash->{'independent_occurrence'} = \@roots;
 614         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 615         # Find the parent readings, if any, of this reading.
 616         my $rdgparents = {};
 617         foreach my $wit ( @roots ) {
 618                 # Look in the main stemma to find this witness's extant or known-reading
 619                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 620                         my @check = $graph->predecessors( $wit );
 621                         while( @check ) {
 622                                 my @next;
 623                                 foreach my $wparent( @check ) {
 624                                         my $preading = $contig->{$wparent};
 625                                         if( $preading ) {
 626                                                 $rdgparents->{$preading} = 1;
 627                                         } else {
 628                                                 push( @next, $graph->predecessors( $wparent ) );
 629                                         }
 630                                 }
 631                                 @check = @next;
 632                         }
 633                 }
 634                 foreach my $p ( keys %$rdgparents ) {
 635                         # Resolve the relationship of the parent to the reading, and
 636                         # save it in our hash.
 637                         my $pobj = $c->reading( $p );
 638                         my $relation;
 639                         my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 640                         if( $pobj ) {
 641                                 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 642                                 if( $rel ) {
 643                                         $relation = { type => $rel->type };
 644                                         if( $rel->has_annotation ) {
 645                                                 $relation->{'annotation'} = $rel->annotation;
 646                                         }
 647                                 }
 648                         }
 649                         $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
 650                 }
 651
 652                 $rdghash->{'reading_parents'} = $rdgparents;
 653
 654                 # Find the number of times this reading was altered, and the number of
 655                 # times we're not sure.
 656                 my( %nofollow, %unknownfollow );
 657                 foreach my $wit ( $part->vertices ) {
 658                         foreach my $wchild ( $graph->successors( $wit ) ) {
 659                                 next if $part->has_vertex( $wchild );
 660                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 661                                         # It definitely changed here.
 662                                         $nofollow{$wchild} = 1;
 663                                 } elsif( !($contig->{$wchild}) ) {
 664                                         # The child is a hypothetical node not definitely in
 665                                         # any group. Answer is unknown.
 666                                         $unknownfollow{$wchild} = 1;
 667                                 } # else it's a non-root node in a known group, and therefore
 668                                   # is presumed to have its reading from its group, not this link.
 669                         }
 670                 }
 671                 $rdghash->{'not_followed'} = keys %nofollow;
 672                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 673
 674                 # Now say whether this reading represents a conflict.
 675                 unless( $variant_row->{'genealogical'} ) {
 676                         $rdghash->{'conflict'} = @roots != 1;
 677                 }
 678     }
 679 }
 680
 681
 682 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 683
 684 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 685 to be correct.  Serves as a backup for the real solver.
 686
 687 Runs an analysis of the given tradition, at the location given in $rank,
 688 against the graph of the stemma specified in $stemma_id.  The argument
 689 @merge_relationship_types is an optional list of relationship types for
 690 which readings so related should be treated as equivalent.
 691
 692 Returns a nested array data structure as follows:
 693
 694  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 695
 696 where the group list is the array of arrays passed in for each element of @groups,
 697 possibly with the addition of hypothetical readings.
 698
 699
 700 =cut
 701
 702 sub perl_solver {
 703         my( $graph, @groups ) = @_;
 704         my @answer;
 705         foreach my $g ( @groups ) {
 706                 push( @answer, _solve_variant_location( $graph, $g ) );
 707         }
 708         return \@answer;
 709 }
 710
 711 sub _solve_variant_location {
 712         my( $graph, $groups ) = @_;
 713         # Now do the work.
 714     my $contig = {};
 715     my $subgraph = {};
 716     my $is_conflicted;
 717     my $conflict = {};
 718
 719     # Mark each ms as in its own group, first.
 720     foreach my $g ( @$groups ) {
 721         my $gst = wit_stringify( $g );
 722         map { $contig->{$_} = $gst } @$g;
 723     }
 724
 725     # Now for each unmarked node in the graph, initialize an array
 726     # for possible group memberships.  We will use this later to
 727     # resolve potential conflicts.
 728     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 729     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 730         my $gst = wit_stringify( $g );  # This is the group name
 731         # Copy the graph, and delete all non-members from the new graph.
 732         my $part = $graph->copy;
 733         my @group_roots;
 734         $part->delete_vertices(
 735             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 736
 737         # Now look to see if our group is connected.
 738                 if( @$g > 1 ) {
 739                         # We have to take directionality into account.
 740                         # How many root nodes do we have?
 741                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 742                                 $part->predecessorless_vertices;
 743                         # Assuming that @$g > 1, find the first root node that has at
 744                         # least one successor belonging to our group. If this reading
 745                         # is genealogical, there should be only one, but we will check
 746                         # that implicitly later.
 747                         foreach my $root ( @roots ) {
 748                                 # Prune the tree to get rid of extraneous hypotheticals.
 749                                 $root = _prune_subtree_old( $part, $root, $contig );
 750                                 next unless $root;
 751                                 # Save this root for our group.
 752                                 push( @group_roots, $root );
 753                                 # Get all the successor nodes of our root.
 754                         }
 755                 } else {
 756                         # Dispense with the trivial case of one reading.
 757                         my $wit = $g->[0];
 758                         @group_roots = ( $wit );
 759                         foreach my $v ( $part->vertices ) {
 760                                 $part->delete_vertex( $v ) unless $v eq $wit;
 761                         }
 762         }
 763
 764         if( @group_roots > 1 ) {
 765                 $conflict->{$gst} = 1;
 766                 $is_conflicted = 1;
 767         }
 768         # Paint the 'hypotheticals' with our group.
 769                 foreach my $wit ( $part->vertices ) {
 770                         if( ref( $contig->{$wit} ) ) {
 771                                 push( @{$contig->{$wit}}, $gst );
 772                         } elsif( $contig->{$wit} ne $gst ) {
 773                                 warn "How did we get here?";
 774                         }
 775                 }
 776
 777
 778                 # Save the relevant subgraph.
 779                 $subgraph->{$gst} = $part;
 780     }
 781
 782         # For each of our hypothetical readings, flatten its 'contig' array if
 783         # the array contains zero or one group.  If we have any unflattened arrays,
 784         # we may need to run the resolution process. If the reading is already known
 785         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 786         # it.
 787         my @resolve;
 788         foreach my $wit ( keys %$contig ) {
 789                 next unless ref( $contig->{$wit} );
 790                 if( @{$contig->{$wit}} > 1 ) {
 791                         if( $is_conflicted ) {
 792                                 $contig->{$wit} = '';  # We aren't going to decide.
 793                         } else {
 794                                 push( @resolve, $wit );
 795                         }
 796                 } else {
 797                         my $gst = pop @{$contig->{$wit}};
 798                         $contig->{$wit} = $gst || '';
 799                 }
 800         }
 801
 802     if( @resolve ) {
 803         my $still_contig = {};
 804         foreach my $h ( @resolve ) {
 805             # For each of the hypothetical readings with more than one possibility,
 806             # try deleting it from each of its member subgraphs in turn, and see
 807             # if that breaks the contiguous grouping.
 808             # TODO This can still break in a corner case where group A can use
 809             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 810             # Revisit this if necessary; it could get brute-force nasty.
 811             foreach my $gst ( @{$contig->{$h}} ) {
 812                 my $gpart = $subgraph->{$gst}->copy();
 813                 # If we have come this far, there is only one root and everything
 814                 # is reachable from it.
 815                 my( $root ) = $gpart->predecessorless_vertices;
 816                 my $reachable = {};
 817                 map { $reachable->{$_} = 1 } $gpart->vertices;
 818
 819                 # Try deleting the hypothetical node.
 820                 $gpart->delete_vertex( $h );
 821                 if( $h eq $root ) {
 822                         # See if we still have a single root.
 823                         my @roots = $gpart->predecessorless_vertices;
 824                         warn "This shouldn't have happened" unless @roots;
 825                         if( @roots > 1 ) {
 826                                 # $h is needed by this group.
 827                                 if( exists( $still_contig->{$h} ) ) {
 828                                         # Conflict!
 829                                         $conflict->{$gst} = 1;
 830                                         $still_contig->{$h} = '';
 831                                 } else {
 832                                         $still_contig->{$h} = $gst;
 833                                 }
 834                         }
 835                 } else {
 836                         # $h is somewhere in the middle. See if everything
 837                         # else can still be reached from the root.
 838                                         my %still_reachable = ( $root => 1 );
 839                                         map { $still_reachable{$_} = 1 }
 840                                                 $gpart->all_successors( $root );
 841                                         foreach my $v ( keys %$reachable ) {
 842                                                 next if $v eq $h;
 843                                                 if( !$still_reachable{$v}
 844                                                         && ( $contig->{$v} eq $gst
 845                                                                  || ( exists $still_contig->{$v}
 846                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 847                                                         # We need $h.
 848                                                         if( exists $still_contig->{$h} ) {
 849                                                                 # Conflict!
 850                                                                 $conflict->{$gst} = 1;
 851                                                                 $still_contig->{$h} = '';
 852                                                         } else {
 853                                                                 $still_contig->{$h} = $gst;
 854                                                         }
 855                                                         last;
 856                                                 } # else we don't need $h in this group.
 857                                         } # end foreach $v
 858                                 } # endif $h eq $root
 859             } # end foreach $gst
 860         } # end foreach $h
 861
 862         # Now we have some hypothetical vertices in $still_contig that are the
 863         # "real" group memberships.  Replace these in $contig.
 864                 foreach my $v ( keys %$contig ) {
 865                         next unless ref $contig->{$v};
 866                         $contig->{$v} = $still_contig->{$v};
 867                 }
 868     } # end if @resolve
 869
 870     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 871         my $variant_row = [ [], $is_genealogical ];
 872         # Fill in the groupings from $contig.
 873         foreach my $g ( @$groups ) {
 874         my $gst = wit_stringify( $g );
 875         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 876         push( @{$variant_row->[0]}, \@realgroup );
 877     }
 878     return $variant_row;
 879 }
 880
 881 sub _prune_group {
 882         my( $group, $stemma, $graph ) = @_;
 883         my $lacunose = {};
 884         map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
 885         map { $lacunose->{$_} = 0 } @$group;
 886         # Make our subgraph
 887         my $subgraph = $graph->copy;
 888         map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
 889                 $subgraph->vertices;
 890         # ...and find the root.
 891         # Now prune and return the remaining vertices.
 892         _prune_subtree( $subgraph, $lacunose );
 893         return $subgraph->vertices;
 894 }
 895
 896 sub _prune_subtree {
 897         my( $tree, $lacunose ) = @_;
 898
 899         # Delete lacunose witnesses that have no successors
 900     my @orphan_hypotheticals;
 901     my $ctr = 0;
 902     do {
 903         die "Infinite loop on leaves" if $ctr > 100;
 904         @orphan_hypotheticals = grep { $lacunose->{$_} }
 905                 $tree->successorless_vertices;
 906         $tree->delete_vertices( @orphan_hypotheticals );
 907         $ctr++;
 908     } while( @orphan_hypotheticals );
 909
 910         # Delete lacunose roots that have a single successor
 911         my @redundant_root;
 912         $ctr = 0;
 913         do {
 914         die "Infinite loop on roots" if $ctr > 100;
 915                 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
 916                         $tree->predecessorless_vertices;
 917                 $tree->delete_vertices( @redundant_root );
 918                 $ctr++;
 919         } while( @redundant_root );
 920 }
 921
 922 sub _prune_subtree_old {
 923     my( $tree, $root, $contighash ) = @_;
 924     # First, delete hypothetical leaves / orphans until there are none left.
 925     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 926         $tree->successorless_vertices;
 927     while( @orphan_hypotheticals ) {
 928         $tree->delete_vertices( @orphan_hypotheticals );
 929         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 930             $tree->successorless_vertices;
 931     }
 932     # Then delete a hypothetical root with only one successor, moving the
 933     # root to the first child that has no other predecessors.
 934     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 935         my @nextroot = $tree->successors( $root );
 936         $tree->delete_vertex( $root );
 937         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 938     }
 939     # The tree has been modified in place, but we need to know the new root.
 940     $root = undef unless $root && $tree->has_vertex( $root );
 941     return $root;
 942 }
 943 # Add the variant, subject to a.c. representation logic.
 944 # This assumes that we will see the 'main' version before the a.c. version.
 945 sub add_variant_wit {
 946     my( $arr, $wit, $acstr ) = @_;
 947     my $skip;
 948     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 949         my $real = $1;
 950         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 951     }
 952     push( @$arr, $wit ) unless $skip;
 953 }
 954
 955 sub _useful_variant {
 956         my( $group_readings, $graph, $acstr ) = @_;
 957
 958         # TODO Decide what to do with AC witnesses
 959
 960         # Sort by group size and return
 961         my $is_useful = 0;
 962         my( @readings, @groups );   # The sorted groups for our answer.
 963         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 964                 keys %$group_readings ) {
 965                 push( @readings, $rdg );
 966                 push( @groups, $group_readings->{$rdg} );
 967                 if( @{$group_readings->{$rdg}} > 1 ) {
 968                         $is_useful++;
 969                 } else {
 970                         my( $wit ) = @{$group_readings->{$rdg}};
 971                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 972                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 973                 }
 974         }
 975         if( $is_useful > 1 ) {
 976                 return( \@readings, \@groups );
 977         } else {
 978                 return( [], [] );
 979         }
 980 }
 981
 982 =head2 wit_stringify( $groups )
 983
 984 Takes an array of witness groupings and produces a string like
 985 ['A','B'] / ['C','D','E'] / ['F']
 986
 987 =cut
 988
 989 sub wit_stringify {
 990     my $groups = shift;
 991     my @gst;
 992     # If we were passed an array of witnesses instead of an array of
 993     # groupings, then "group" the witnesses first.
 994     unless( ref( $groups->[0] ) ) {
 995         my $mkgrp = [ $groups ];
 996         $groups = $mkgrp;
 997     }
 998     foreach my $g ( @$groups ) {
 999         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1000     }
1001     return join( ' / ', @gst );
1002 }
1003
1004 sub _symmdiff {
1005         my( $lista, $listb ) = @_;
1006         my %union;
1007         my %scalars;
1008         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1009         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1010         my @set = grep { $union{$_} == 1 } keys %union;
1011         return map { $scalars{$_} } @set;
1012 }
1013
1014 1;
1015
1016 =head1 LICENSE
1017
1018 This package is free software and is provided "as is" without express
1019 or implied warranty.  You can redistribute it and/or modify it under
1020 the same terms as Perl itself.
1021
1022 =head1 AUTHOR
1023
1024 Tara L Andrews E<lt>aurum@cpan.orgE<gt>