lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Benchmark;
   6 use Encode qw/ encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ encode_json decode_json /;
  10 use LWP::UserAgent;
  11 use Text::Tradition;
  12 use Text::Tradition::Stemma;
  13
  14 use vars qw/ @EXPORT_OK /;
  15 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  16
  17 =head1 NAME
  18
  19 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  20
  21 =head1 SYNOPSIS
  22
  23   use Text::Tradition;
  24   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  25   my $t = Text::Tradition->new(
  26     'name' => 'this is a text',
  27     'input' => 'TEI',
  28     'file' => '/path/to/tei_parallel_seg_file.xml' );
  29   $t->add_stemma( 'dotfile' => $stemmafile );
  30
  31   my $variant_data = run_analysis( $tradition );
  32   # Recalculate rank $n treating all orthographic variants as equivalent
  33   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  34
  35 =head1 DESCRIPTION
  36
  37 Text::Tradition is a library for representation and analysis of collated
  38 texts, particularly medieval ones.  The Collation is the central feature of
  39 a Tradition, where the text, its sequence of readings, and its relationships
  40 between readings are actually kept.
  41
  42 =head1 SUBROUTINES
  43
  44 =head2 run_analysis( $tradition, %opts )
  45
  46 Runs the analysis described in analyze_variant_location on every location in the
  47 collation of the given tradition, with the given options. These include:
  48
  49 =over 4
  50
  51 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  52 is 0 (i.e. the first).
  53
  54 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  55
  56 =item * merge_types - Specify a list of relationship types, where related readings
  57 should be treated as identical for the purposes of analysis.
  58
  59 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  60
  61 =back
  62
  63 =begin testing
  64
  65 use Text::Tradition;
  66 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  67
  68 my $datafile = 't/data/florilegium_tei_ps.xml';
  69 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  70                                       'name' => 'test0',
  71                                       'file' => $datafile );
  72 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  73 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  74
  75 my %expected_genealogical = (
  76         1 => 0,
  77         2 => 1,
  78         3 =>  0,
  79         5 =>  0,
  80         7 =>  0,
  81         8 =>  0,
  82         10 => 0,
  83         13 => 1,
  84         33 => 0,
  85         34 => 0,
  86         37 => 0,
  87         60 => 0,
  88         81 => 1,
  89         84 => 0,
  90         87 => 0,
  91         101 => 0,
  92         102 => 0,
  93         122 => 1,
  94         157 => 0,
  95         166 => 1,
  96         169 => 1,
  97         200 => 0,
  98         216 => 1,
  99         217 => 1,
 100         219 => 1,
 101         241 => 1,
 102         242 => 1,
 103         243 => 1,
 104 );
 105
 106 my $data = run_analysis( $tradition );
 107 my $c = $tradition->collation;
 108 foreach my $row ( @{$data->{'variants'}} ) {
 109         # Account for rows that used to be "not useful"
 110         unless( exists $expected_genealogical{$row->{'id'}} ) {
 111                 $expected_genealogical{$row->{'id'}} = 1;
 112         }
 113         is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
 114                 "Got correct genealogical flag for row " . $row->{'id'} );
 115         # Check that we have the right row with the right groups
 116         my $rank = $row->{'id'};
 117         foreach my $rdghash ( @{$row->{'readings'}} ) {
 118                 # Skip 'readings' that aren't really
 119                 next unless $c->reading( $rdghash->{'readingid'} );
 120                 # Check the rank
 121                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 122                         "Got correct reading rank" );
 123                 # Check the witnesses
 124                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 125                 my @sgrp = sort @{$rdghash->{'group'}};
 126                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 127         }
 128 }
 129 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 130 # TODO Make something meaningful of conflict count, maybe test other bits
 131
 132 =end testing
 133
 134 =cut
 135
 136 sub run_analysis {
 137         my( $tradition, %opts ) = @_;
 138         my $c = $tradition->collation;
 139
 140         my $stemma_id = $opts{'stemma_id'} || 0;
 141         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 142         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 143
 144         # Get the stemma
 145         my $stemma = $tradition->stemma( $stemma_id );
 146
 147         # Figure out which witnesses we are working with - that is, the ones that
 148         # appear both in the stemma and in the tradition. All others are 'lacunose'
 149         # for our purposes.
 150         my @lacunose = $stemma->hypotheticals;
 151         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 152         map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
 153                 $tradition->witnesses;
 154         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 155
 156         # Find and mark 'common' ranks for exclusion, unless they were
 157         # explicitly specified.
 158         unless( @ranks ) {
 159                 my %common_rank;
 160                 foreach my $rdg ( $c->common_readings ) {
 161                         $common_rank{$rdg->rank} = 1;
 162                 }
 163                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 164         }
 165
 166         # Group the variants to send to the solver
 167         my @groups;
 168         my @use_ranks;
 169         my %lacunae;
 170         foreach my $rank ( @ranks ) {
 171                 my $missing = [ @lacunose ];
 172                 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
 173                 if( $opts{'exclude_type1'} ) {
 174                         # Check to see whether this is a "useful" group.
 175                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 176                                 $stemma->graph, $c->ac_label );
 177                         next unless @$rdgs;
 178                 }
 179                 push( @use_ranks, $rank );
 180                 push( @groups, $rankgroup );
 181                 $lacunae{$rank} = $missing;
 182         }
 183         # Run the solver
 184         my $answer = solve_variants( $stemma, @groups );
 185
 186         # Do further analysis on the answer
 187         my $conflict_count = 0;
 188         my $aclabel = $c->ac_label;
 189         foreach my $idx ( 0 .. $#use_ranks ) {
 190                 my $location = $answer->{'variants'}->[$idx];
 191                 # Add the rank back in
 192                 $location->{'id'} = $use_ranks[$idx];
 193                 $DB::single = 1 if $use_ranks[$idx] == 87;
 194                 # Note what our lacunae are
 195                 my %lmiss;
 196                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 197                 # Run through the reading groups and add as 'lacunae' any redundant
 198                 # a.c. witnesses (yes, we have to do this before the analysis, thus
 199                 # identical loops before and after. Boo.)
 200                 # TODO Consider making these callbacks to analyze_location
 201                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 202                         my %rwits;
 203                         map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
 204                         foreach my $rw ( keys %rwits ) {
 205                                 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
 206                                         if( exists $rwits{$1} ) {
 207                                                 $lmiss{$rw} = 1;
 208                                                 delete $rwits{$rw};
 209                                         }
 210                                 }
 211                         }
 212                         $rdghash->{'group'} = [ keys %rwits ];
 213                 }
 214                 $location->{'missing'} = [ keys %lmiss ];
 215
 216                 # Run the extra analysis we need.
 217                 analyze_location( $tradition, $stemma->graph, $location );
 218
 219                 # Do the final post-analysis tidying up of the data.
 220                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 221                         $conflict_count++
 222                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 223                         # Add the reading text back in
 224                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 225                         $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
 226                         # Remove lacunose witnesses from this reading's list now that the
 227                         # analysis is done
 228                         my @realgroup;
 229                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 230                         $rdghash->{'group'} = \@realgroup;
 231                         # TODO Record hypotheticals used to create group, if we end up
 232                         # needing it
 233                 }
 234         }
 235         $answer->{'conflict_count'} = $conflict_count;
 236
 237         return $answer;
 238 }
 239
 240 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 241
 242 Groups the variants at the given $rank of the collation, treating any
 243 relationships in @merge_relationship_types as equivalent.  $lacunose should
 244 be a reference to an array, to which the sigla of lacunose witnesses at this
 245 rank will be appended.
 246
 247 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 248 in $group_readings->{$rdg}.
 249
 250 =cut
 251
 252 # Return group_readings, groups, lacunose
 253 sub group_variants {
 254         my( $tradition, $rank, $lacunose, $collapse ) = @_;
 255         my $c = $tradition->collation;
 256         my $aclabel =  $c->ac_label;
 257         my %seen_acwits;
 258         map { $seen_acwits{$_->sigil.$aclabel} = 0 if $_->is_layered } $tradition->witnesses;
 259         # Get the alignment table readings
 260         my %readings_at_rank;
 261         my %is_lacunose; # lookup table for $lacunose
 262         map { $is_lacunose{$_} = 1 } @$lacunose;
 263         my @gap_wits;
 264         foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
 265                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 266                 my $wit = $tablewit->{'witness'};
 267                 # Exclude the witness if it is "lacunose" which if we got here
 268                 # means "not in the stemma".
 269                 next if $is_lacunose{$wit};
 270                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 271                         push( @$lacunose, $wit );
 272                 } elsif( $rdg ) {
 273                         $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
 274                 } else {
 275                         $seen_acwits{$wit} = 1 if exists $seen_acwits{$wit};
 276                         push( @gap_wits, $wit );
 277                 }
 278         }
 279
 280         # Group the readings, collapsing groups by relationship if needed
 281         my %grouped_readings;
 282         foreach my $rdg ( values %readings_at_rank ) {
 283                 # Skip readings that have been collapsed into others.
 284                 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
 285                 # Get the witness list, including from readings collapsed into this one.
 286                 my @wits = $rdg->witnesses;
 287                 if( $collapse ) {
 288                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 289                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 290                                 my @otherwits = $other->witnesses;
 291                                 push( @wits, @otherwits );
 292                                 $grouped_readings{$other->id} = 0;
 293                         }
 294                 }
 295                 # Filter the group to those witnesses in the stemma, and note any
 296                 # a.c. witnesses explicitly returned.
 297                 my @use_wits;
 298                 foreach my $wit ( @wits ) {
 299                         next if $is_lacunose{$wit};
 300                         push( @use_wits, $wit );
 301                         $seen_acwits{$wit} = 1 if exists $seen_acwits{$wit};
 302                 }
 303                 $grouped_readings{$rdg->id} = \@use_wits;
 304         }
 305         $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
 306         # Get rid of our collapsed readings
 307         map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
 308                 keys %grouped_readings
 309                 if $collapse;
 310         # Any unseen a.c. witnesses should be made lacunose
 311         map { push( @$lacunose, $_ ) unless $seen_acwits{$_} } keys %seen_acwits;
 312
 313         # Return the result
 314         return \%grouped_readings;
 315 }
 316
 317 =head2 solve_variants( $graph, @groups )
 318
 319 Sends the set of groups to the external graph solver service and returns
 320 a cleaned-up answer, adding the rank IDs back where they belong.
 321
 322 The JSON has the form
 323   { "graph": [ stemmagraph DOT string without newlines ],
 324     "groupings": [ array of arrays of groups, one per rank ] }
 325
 326 The answer has the form
 327   { "variants" => [ array of variant location structures ],
 328     "variant_count" => total,
 329     "conflict_count" => number of conflicts detected,
 330     "genealogical_count" => number of solutions found }
 331
 332 =cut
 333
 334 sub solve_variants {
 335         my( $stemma, @groups ) = @_;
 336
 337         # Make the json with stemma + groups
 338         my $groupings = [];
 339         foreach my $ghash ( @groups ) {
 340                 my @grouping;
 341                 foreach my $k ( sort keys %$ghash ) {
 342                         push( @grouping, $ghash->{$k} );
 343                 }
 344                 push( @$groupings, \@grouping );
 345         }
 346         ## Witness map is a HACK to get around limitations in node names from IDP
 347         my $witness_map = {};
 348         my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
 349
 350         # Send it off and get the result
 351         my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 352         my $ua = LWP::UserAgent->new();
 353         my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
 354                                                   'Content' => $json );
 355
 356         my $answer;
 357         my $used_idp;
 358         if( $resp->is_success ) {
 359                 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 360                 $used_idp = 1;
 361         } else {
 362                 # Fall back to the old method.
 363                 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
 364                         . "; falling back to perl method";
 365                 $answer = perl_solver( $stemma, @$groupings );
 366         }
 367
 368         # Fold the result back into what we know about the groups.
 369         my $variants = [];
 370         my $genealogical = 0;
 371         foreach my $idx ( 0 .. $#groups ) {
 372                 my( $calc_groups, $result ) = @{$answer->[$idx]};
 373                 if( $result ) {
 374                         $genealogical++;
 375                         # Prune the calculated groups, in case the IDP solver failed to.
 376                         if( $used_idp ) {
 377                                 my @pruned_groups;
 378                                 foreach my $cg ( @$calc_groups ) {
 379                                         my @pg = _prune_group( $cg, $stemma );
 380                                         push( @pruned_groups, \@pg );
 381                                 }
 382                                 $calc_groups = \@pruned_groups;
 383                         }
 384                 }
 385                 my $input_group = $groups[$idx];
 386                 foreach my $k ( sort keys %$input_group ) {
 387                         my $cg = shift @$calc_groups;
 388                         $input_group->{$k} = $cg;
 389                 }
 390                 my $vstruct = {
 391                         'genealogical' => $result,
 392                         'readings' => [],
 393                 };
 394                 foreach my $k ( sort { @{$input_group->{$b}} <=> @{$input_group->{$a}} }
 395                                                         keys %$input_group ) {
 396                         push( @{$vstruct->{'readings'}},
 397                                   { 'readingid' => $k, 'group' => $input_group->{$k}} );
 398                 }
 399                 push( @$variants, $vstruct );
 400         }
 401
 402         return { 'variants' => $variants,
 403                          'variant_count' => scalar @$variants,
 404                          'genealogical_count' => $genealogical };
 405 }
 406
 407 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 408
 409 sub _safe_wit_strings {
 410         my( $stemma, $groupings, $witness_map ) = @_;
 411         my $safegraph = Graph->new();
 412         # Convert the graph to a safe representation and store the conversion.
 413         foreach my $n ( $stemma->graph->vertices ) {
 414                 my $sn = _safe_witstr( $n );
 415                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 416                         if exists $witness_map->{$sn};
 417                 $witness_map->{$sn} = $n;
 418                 $safegraph->add_vertex( $sn );
 419                 $safegraph->set_vertex_attributes( $sn,
 420                         $stemma->graph->get_vertex_attributes( $n ) );
 421         }
 422         foreach my $e ( $stemma->graph->edges ) {
 423                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 424                 $safegraph->add_edge( @safe_e );
 425         }
 426         my $safe_stemma = Text::Tradition::Stemma->new(
 427                 'collation' => $stemma->collation, 'graph' => $safegraph );
 428
 429         # Now convert the witness groupings to a safe representation.
 430         my $safe_groupings = [];
 431         foreach my $grouping ( @$groupings ) {
 432                 my $safe_grouping = [];
 433                 foreach my $group ( @$grouping ) {
 434                         my $safe_group = [];
 435                         foreach my $n ( @$group ) {
 436                                 my $sn = _safe_witstr( $n );
 437                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 438                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 439                                 $witness_map->{$sn} = $n;
 440                                 push( @$safe_group, $sn );
 441                         }
 442                         push( @$safe_grouping, $safe_group );
 443                 }
 444                 push( @$safe_groupings, $safe_grouping );
 445         }
 446
 447         # Return it all in the struct we expect.  We have stored the reductions
 448         # in the $witness_map that we were passed.
 449         return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
 450 }
 451
 452 sub _safe_witstr {
 453         my $witstr = shift;
 454         $witstr =~ s/\s+/_/g;
 455         $witstr =~ s/[^\w\d-]//g;
 456         return $witstr;
 457 }
 458
 459 sub _desanitize_names {
 460         my( $jsonstruct, $witness_map ) = @_;
 461         my $result = [];
 462         foreach my $grouping ( @$jsonstruct ) {
 463                 my $real_grouping = [];
 464                 foreach my $element ( @$grouping ) {
 465                         if( ref( $element ) eq 'ARRAY' ) {
 466                                 # it's the groupset.
 467                                 my $real_groupset = [];
 468                                 foreach my $group ( @$element ) {
 469                                         my $real_group = [];
 470                                         foreach my $n ( @$group ) {
 471                                                 my $rn = $witness_map->{$n};
 472                                                 push( @$real_group, $rn );
 473                                         }
 474                                         push( @$real_groupset, $real_group );
 475                                 }
 476                                 push( @$real_grouping, $real_groupset );
 477                         } else {
 478                                 # It is the boolean, not actually a group.
 479                                 push( @$real_grouping, $element );
 480                         }
 481                 }
 482                 push( @$result, $real_grouping );
 483         }
 484         return $result;
 485 }
 486
 487 ### END HACKERY ###
 488
 489 =head2 analyze_location ( $tradition, $graph, $location_hash )
 490
 491 Given the tradition, its stemma graph, and the solution from the graph solver,
 492 work out the rest of the information we want.  For each reading we need missing,
 493 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown.  Alters the location_hash in place.
 494
 495 =cut
 496
 497 sub analyze_location {
 498         my ( $tradition, $graph, $variant_row ) = @_;
 499
 500         # Make a hash of all known node memberships, and make the subgraphs.
 501         my $contig = {};
 502         my $reading_roots = {};
 503         my $subgraph = {};
 504     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 505         my $rid = $rdghash->{'readingid'};
 506                 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
 507
 508         # Make the subgraph.
 509         my $part = $graph->copy;
 510                 my %these_vertices;
 511                 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
 512                 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
 513                 $subgraph->{$rid} = $part;
 514                 # Get the reading roots.
 515                 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
 516         }
 517
 518         # Now that we have all the node group memberships, calculate followed/
 519     # non-followed/unknown values for each reading.  Also figure out the
 520     # reading's evident parent(s).
 521     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 522         # Group string key - TODO do we need this?
 523         my $gst = wit_stringify( $rdghash->{'group'} );
 524         my $rid = $rdghash->{'readingid'};
 525         # Get the subgraph
 526         my $part = $subgraph->{$rid};
 527
 528         # Start figuring things out.
 529         my @roots = $part->predecessorless_vertices;
 530         $rdghash->{'independent_occurrence'} = scalar @roots;
 531         $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
 532         # Find the parent readings, if any, of this reading.
 533         my %rdgparents;
 534         foreach my $wit ( @roots ) {
 535                 # Look in the main stemma to find this witness's extant or known-reading
 536                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 537                         my @check = $graph->predecessors( $wit );
 538                         while( @check ) {
 539                                 my @next;
 540                                 foreach my $wparent( @check ) {
 541                                         my $preading = $contig->{$wparent};
 542                                         if( $preading ) {
 543                                                 $rdgparents{$preading} = 1;
 544                                         } else {
 545                                                 push( @next, $graph->predecessors( $wparent ) );
 546                                         }
 547                                 }
 548                                 @check = @next;
 549                         }
 550                 }
 551                 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
 552
 553                 # Find the number of times this reading was altered, and the number of
 554                 # times we're not sure.
 555                 my( %nofollow, %unknownfollow );
 556                 foreach my $wit ( $part->vertices ) {
 557                         foreach my $wchild ( $graph->successors( $wit ) ) {
 558                                 next if $part->has_vertex( $wchild );
 559                                 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
 560                                         # It definitely changed here.
 561                                         $nofollow{$wchild} = 1;
 562                                 } elsif( !($contig->{$wchild}) ) {
 563                                         # The child is a hypothetical node not definitely in
 564                                         # any group. Answer is unknown.
 565                                         $unknownfollow{$wchild} = 1;
 566                                 } # else it's a non-root node in a known group, and therefore
 567                                   # is presumed to have its reading from its group, not this link.
 568                         }
 569                 }
 570                 $rdghash->{'not_followed'} = keys %nofollow;
 571                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 572
 573                 # Now say whether this reading represents a conflict.
 574                 unless( $variant_row->{'genealogical'} ) {
 575                         $rdghash->{'conflict'} = @roots != 1;
 576                 }
 577     }
 578 }
 579
 580
 581 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 582
 583 ** NOTE ** This method should hopefully not be called - it is not guaranteed
 584 to be correct.  Serves as a backup for the real solver.
 585
 586 Runs an analysis of the given tradition, at the location given in $rank,
 587 against the graph of the stemma specified in $stemma_id.  The argument
 588 @merge_relationship_types is an optional list of relationship types for
 589 which readings so related should be treated as equivalent.
 590
 591 Returns a nested array data structure as follows:
 592
 593  [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
 594
 595 where the group list is the array of arrays passed in for each element of @groups,
 596 possibly with the addition of hypothetical readings.
 597
 598
 599 =cut
 600
 601 sub perl_solver {
 602         my( $stemma, @groups ) = @_;
 603         my $graph = $stemma->graph;
 604         my @answer;
 605         foreach my $g ( @groups ) {
 606                 push( @answer, _solve_variant_location( $graph, $g ) );
 607         }
 608         return \@answer;
 609 }
 610
 611 sub _solve_variant_location {
 612         my( $graph, $groups ) = @_;
 613         # Now do the work.
 614     my $contig = {};
 615     my $subgraph = {};
 616     my $is_conflicted;
 617     my $conflict = {};
 618
 619     # Mark each ms as in its own group, first.
 620     foreach my $g ( @$groups ) {
 621         my $gst = wit_stringify( $g );
 622         map { $contig->{$_} = $gst } @$g;
 623     }
 624
 625     # Now for each unmarked node in the graph, initialize an array
 626     # for possible group memberships.  We will use this later to
 627     # resolve potential conflicts.
 628     map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
 629     foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
 630         my $gst = wit_stringify( $g );  # This is the group name
 631         # Copy the graph, and delete all non-members from the new graph.
 632         my $part = $graph->copy;
 633         my @group_roots;
 634         $part->delete_vertices(
 635             grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
 636
 637         # Now look to see if our group is connected.
 638                 if( @$g > 1 ) {
 639                         # We have to take directionality into account.
 640                         # How many root nodes do we have?
 641                         my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
 642                                 $part->predecessorless_vertices;
 643                         # Assuming that @$g > 1, find the first root node that has at
 644                         # least one successor belonging to our group. If this reading
 645                         # is genealogical, there should be only one, but we will check
 646                         # that implicitly later.
 647                         foreach my $root ( @roots ) {
 648                                 # Prune the tree to get rid of extraneous hypotheticals.
 649                                 $root = _prune_subtree( $part, $root, $contig );
 650                                 next unless $root;
 651                                 # Save this root for our group.
 652                                 push( @group_roots, $root );
 653                                 # Get all the successor nodes of our root.
 654                         }
 655                 } else {
 656                         # Dispense with the trivial case of one reading.
 657                         my $wit = $g->[0];
 658                         @group_roots = ( $wit );
 659                         foreach my $v ( $part->vertices ) {
 660                                 $part->delete_vertex( $v ) unless $v eq $wit;
 661                         }
 662         }
 663
 664         if( @group_roots > 1 ) {
 665                 $conflict->{$gst} = 1;
 666                 $is_conflicted = 1;
 667         }
 668         # Paint the 'hypotheticals' with our group.
 669                 foreach my $wit ( $part->vertices ) {
 670                         if( ref( $contig->{$wit} ) ) {
 671                                 push( @{$contig->{$wit}}, $gst );
 672                         } elsif( $contig->{$wit} ne $gst ) {
 673                                 warn "How did we get here?";
 674                         }
 675                 }
 676
 677
 678                 # Save the relevant subgraph.
 679                 $subgraph->{$gst} = $part;
 680     }
 681
 682         # For each of our hypothetical readings, flatten its 'contig' array if
 683         # the array contains zero or one group.  If we have any unflattened arrays,
 684         # we may need to run the resolution process. If the reading is already known
 685         # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
 686         # it.
 687         my @resolve;
 688         foreach my $wit ( keys %$contig ) {
 689                 next unless ref( $contig->{$wit} );
 690                 if( @{$contig->{$wit}} > 1 ) {
 691                         if( $is_conflicted ) {
 692                                 $contig->{$wit} = '';  # We aren't going to decide.
 693                         } else {
 694                                 push( @resolve, $wit );
 695                         }
 696                 } else {
 697                         my $gst = pop @{$contig->{$wit}};
 698                         $contig->{$wit} = $gst || '';
 699                 }
 700         }
 701
 702     if( @resolve ) {
 703         my $still_contig = {};
 704         foreach my $h ( @resolve ) {
 705             # For each of the hypothetical readings with more than one possibility,
 706             # try deleting it from each of its member subgraphs in turn, and see
 707             # if that breaks the contiguous grouping.
 708             # TODO This can still break in a corner case where group A can use
 709             # either vertex 1 or 2, and group B can use either vertex 2 or 1.
 710             # Revisit this if necessary; it could get brute-force nasty.
 711             foreach my $gst ( @{$contig->{$h}} ) {
 712                 my $gpart = $subgraph->{$gst}->copy();
 713                 # If we have come this far, there is only one root and everything
 714                 # is reachable from it.
 715                 my( $root ) = $gpart->predecessorless_vertices;
 716                 my $reachable = {};
 717                 map { $reachable->{$_} = 1 } $gpart->vertices;
 718
 719                 # Try deleting the hypothetical node.
 720                 $gpart->delete_vertex( $h );
 721                 if( $h eq $root ) {
 722                         # See if we still have a single root.
 723                         my @roots = $gpart->predecessorless_vertices;
 724                         warn "This shouldn't have happened" unless @roots;
 725                         if( @roots > 1 ) {
 726                                 # $h is needed by this group.
 727                                 if( exists( $still_contig->{$h} ) ) {
 728                                         # Conflict!
 729                                         $conflict->{$gst} = 1;
 730                                         $still_contig->{$h} = '';
 731                                 } else {
 732                                         $still_contig->{$h} = $gst;
 733                                 }
 734                         }
 735                 } else {
 736                         # $h is somewhere in the middle. See if everything
 737                         # else can still be reached from the root.
 738                                         my %still_reachable = ( $root => 1 );
 739                                         map { $still_reachable{$_} = 1 }
 740                                                 $gpart->all_successors( $root );
 741                                         foreach my $v ( keys %$reachable ) {
 742                                                 next if $v eq $h;
 743                                                 if( !$still_reachable{$v}
 744                                                         && ( $contig->{$v} eq $gst
 745                                                                  || ( exists $still_contig->{$v}
 746                                                                           && $still_contig->{$v} eq $gst ) ) ) {
 747                                                         # We need $h.
 748                                                         if( exists $still_contig->{$h} ) {
 749                                                                 # Conflict!
 750                                                                 $conflict->{$gst} = 1;
 751                                                                 $still_contig->{$h} = '';
 752                                                         } else {
 753                                                                 $still_contig->{$h} = $gst;
 754                                                         }
 755                                                         last;
 756                                                 } # else we don't need $h in this group.
 757                                         } # end foreach $v
 758                                 } # endif $h eq $root
 759             } # end foreach $gst
 760         } # end foreach $h
 761
 762         # Now we have some hypothetical vertices in $still_contig that are the
 763         # "real" group memberships.  Replace these in $contig.
 764                 foreach my $v ( keys %$contig ) {
 765                         next unless ref $contig->{$v};
 766                         $contig->{$v} = $still_contig->{$v};
 767                 }
 768     } # end if @resolve
 769
 770     my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
 771         my $variant_row = [ [], $is_genealogical ];
 772         # Fill in the groupings from $contig.
 773         foreach my $g ( @$groups ) {
 774         my $gst = wit_stringify( $g );
 775         my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
 776         push( @{$variant_row->[0]}, \@realgroup );
 777     }
 778     return $variant_row;
 779 }
 780
 781 sub _prune_group {
 782         my( $group, $stemma ) = @_;
 783         # Get these into a form prune_subtree will recognize. Make a "contighash"
 784         my $hypohash = {};
 785         map { $hypohash->{$_} = 1 } @$group;
 786         # ...with reference values for hypotheticals.
 787         map { $hypohash->{$_} = [] } $stemma->hypotheticals;
 788         # Make our subgraph
 789         my $subgraph = $stemma->graph->copy;
 790         map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
 791                 $subgraph->vertices;
 792         # ...and find the root.
 793         my( $root ) = $subgraph->predecessorless_vertices;
 794         # Now prune and return the remaining vertices.
 795         _prune_subtree( $subgraph, $root, $hypohash );
 796         return $subgraph->vertices;
 797 }
 798
 799 sub _prune_subtree {
 800     my( $tree, $root, $contighash ) = @_;
 801     # First, delete hypothetical leaves / orphans until there are none left.
 802     my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 803         $tree->successorless_vertices;
 804     while( @orphan_hypotheticals ) {
 805         $tree->delete_vertices( @orphan_hypotheticals );
 806         @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
 807             $tree->successorless_vertices;
 808     }
 809     # Then delete a hypothetical root with only one successor, moving the
 810     # root to the first child that has no other predecessors.
 811     while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
 812         my @nextroot = $tree->successors( $root );
 813         $tree->delete_vertex( $root );
 814         ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
 815     }
 816     # The tree has been modified in place, but we need to know the new root.
 817     $root = undef unless $root && $tree->has_vertex( $root );
 818     return $root;
 819 }
 820 # Add the variant, subject to a.c. representation logic.
 821 # This assumes that we will see the 'main' version before the a.c. version.
 822 sub add_variant_wit {
 823     my( $arr, $wit, $acstr ) = @_;
 824     my $skip;
 825     if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 826         my $real = $1;
 827         $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
 828     }
 829     push( @$arr, $wit ) unless $skip;
 830 }
 831
 832 sub _useful_variant {
 833         my( $group_readings, $graph, $acstr ) = @_;
 834
 835         # TODO Decide what to do with AC witnesses
 836
 837         # Sort by group size and return
 838         my $is_useful = 0;
 839         my( @readings, @groups );   # The sorted groups for our answer.
 840         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 841                 keys %$group_readings ) {
 842                 push( @readings, $rdg );
 843                 push( @groups, $group_readings->{$rdg} );
 844                 if( @{$group_readings->{$rdg}} > 1 ) {
 845                         $is_useful++;
 846                 } else {
 847                         my( $wit ) = @{$group_readings->{$rdg}};
 848                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
 849                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
 850                 }
 851         }
 852         if( $is_useful > 1 ) {
 853                 return( \@readings, \@groups );
 854         } else {
 855                 return( [], [] );
 856         }
 857 }
 858
 859 =head2 wit_stringify( $groups )
 860
 861 Takes an array of witness groupings and produces a string like
 862 ['A','B'] / ['C','D','E'] / ['F']
 863
 864 =cut
 865
 866 sub wit_stringify {
 867     my $groups = shift;
 868     my @gst;
 869     # If we were passed an array of witnesses instead of an array of
 870     # groupings, then "group" the witnesses first.
 871     unless( ref( $groups->[0] ) ) {
 872         my $mkgrp = [ $groups ];
 873         $groups = $mkgrp;
 874     }
 875     foreach my $g ( @$groups ) {
 876         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 877     }
 878     return join( ' / ', @gst );
 879 }
 880
 881 sub _symmdiff {
 882         my( $lista, $listb ) = @_;
 883         my %union;
 884         my %scalars;
 885         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
 886         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
 887         my @set = grep { $union{$_} == 1 } keys %union;
 888         return map { $scalars{$_} } @set;
 889 }
 890
 891 1;
 892
 893 =head1 LICENSE
 894
 895 This package is free software and is provided "as is" without express
 896 or implied warranty.  You can redistribute it and/or modify it under
 897 the same terms as Perl itself.
 898
 899 =head1 AUTHOR
 900
 901 Tara L Andrews E<lt>aurum@cpan.orgE<gt>