lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Benchmark;
   7 use Digest::MD5 qw/ md5_hex /;
   8 use Encode qw/ encode_utf8 /;
   9 use Exporter 'import';
  10 use Graph;
  11 use JSON qw/ to_json /;
  12 use Set::Scalar;
  13 use Text::Tradition;
  14 use Text::Tradition::Analysis::Result;
  15 use Text::Tradition::Directory;
  16 use Text::Tradition::Stemma;
  17 use TryCatch;
  18
  19 use vars qw/ @EXPORT_OK /;
  20 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  21
  22 my $unsolved_problems = {};
  23
  24 =head1 NAME
  25
  26 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  27
  28 =head1 SYNOPSIS
  29
  30   use Text::Tradition;
  31   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  32   my $t = Text::Tradition->new(
  33     'name' => 'this is a text',
  34     'input' => 'TEI',
  35     'file' => '/path/to/tei_parallel_seg_file.xml' );
  36   $t->add_stemma( 'dotfile' => $stemmafile );
  37
  38   my $variant_data = run_analysis( $tradition );
  39   # Recalculate rank $n treating all orthographic variants as equivalent
  40   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  41
  42 =head1 DESCRIPTION
  43
  44 Text::Tradition is a library for representation and analysis of collated
  45 texts, particularly medieval ones.  The Collation is the central feature of
  46 a Tradition, where the text, its sequence of readings, and its relationships
  47 between readings are actually kept.
  48
  49 =head1 SUBROUTINES
  50
  51 =head2 run_analysis( $tradition, %opts )
  52
  53 Runs the analysis described in analyze_variant_location on every location in the
  54 collation of the given tradition, with the given options. These include:
  55
  56 =over 4
  57
  58 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  59 is 0 (i.e. the first).
  60
  61 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  62
  63 =item * merge_types - Specify a list of relationship types, where related readings
  64 should be treated as identical for the purposes of analysis.
  65
  66 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  67
  68 =back
  69
  70 =begin testing
  71
  72 use Text::Tradition;
  73 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  74
  75 my $datafile = 't/data/florilegium_tei_ps.xml';
  76 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  77                                       'name' => 'test0',
  78                                       'file' => $datafile );
  79 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  80 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  81
  82 my %expected_genealogical = (
  83         1 => 0,
  84         2 => 1,
  85         3 =>  0,
  86         5 =>  0,
  87         7 =>  0,
  88         8 =>  0,
  89         10 => 0,
  90         13 => 1,
  91         33 => 0,
  92         34 => 0,
  93         37 => 0,
  94         60 => 0,
  95         81 => 1,
  96         84 => 0,
  97         87 => 0,
  98         101 => 0,
  99         102 => 0,
 100         122 => 1,
 101         157 => 0,
 102         166 => 1,
 103         169 => 1,
 104         200 => 0,
 105         216 => 1,
 106         217 => 1,
 107         219 => 1,
 108         241 => 1,
 109         242 => 1,
 110         243 => 1,
 111 );
 112
 113 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 114 my $c = $tradition->collation;
 115 foreach my $row ( @{$data->{'variants'}} ) {
 116         # Account for rows that used to be "not useful"
 117         unless( exists $expected_genealogical{$row->{'id'}} ) {
 118                 $expected_genealogical{$row->{'id'}} = 1;
 119         }
 120         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 121         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 122                 "Got correct genealogical flag for row " . $row->{'id'} );
 123         # Check that we have the right row with the right groups
 124         my $rank = $row->{'id'};
 125         foreach my $rdghash ( @{$row->{'readings'}} ) {
 126                 # Skip 'readings' that aren't really
 127                 next unless $c->reading( $rdghash->{'readingid'} );
 128                 # Check the rank
 129                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 130                         "Got correct reading rank" );
 131                 # Check the witnesses
 132                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 133                 my @sgrp = sort @{$rdghash->{'group'}};
 134                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 135         }
 136 }
 137 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 138 # TODO Make something meaningful of conflict count, maybe test other bits
 139
 140 =end testing
 141
 142 =cut
 143
 144 sub run_analysis {
 145         my( $tradition, %opts ) = @_;
 146         my $c = $tradition->collation;
 147         my $aclabel = $c->ac_label;
 148
 149         my $stemma_id = $opts{'stemma_id'} || 0;
 150         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 151         my $collapse = Set::Scalar->new();
 152         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 153                 $collapse->insert( @{$opts{'merge_types'}} );
 154         } elsif( $opts{'merge_types'} ) {
 155                 $collapse->insert( $opts{'merge_types'} );
 156         }
 157
 158         # Make sure we have a lookup DB for graph calculation results; this will die
 159         # if calcdir or calcdsn isn't passed.
 160         my $dir = $opts{'calcdir'} ? delete $opts{'calcdir'}
 161                 : Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 162
 163         # Get the stemma
 164         my $stemma = $tradition->stemma( $stemma_id );
 165
 166         # Figure out which witnesses we are working with - that is, the ones that
 167         # appear both in the stemma and in the tradition. All others are 'lacunose'
 168         # for our purposes.
 169         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 170         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 171         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 172         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 173
 174         # Find and mark 'common' ranks for exclusion, unless they were
 175         # explicitly specified.
 176         unless( @ranks ) {
 177                 my %common_rank;
 178                 foreach my $rdg ( $c->common_readings ) {
 179                         $common_rank{$rdg->rank} = 1;
 180                 }
 181                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 182         }
 183
 184         # Group the variants to send to the solver
 185         my @groups;
 186         my @use_ranks;
 187         my %lacunae;
 188         my $moved = {};
 189         foreach my $rank ( @ranks ) {
 190                 my $missing = $lacunose->clone();
 191                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 192                 # Filter out any empty rankgroups
 193                 # (e.g. from the later rank for a transposition)
 194                 next unless keys %$rankgroup;
 195                 # Get the graph for this rankgroup
 196                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 197                 if( $opts{'exclude_type1'} ) {
 198                         # Check to see whether this is a "useful" group.
 199                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 200                 }
 201                 push( @use_ranks, $rank );
 202                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 203                 $lacunae{$rank} = $missing;
 204         }
 205         # Run the solver
 206         my $answer = solve_variants( $dir, @groups );
 207
 208         # Do further analysis on the answer
 209         my $conflict_count = 0;
 210         my $reversion_count = 0;
 211         foreach my $idx ( 0 .. $#use_ranks ) {
 212                 my $location = $answer->{'variants'}->[$idx];
 213                 # Add the rank back in
 214                 my $rank = $use_ranks[$idx];
 215                 $location->{'id'} = $rank;
 216                 # Note what our lacunae are
 217                 my %lmiss;
 218                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 219                 $location->{'missing'} = [ keys %lmiss ];
 220
 221                 # Run the extra analysis we need.
 222                 ## TODO We run through all the variants in this call, so
 223                 ## why not add the reading data there instead of here below?
 224                 my $graph = $groups[$idx]->{graph};
 225                 analyze_location( $tradition, $graph, $location, \%lmiss );
 226
 227                 my @layerwits;
 228                 # Do the final post-analysis tidying up of the data.
 229                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 230                         $conflict_count++ if $rdghash->{'is_conflict'};
 231                         $reversion_count++ if $rdghash->{'is_reverted'};
 232                         # Add the reading text back in, setting display value as needed
 233                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 234                         if( $rdg ) {
 235                                 $rdghash->{'text'} = $rdg->text .
 236                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 237                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 238                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 239                         }
 240                         # Remove lacunose witnesses from this reading's list now that the
 241                         # analysis is done
 242                         my @realgroup;
 243                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 244                         $rdghash->{'group'} = \@realgroup;
 245                         # Note any layered witnesses that appear in this group
 246                         foreach( @realgroup ) {
 247                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 248                                         push( @layerwits, $1 );
 249                                 }
 250                         }
 251                 }
 252                 $location->{'layerwits'} = \@layerwits if @layerwits;
 253         }
 254         $answer->{'conflict_count'} = $conflict_count;
 255         $answer->{'reversion_count'} = $reversion_count;
 256
 257         return $answer;
 258 }
 259
 260 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 261
 262 Groups the variants at the given $rank of the collation, treating any
 263 relationships in the set $merge_relationship_types as equivalent.
 264 $lacunose should be a reference to an array, to which the sigla of lacunose
 265 witnesses at this rank will be appended; $transposed should be a reference
 266 to a hash, wherein the identities of transposed readings and their
 267 relatives will be stored.
 268
 269 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 270 in $group_readings->{$rdg}.
 271
 272 =cut
 273
 274 # Return group_readings, groups, lacunose
 275 sub group_variants {
 276         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 277         my $c = $tradition->collation;
 278         my $aclabel = $c->ac_label;
 279         my $table = $c->alignment_table;
 280         # Get the alignment table readings
 281         my %readings_at_rank;
 282         my $check_for_gaps = Set::Scalar->new();
 283         my %moved_wits;
 284         my $has_transposition;
 285         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 286                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 287                 my $wit = $tablewit->{'witness'};
 288                 # Exclude the witness if it is "lacunose" which if we got here
 289                 # means "not in the stemma".
 290                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 291                 # Note if the witness is actually in a lacuna
 292                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 293                         _add_to_witlist( $wit, $lacunose, $aclabel );
 294                 # Otherwise the witness either has a positive reading...
 295                 } elsif( $rdg ) {
 296                         # If the reading has been counted elsewhere as a transposition, ignore it.
 297                         if( $transposed->{$rdg->{'t'}->id} ) {
 298                                 # TODO Does this cope with three-way transpositions?
 299                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 300                                 next;
 301                         }
 302                         # Otherwise, record it...
 303                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 304                         # ...and grab any transpositions, and their relations.
 305                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 306                         foreach my $trdg ( @transp ) {
 307                                 next if exists $readings_at_rank{$trdg->id};
 308                                 $has_transposition = 1;
 309                                 my @affected_wits = _table_witnesses(
 310                                         $table, $trdg, $lacunose, $aclabel );
 311                                 next unless @affected_wits;
 312                                 map { $moved_wits{$_} = 1 } @affected_wits;
 313                                 $transposed->{$trdg->id} =
 314                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 315                                 $readings_at_rank{$trdg->id} = $trdg;
 316                         }
 317                 # ...or it is empty, ergo a gap.
 318                 } else {
 319                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 320                 }
 321         }
 322         my $gap_wits = Set::Scalar->new();
 323         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 324                 unless $moved_wits{$_} } $check_for_gaps->members;
 325
 326         # Group the readings, collapsing groups by relationship if needed.
 327         my $grouped_readings = {};
 328         foreach my $rdg ( values %readings_at_rank ) {
 329                 # Skip readings that have been collapsed into others.
 330                 next if exists $grouped_readings->{$rdg->id}
 331                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 332                 # Get the witness list, including from readings collapsed into this one.
 333                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 334                 if( $collapse && $collapse->size ) {
 335                         my $filter = sub { $collapse->has( $_[0]->type ) };
 336                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 337                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 338                                 push( @wits, @otherwits );
 339                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 340                         }
 341                 }
 342                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 343         }
 344         if( $gap_wits->members ) {
 345                 $grouped_readings->{'(omitted)'} = $gap_wits;
 346         }
 347
 348         # Get rid of our collapsed readings
 349         map { delete $grouped_readings->{$_} if(
 350                          $grouped_readings->{$_} eq 'COLLAPSE'
 351                          || $grouped_readings->{$_}->is_empty ) }
 352                 keys %$grouped_readings;
 353
 354         # If something was transposed, check the groups for doubled-up readings
 355         if( $has_transposition ) {
 356                 # print STDERR "Group for rank $rank:\n";
 357                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 358                 #       keys %$grouped_readings;
 359                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 360         }
 361
 362         # Return the result
 363         return $grouped_readings;
 364 }
 365
 366 # Helper function to query the alignment table for all witnesses (a.c. included)
 367 # that have a given reading at its rank.
 368 sub _table_witnesses {
 369         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 370         my $tableidx = $trdg->rank - 1;
 371         my $has_reading = Set::Scalar->new();
 372         foreach my $row ( @{$table->{'alignment'}} ) {
 373                 my $wit = $row->{'witness'};
 374                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 375                 my $rdg = $row->{'tokens'}->[$tableidx];
 376                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 377                 _add_to_witlist( $wit, $has_reading, $aclabel )
 378                         if $rdg->{'t'}->id eq $trdg->id;
 379         }
 380         return $has_reading->members;
 381 }
 382
 383 # Helper function to see if a witness is lacunose even if we are asking about
 384 # the a.c. version
 385 sub _is_lacunose {
 386         my ( $wit, $lac, $acstr ) = @_;
 387         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 388                 $wit = $1;
 389         }
 390         return $lac->has( $wit );
 391 }
 392
 393 # Helper function to ensure that X and X a.c. never appear in the same list.
 394 sub _add_to_witlist {
 395         my( $wit, $list, $acstr ) = @_;
 396         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 397                 # Don't add X a.c. if we already have X
 398                 return if $list->has( $1 );
 399         } else {
 400                 # Delete X a.c. if we are about to add X
 401                 $list->delete( $wit.$acstr );
 402         }
 403         $list->insert( $wit );
 404 }
 405
 406 sub _check_transposed_consistency {
 407         my( $c, $rank, $transposed, $groupings ) = @_;
 408         my %seen_wits;
 409         my %thisrank;
 410         # Note which readings are actually at this rank, and which witnesses
 411         # belong to which reading.
 412         foreach my $rdg ( keys %$groupings ) {
 413                 my $rdgobj = $c->reading( $rdg );
 414                 # Count '(omitted)' as a reading at this rank
 415                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 416                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 417         }
 418         # Our work is done if we have no witness belonging to more than one
 419         # reading.
 420         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 421         return unless @doubled;
 422         # If we have a symmetric related transposition, drop the non-rank readings.
 423         if( @doubled == scalar keys %seen_wits ) {
 424                 foreach my $rdg ( keys %$groupings ) {
 425                         if( !$thisrank{$rdg} ) {
 426                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 427                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 428                                         keys %thisrank;
 429                                 delete $groupings->{$rdg};
 430                                 # If we found a group match, assume there is a symmetry happening.
 431                                 # TODO think more about this
 432                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 433                                 unless( $matched ) {
 434                                         delete $transposed->{$rdg};
 435                                         warn "Found problem in evident symmetry with reading $rdg";
 436                                 }
 437                         }
 438                 }
 439         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 440         } else {
 441                 foreach my $dup ( @doubled ) {
 442                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 443                                 next if $thisrank{$rdg};
 444                                 next unless exists $groupings->{$rdg};
 445                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 446                                 delete $groupings->{$rdg};
 447                                 delete $transposed->{$rdg};
 448                         }
 449                 }
 450                 # and put any now-orphaned readings into an 'omitted' reading.
 451                 foreach my $wit ( keys %seen_wits ) {
 452                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 453                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 454                                          unless exists $groupings->{'(omitted)'};
 455                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 456                         }
 457                 }
 458         }
 459 }
 460
 461 # For the given grouping, return its situation graph based on the stemma.
 462 sub _graph_for_grouping {
 463         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 464         my $acwits = [];
 465         my $extant = {};
 466         foreach my $gs ( values %$grouping ) {
 467                 map {
 468                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 469                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 470                         } else {
 471                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 472                         }
 473                 } $gs->members;
 474         }
 475         my $graph;
 476         try {
 477                 # contig contains all extant wits and all hypothetical wits
 478                 # needed to make up the groups.
 479                 $graph = $stemma->situation_graph( $extant, $acwits );
 480         } catch ( Text::Tradition::Error $e ) {
 481                 die "Could not extend graph with given extant and a.c. witnesses: "
 482                         . $e->message;
 483         } catch {
 484                 die "Could not extend graph with a.c. witnesses @$acwits";
 485         }
 486         return $graph;
 487 }
 488
 489 =head2 solve_variants( $calcdir, @groups )
 490
 491 Looks up the set of groups in the answers provided by the external graph solver
 492 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 493
 494 The JSON has the form
 495   { "graph": [ stemmagraph DOT string without newlines ],
 496     "groupings": [ array of arrays of groups, one per rank ] }
 497
 498 The answer has the form
 499   { "variants" => [ array of variant location structures ],
 500     "variant_count" => total,
 501     "conflict_count" => number of conflicts detected,
 502     "genealogical_count" => number of solutions found }
 503
 504 =cut
 505
 506 sub solve_variants {
 507         my( $dir, @groups ) = @_;
 508
 509         ## For each graph/group combo, look it up in the DB.
 510         ## Witness map is a HACK to get around limitations in node names from IDP
 511         my $witness_map = {};
 512         ## Variables to store answers as they come back
 513         my $variants = [];
 514         my $genealogical = 0; # counter
 515         foreach my $graphproblem ( @groups ) {
 516                 # Initialize the result structure for this graph problem
 517                 my $vstruct = { readings => [] };
 518                 push( @$variants, $vstruct );
 519
 520                 # Construct the calc result key and look up its answer
 521                 my $reqkey = _get_calc_key( $graphproblem );
 522                 my $scope = $dir->new_scope;
 523                 my $answer = $dir->lookup( $reqkey );
 524                 unless( $answer ) {
 525                         #warn "No answer found for graph problem $reqkey, moving on";
 526                         # Record the unsolved problem so that we can go get a solution
 527                         _save_problem( $graphproblem );
 528                         # Put just the request, with no real result, into vstruct
 529                         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 530                                 push( @{$vstruct->{readings}}, { readingid => $rid,
 531                                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 532                         }
 533                         next;
 534                 }
 535
 536                 ## The answer is a Text::Tradition::Analysis::Result containing a bunch
 537                 ## of information about this variant set. Record the information therein.
 538
 539                 # 1. Did the group evaluate as genealogical?
 540                 $vstruct->{genealogical} = $answer->is_genealogical;
 541                 $genealogical++ if $answer->is_genealogical;
 542
 543                 # 2. What are the calculated minimum groupings for each variant loc?
 544                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 545                         my $inputset = $graphproblem->{grouping}->{$rid};
 546                         my $minset = $answer->minimum_grouping_for( $inputset );
 547                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 548                 }
 549
 550                 # 3. What are the sources and classes calculated for each witness?
 551                 $vstruct->{witcopy_types} = { $answer->classes };
 552                 $vstruct->{reading_roots} = {};
 553                 map { $vstruct->{reading_roots}->{$_} = 1 } $answer->sources;
 554
 555         }
 556
 557         # Spit out any unsolved problems we encountered
 558         # _list_unsolved();
 559
 560         return { 'variants' => $variants,
 561                          'variant_count' => scalar @$variants,
 562                          'genealogical_count' => $genealogical };
 563 }
 564
 565 sub _get_calc_key {
 566         my( $graphproblem ) = @_;
 567         my $graph = $graphproblem->{graph};
 568         my $grouping = [ values %{$graphproblem->{grouping}} ];
 569         my $key = Text::Tradition::Analysis::Result::string_from_graph_problem(
 570                 $graph, $grouping );
 571         return md5_hex( encode_utf8( $key ) );
 572 }
 573
 574 sub _save_problem {
 575         my( $graphproblem ) = @_;
 576         my $problem = Text::Tradition::Analysis::Result->new(
 577                 graph => $graphproblem->{graph},
 578                 setlist => [ values %{$graphproblem->{grouping}} ]
 579         );
 580         my $key = _get_calc_key( $graphproblem );
 581         my( $str ) = $problem->problem_json;
 582         say STDERR "Stashing unsolved problem $str at key $key";
 583         $unsolved_problems->{$key} = $problem;
 584 }
 585
 586 sub _list_unsolved {
 587         #say STDERR "Problems needing a solution:";
 588         my @problems = values %$unsolved_problems;
 589         return unless @problems;
 590         my $first = shift @problems;
 591         map { say STDERR $_ } $first->problem_json( @problems );
 592 }
 593
 594 =head2 analyze_location ( $tradition, $graph, $location_hash )
 595
 596 Given the tradition, its stemma graph, and the solution from the graph solver,
 597 work out the rest of the information we want.  For each reading we need missing,
 598 conflict, reading_parents, independent_occurrence, followed, not_followed,
 599 and follow_unknown.  Alters the location_hash in place.
 600
 601 =cut
 602
 603 sub analyze_location {
 604         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 605         my $c = $tradition->collation;
 606
 607         # Make a hash of all known node memberships, and make the subgraphs.
 608         my $contig = {};
 609         my $reading_roots = {};
 610         my $subgraph = {};
 611         my $acstr = $c->ac_label;
 612         my @acwits;
 613
 614         my $NO_IDP;
 615         if( exists $variant_row->{'reading_roots'} ) {
 616                 $reading_roots = delete $variant_row->{'reading_roots'};
 617         } else {
 618                 warn "No reading source information from IDP - proceed at your own risk";
 619                 $NO_IDP = 1;
 620         }
 621         my $classinfo = delete $variant_row->{'witcopy_types'};
 622
 623         # Note which witnesses positively belong to which group. This information
 624         # comes ultimately from the IDP solver.
 625         # Also make a note of the reading's roots.
 626     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 627         my $rid = $rdghash->{'readingid'};
 628         my @roots;
 629         foreach my $wit ( @{$rdghash->{'group'}} ) {
 630                 $contig->{$wit} = $rid;
 631             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 632                 push( @acwits, $1 );
 633             }
 634             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 635                 push( @roots, $wit );
 636             }
 637         }
 638                 $rdghash->{'independent_occurrence'} = \@roots;
 639         }
 640
 641         # Now that we have all the node group memberships, calculate followed/
 642     # non-followed/unknown values for each reading.  Also figure out the
 643     # reading's evident parent(s).
 644     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 645         my $rid = $rdghash->{'readingid'};
 646         my $rdg = $c->reading( $rid );
 647         my @roots = @{$rdghash->{'independent_occurrence'}};
 648         my @reversions;
 649         if( $classinfo ) {
 650                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 651                         $rdghash->{'group'}->members;
 652         }
 653         my @group = @{$rdghash->{'group'}};
 654
 655         # Start figuring things out.
 656         $rdghash->{'followed'} = scalar( @group )
 657                 - ( scalar( @roots ) + scalar( @reversions ) );
 658         # Find the parent readings, if any, of this reading.
 659         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 660                 # Work out relationships between readings and their non-followed parent.
 661                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 662                 $rdghash->{'reading_parents'} = $sourceparents;
 663
 664                 if( @reversions ) {
 665                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 666                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 667                         $rdghash->{'reversion_parents'} = $revparents;
 668                 }
 669
 670                 # Find the number of times this reading was altered, and the number of
 671                 # times we're not sure.
 672                 my( %nofollow, %unknownfollow );
 673                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 674                         foreach my $wchild ( $graph->successors( $wit ) ) {
 675                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 676                                         && $contig->{$wchild} ne $rid ) {
 677                                         # It definitely changed here.
 678                                         $nofollow{$wchild} = 1;
 679                                 } elsif( !($contig->{$wchild}) ) {
 680                                         # The child is a hypothetical node not definitely in
 681                                         # any group. Answer is unknown.
 682                                         $unknownfollow{$wchild} = 1;
 683                                 } # else it is either in our group, or it is a non-root node in a
 684                                   # known group and therefore is presumed to have its reading from
 685                                   # its group, not this link.
 686                         }
 687                 }
 688                 $rdghash->{'not_followed'} = keys %nofollow;
 689                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 690
 691                 # Now say whether this reading represents a conflict.
 692                 unless( $variant_row->{'genealogical'} ) {
 693                         $rdghash->{'is_conflict'} = @roots != 1;
 694                         $rdghash->{'is_reverted'} = scalar @reversions;
 695                 }
 696     }
 697 }
 698
 699 sub _find_reading_parents {
 700         my( $rid, $graph, $contig, @list ) = @_;
 701         my $parenthash = {};
 702         foreach my $wit ( @list ) {
 703                 # Look in the stemma graph to find this witness's extant or known-reading
 704                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 705                 my @check = $graph->predecessors( $wit );
 706                 while( @check ) {
 707                         my @next;
 708                         foreach my $wparent( @check ) {
 709                                 my $preading = $contig->{$wparent};
 710                                 if( $preading && $preading ne $rid ) {
 711                                         $parenthash->{$preading} = 1;
 712                                 } else {
 713                                         push( @next, $graph->predecessors( $wparent ) );
 714                                 }
 715                         }
 716                         @check = @next;
 717                 }
 718         }
 719         return $parenthash;
 720 }
 721
 722 sub _resolve_parent_relationships {
 723         my( $c, $rid, $rdg, $rdgparents ) = @_;
 724         foreach my $p ( keys %$rdgparents ) {
 725                 # Resolve the relationship of the parent to the reading, and
 726                 # save it in our hash.
 727                 my $pobj = $c->reading( $p );
 728                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 729                 my $phash = { 'label' => $prep };
 730                 if( $pobj ) {
 731                         my $rel = $c->get_relationship( $p, $rid );
 732                         if( $rel ) {
 733                                 _add_to_hash( $rel, $phash );
 734                         } elsif( $rdg ) {
 735                                 # First check for a transposed relationship
 736                                 if( $rdg->rank != $pobj->rank ) {
 737                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 738                                                 next unless $ti->text eq $rdg->text;
 739                                                 $rel = $c->get_relationship( $ti, $pobj );
 740                                                 if( $rel ) {
 741                                                         _add_to_hash( $rel, $phash, 1 );
 742                                                         last;
 743                                                 }
 744                                         }
 745                                         unless( $rel ) {
 746                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 747                                                         next unless $ti->text eq $pobj->text;
 748                                                         $rel = $c->get_relationship( $ti, $rdg );
 749                                                         if( $rel ) {
 750                                                                 _add_to_hash( $rel, $phash, 1 );
 751                                                                 last;
 752                                                         }
 753                                                 }
 754                                         }
 755                                 }
 756                                 unless( $rel ) {
 757                                         # and then check for sheer word similarity.
 758                                         my $rtext = $rdg->text;
 759                                         my $ptext = $pobj->text;
 760                                         if( similar( $rtext, $ptext ) ) {
 761                                                 # say STDERR "Words $rtext and $ptext judged similar";
 762                                                 $phash->{relation} = { type => 'wordsimilar' };
 763                                         }
 764                                 }
 765                         } else {
 766                                 $phash->{relation} = { type => 'deletion' };
 767                         }
 768                         # Get the attributes of the parent object while we are here
 769                         $phash->{'text'} = $pobj->text if $pobj;
 770                         $phash->{'is_nonsense'} = $pobj->is_nonsense;
 771                         $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 772                 } elsif( $p eq '(omitted)' ) {
 773                         $phash->{relation} = { type => 'addition' };
 774                 }
 775                 # Save it
 776                 $rdgparents->{$p} = $phash;
 777         }
 778 }
 779
 780 sub _add_to_hash {
 781         my( $rel, $phash, $is_transposed ) = @_;
 782         $phash->{relation} = { type => $rel->type };
 783         $phash->{relation}->{transposed} = 1 if $is_transposed;
 784         $phash->{relation}->{annotation} = $rel->annotation
 785                 if $rel->has_annotation;
 786 }
 787
 788 =head2 similar( $word1, $word2 )
 789
 790 Use Algorithm::Diff to get a sense of how close the words are to each other.
 791 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 792
 793 =cut
 794
 795 #!/usr/bin/env perl
 796
 797 sub similar {
 798         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 799         my @let1 = split( '', lc( $word1 ) );
 800         my @let2 = split( '', lc( $word2 ) );
 801         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 802         my $mag = 0;
 803         while( $diff->Next ) {
 804                 if( $diff->Same ) {
 805                         # Take off points for longer strings
 806                         my $cs = $diff->Range(1) - 2;
 807                         $cs = 0 if $cs < 0;
 808                         $mag -= $cs;
 809                 } elsif( !$diff->Items(1) ) {
 810                         $mag += $diff->Range(2);
 811                 } elsif( !$diff->Items(2) ) {
 812                         $mag += $diff->Range(1);
 813                 } else {
 814                         # Split the difference for substitutions
 815                         my $c1 = $diff->Range(1) || 1;
 816                         my $c2 = $diff->Range(2) || 1;
 817                         my $cd = ( $c1 + $c2 ) / 2;
 818                         $mag += $cd;
 819                 }
 820         }
 821         return ( $mag <= length( $word1 ) / 2 );
 822 }
 823
 824 sub _prune_group {
 825         my( $group, $graph ) = @_;
 826         my $relevant = {};
 827         # Record the existence of the vertices in the group
 828         map { $relevant->{$_} = 1 } @$group;
 829         # Make our subgraph
 830         my $subgraph = $graph->deep_copy;
 831         map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
 832                 $subgraph->vertices;
 833         # Now prune and return the remaining vertices.
 834         _prune_subtree( $subgraph );
 835         # Return the list of vertices and the list of roots.
 836         my $pruned_group = [ sort $subgraph->vertices ];
 837         my $pruned_roots = [ $subgraph->predecessorless_vertices ];
 838         return( $pruned_group, $pruned_roots );
 839 }
 840
 841 sub _prune_subtree {
 842         my( $tree ) = @_;
 843
 844         # Delete lacunose witnesses that have no successors
 845         my @orphan_hypotheticals;
 846         my $ctr = 0;
 847         do {
 848                 die "Infinite loop on leaves" if $ctr > 100;
 849                 @orphan_hypotheticals =
 850                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
 851                                 $tree->successorless_vertices;
 852                 $tree->delete_vertices( @orphan_hypotheticals );
 853                 $ctr++;
 854         } while( @orphan_hypotheticals );
 855
 856         # Delete lacunose roots that have a single successor
 857         my @redundant_root;
 858         $ctr = 0;
 859         do {
 860                 die "Infinite loop on roots" if $ctr > 100;
 861                 @redundant_root =
 862                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
 863                                    && $tree->successors( $_ ) == 1 }
 864                                 $tree->predecessorless_vertices;
 865                 $tree->delete_vertices( @redundant_root );
 866                 $ctr++;
 867         } while( @redundant_root );
 868 }
 869
 870 sub _useful_variant {
 871         my( $rankgroup, $rankgraph, $acstr ) = @_;
 872
 873         # Sort by group size and return
 874         my $is_useful = 0;
 875         foreach my $rdg ( keys %$rankgroup ) {
 876                 my @wits = $rankgroup->{$rdg}->members;
 877                 if( @wits > 1 ) {
 878                         $is_useful++;
 879                 } else {
 880                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 881                                 || $wits[0] =~ /\Q$acstr\E$/ );
 882                 }
 883         }
 884         return $is_useful > 1;
 885 }
 886
 887 =head2 wit_stringify( $groups )
 888
 889 Takes an array of witness groupings and produces a string like
 890 ['A','B'] / ['C','D','E'] / ['F']
 891
 892 =cut
 893
 894 sub wit_stringify {
 895     my $groups = shift;
 896     my @gst;
 897     # If we were passed an array of witnesses instead of an array of
 898     # groupings, then "group" the witnesses first.
 899     unless( ref( $groups->[0] ) ) {
 900         my $mkgrp = [ $groups ];
 901         $groups = $mkgrp;
 902     }
 903     foreach my $g ( @$groups ) {
 904         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 905     }
 906     return join( ' / ', @gst );
 907 }
 908
 909 1;
 910
 911 =head1 LICENSE
 912
 913 This package is free software and is provided "as is" without express
 914 or implied warranty.  You can redistribute it and/or modify it under
 915 the same terms as Perl itself.
 916
 917 =head1 AUTHOR
 918
 919 Tara L Andrews E<lt>aurum@cpan.orgE<gt>