lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Benchmark;
   7 use Digest::MD5 qw/ md5_hex /;
   8 use Encode qw/ encode_utf8 /;
   9 use Exporter 'import';
  10 use Graph;
  11 use JSON qw/ to_json /;
  12 use Set::Scalar;
  13 use Text::Tradition;
  14 use Text::Tradition::Analysis::Result;
  15 use Text::Tradition::Directory;
  16 use Text::Tradition::Stemma;
  17 use TryCatch;
  18
  19 use vars qw/ @EXPORT_OK /;
  20 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  21
  22 my $unsolved_problems = {};
  23
  24 =head1 NAME
  25
  26 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  27
  28 =head1 SYNOPSIS
  29
  30   use Text::Tradition;
  31   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  32   my $t = Text::Tradition->new(
  33     'name' => 'this is a text',
  34     'input' => 'TEI',
  35     'file' => '/path/to/tei_parallel_seg_file.xml' );
  36   $t->add_stemma( 'dotfile' => $stemmafile );
  37
  38   my $variant_data = run_analysis( $tradition );
  39   # Recalculate rank $n treating all orthographic variants as equivalent
  40   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  41
  42 =head1 DESCRIPTION
  43
  44 Text::Tradition is a library for representation and analysis of collated
  45 texts, particularly medieval ones.  The Collation is the central feature of
  46 a Tradition, where the text, its sequence of readings, and its relationships
  47 between readings are actually kept.
  48
  49 =head1 SUBROUTINES
  50
  51 =head2 run_analysis( $tradition, %opts )
  52
  53 Runs the analysis described in analyze_variant_location on every location in the
  54 collation of the given tradition, with the given options. These include:
  55
  56 =over 4
  57
  58 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  59 is 0 (i.e. the first).
  60
  61 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  62
  63 =item * merge_types - Specify a list of relationship types, where related readings
  64 should be treated as identical for the purposes of analysis.
  65
  66 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  67
  68 =back
  69
  70 =begin testing
  71
  72 use Text::Tradition;
  73 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  74
  75 my $datafile = 't/data/florilegium_tei_ps.xml';
  76 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  77                                       'name' => 'test0',
  78                                       'file' => $datafile );
  79 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  80 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  81
  82 my %expected_genealogical = (
  83         1 => 0,
  84         2 => 1,
  85         3 =>  0,
  86         5 =>  0,
  87         7 =>  0,
  88         8 =>  0,
  89         10 => 0,
  90         13 => 1,
  91         33 => 0,
  92         34 => 0,
  93         37 => 0,
  94         60 => 0,
  95         81 => 1,
  96         84 => 0,
  97         87 => 0,
  98         101 => 0,
  99         102 => 0,
 100         122 => 1,
 101         157 => 0,
 102         166 => 1,
 103         169 => 1,
 104         200 => 0,
 105         216 => 1,
 106         217 => 1,
 107         219 => 1,
 108         241 => 1,
 109         242 => 1,
 110         243 => 1,
 111 );
 112
 113 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 114 my $c = $tradition->collation;
 115 foreach my $row ( @{$data->{'variants'}} ) {
 116         # Account for rows that used to be "not useful"
 117         unless( exists $expected_genealogical{$row->{'id'}} ) {
 118                 $expected_genealogical{$row->{'id'}} = 1;
 119         }
 120         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 121         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 122                 "Got correct genealogical flag for row " . $row->{'id'} );
 123         # Check that we have the right row with the right groups
 124         my $rank = $row->{'id'};
 125         foreach my $rdghash ( @{$row->{'readings'}} ) {
 126                 # Skip 'readings' that aren't really
 127                 next unless $c->reading( $rdghash->{'readingid'} );
 128                 # Check the rank
 129                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 130                         "Got correct reading rank" );
 131                 # Check the witnesses
 132                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 133                 my @sgrp = sort @{$rdghash->{'group'}};
 134                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 135         }
 136 }
 137 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 138 # TODO Make something meaningful of conflict count, maybe test other bits
 139
 140 =end testing
 141
 142 =cut
 143
 144 sub run_analysis {
 145         my( $tradition, %opts ) = @_;
 146         my $c = $tradition->collation;
 147         my $aclabel = $c->ac_label;
 148
 149         my $stemma_id = $opts{'stemma_id'} || 0;
 150         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 151         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 152
 153         # Make sure we have a lookup DB for graph calculation results; this will die
 154         # if calcdir or calcdsn isn't passed.
 155         my $dir = $opts{'calcdir'} ? delete $opts{'calcdir'}
 156                 : Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 157
 158         # Get the stemma
 159         my $stemma = $tradition->stemma( $stemma_id );
 160
 161         # Figure out which witnesses we are working with - that is, the ones that
 162         # appear both in the stemma and in the tradition. All others are 'lacunose'
 163         # for our purposes.
 164         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 165         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 166         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 167         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 168
 169         # Find and mark 'common' ranks for exclusion, unless they were
 170         # explicitly specified.
 171         unless( @ranks ) {
 172                 my %common_rank;
 173                 foreach my $rdg ( $c->common_readings ) {
 174                         $common_rank{$rdg->rank} = 1;
 175                 }
 176                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 177         }
 178
 179         # Group the variants to send to the solver
 180         my @groups;
 181         my @use_ranks;
 182         my %lacunae;
 183         my $moved = {};
 184         foreach my $rank ( @ranks ) {
 185                 my $missing = $lacunose->clone();
 186                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
 187                 # Filter out any empty rankgroups
 188                 # (e.g. from the later rank for a transposition)
 189                 next unless keys %$rankgroup;
 190                 # Get the graph for this rankgroup
 191                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 192                 if( $opts{'exclude_type1'} ) {
 193                         # Check to see whether this is a "useful" group.
 194                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 195                 }
 196                 push( @use_ranks, $rank );
 197                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 198                 $lacunae{$rank} = $missing;
 199         }
 200         # Run the solver
 201         my $answer = solve_variants( $dir, @groups );
 202
 203         # Do further analysis on the answer
 204         my $conflict_count = 0;
 205         my $reversion_count = 0;
 206         foreach my $idx ( 0 .. $#use_ranks ) {
 207                 my $location = $answer->{'variants'}->[$idx];
 208                 # Add the rank back in
 209                 my $rank = $use_ranks[$idx];
 210                 $location->{'id'} = $rank;
 211                 # Note what our lacunae are
 212                 my %lmiss;
 213                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 214                 $location->{'missing'} = [ keys %lmiss ];
 215
 216                 # Run the extra analysis we need.
 217                 ## TODO We run through all the variants in this call, so
 218                 ## why not add the reading data there instead of here below?
 219                 my $graph = $groups[$idx]->{graph};
 220                 analyze_location( $tradition, $graph, $location, \%lmiss );
 221
 222                 my @layerwits;
 223                 # Do the final post-analysis tidying up of the data.
 224                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 225                         $conflict_count++
 226                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 227                         $reversion_count++ if $rdghash->{'reverted'};
 228                         # Add the reading text back in, setting display value as needed
 229                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 230                         if( $rdg ) {
 231                                 $rdghash->{'text'} = $rdg->text .
 232                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 233                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 234                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 235                         }
 236                         # Remove lacunose witnesses from this reading's list now that the
 237                         # analysis is done
 238                         my @realgroup;
 239                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 240                         $rdghash->{'group'} = \@realgroup;
 241                         # Note any layered witnesses that appear in this group
 242                         foreach( @realgroup ) {
 243                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 244                                         push( @layerwits, $1 );
 245                                 }
 246                         }
 247                 }
 248                 $location->{'layerwits'} = \@layerwits if @layerwits;
 249         }
 250         $answer->{'conflict_count'} = $conflict_count;
 251         $answer->{'reversion_count'} = $reversion_count;
 252
 253         return $answer;
 254 }
 255
 256 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 257
 258 Groups the variants at the given $rank of the collation, treating any
 259 relationships in @merge_relationship_types as equivalent.  $lacunose should
 260 be a reference to an array, to which the sigla of lacunose witnesses at this
 261 rank will be appended; $transposed should be a reference to a hash, wherein
 262 the identities of transposed readings and their relatives will be stored.
 263
 264 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 265 in $group_readings->{$rdg}.
 266
 267 =cut
 268
 269 # Return group_readings, groups, lacunose
 270 sub group_variants {
 271         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 272         my $c = $tradition->collation;
 273         my $aclabel = $c->ac_label;
 274         my $table = $c->alignment_table;
 275         # Get the alignment table readings
 276         my %readings_at_rank;
 277         my $check_for_gaps = Set::Scalar->new();
 278         my %moved_wits;
 279         my $has_transposition;
 280         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 281                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 282                 my $wit = $tablewit->{'witness'};
 283                 # Exclude the witness if it is "lacunose" which if we got here
 284                 # means "not in the stemma".
 285                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 286                 # Note if the witness is actually in a lacuna
 287                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 288                         _add_to_witlist( $wit, $lacunose, $aclabel );
 289                 # Otherwise the witness either has a positive reading...
 290                 } elsif( $rdg ) {
 291                         # If the reading has been counted elsewhere as a transposition, ignore it.
 292                         if( $transposed->{$rdg->{'t'}->id} ) {
 293                                 # TODO Does this cope with three-way transpositions?
 294                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 295                                 next;
 296                         }
 297                         # Otherwise, record it...
 298                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 299                         # ...and grab any transpositions, and their relations.
 300                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 301                         foreach my $trdg ( @transp ) {
 302                                 next if exists $readings_at_rank{$trdg->id};
 303                                 $has_transposition = 1;
 304                                 my @affected_wits = _table_witnesses(
 305                                         $table, $trdg, $lacunose, $aclabel );
 306                                 next unless @affected_wits;
 307                                 map { $moved_wits{$_} = 1 } @affected_wits;
 308                                 $transposed->{$trdg->id} =
 309                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 310                                 $readings_at_rank{$trdg->id} = $trdg;
 311                         }
 312                 # ...or it is empty, ergo a gap.
 313                 } else {
 314                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 315                 }
 316         }
 317         my $gap_wits = Set::Scalar->new();
 318         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 319                 unless $moved_wits{$_} } $check_for_gaps->members;
 320
 321         # Group the readings, collapsing groups by relationship if needed.
 322         my $grouped_readings = {};
 323         foreach my $rdg ( values %readings_at_rank ) {
 324                 # Skip readings that have been collapsed into others.
 325                 next if exists $grouped_readings->{$rdg->id}
 326                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 327                 # Get the witness list, including from readings collapsed into this one.
 328                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 329                 if( $collapse && @$collapse ) {
 330                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 331                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 332                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 333                                 push( @wits, @otherwits );
 334                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 335                         }
 336                 }
 337                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 338         }
 339         if( $gap_wits->members ) {
 340                 $grouped_readings->{'(omitted)'} = $gap_wits;
 341         }
 342
 343         # Get rid of our collapsed readings
 344         map { delete $grouped_readings->{$_} if(
 345                          $grouped_readings->{$_} eq 'COLLAPSE'
 346                          || $grouped_readings->{$_}->is_empty ) }
 347                 keys %$grouped_readings;
 348
 349         # If something was transposed, check the groups for doubled-up readings
 350         if( $has_transposition ) {
 351                 # print STDERR "Group for rank $rank:\n";
 352                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 353                 #       keys %$grouped_readings;
 354                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 355         }
 356
 357         # Return the result
 358         return $grouped_readings;
 359 }
 360
 361 # Helper function to query the alignment table for all witnesses (a.c. included)
 362 # that have a given reading at its rank.
 363 sub _table_witnesses {
 364         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 365         my $tableidx = $trdg->rank - 1;
 366         my $has_reading = Set::Scalar->new();
 367         foreach my $row ( @{$table->{'alignment'}} ) {
 368                 my $wit = $row->{'witness'};
 369                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 370                 my $rdg = $row->{'tokens'}->[$tableidx];
 371                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 372                 _add_to_witlist( $wit, $has_reading, $aclabel )
 373                         if $rdg->{'t'}->id eq $trdg->id;
 374         }
 375         return $has_reading->members;
 376 }
 377
 378 # Helper function to see if a witness is lacunose even if we are asking about
 379 # the a.c. version
 380 sub _is_lacunose {
 381         my ( $wit, $lac, $acstr ) = @_;
 382         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 383                 $wit = $1;
 384         }
 385         return $lac->has( $wit );
 386 }
 387
 388 # Helper function to ensure that X and X a.c. never appear in the same list.
 389 sub _add_to_witlist {
 390         my( $wit, $list, $acstr ) = @_;
 391         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 392                 # Don't add X a.c. if we already have X
 393                 return if $list->has( $1 );
 394         } else {
 395                 # Delete X a.c. if we are about to add X
 396                 $list->delete( $wit.$acstr );
 397         }
 398         $list->insert( $wit );
 399 }
 400
 401 sub _check_transposed_consistency {
 402         my( $c, $rank, $transposed, $groupings ) = @_;
 403         my %seen_wits;
 404         my %thisrank;
 405         # Note which readings are actually at this rank, and which witnesses
 406         # belong to which reading.
 407         foreach my $rdg ( keys %$groupings ) {
 408                 my $rdgobj = $c->reading( $rdg );
 409                 # Count '(omitted)' as a reading at this rank
 410                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 411                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 412         }
 413         # Our work is done if we have no witness belonging to more than one
 414         # reading.
 415         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 416         return unless @doubled;
 417         # If we have a symmetric related transposition, drop the non-rank readings.
 418         if( @doubled == scalar keys %seen_wits ) {
 419                 foreach my $rdg ( keys %$groupings ) {
 420                         if( !$thisrank{$rdg} ) {
 421                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 422                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 423                                         keys %thisrank;
 424                                 delete $groupings->{$rdg};
 425                                 # If we found a group match, assume there is a symmetry happening.
 426                                 # TODO think more about this
 427                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 428                                 unless( $matched ) {
 429                                         delete $transposed->{$rdg};
 430                                         warn "Found problem in evident symmetry with reading $rdg";
 431                                 }
 432                         }
 433                 }
 434         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 435         } else {
 436                 foreach my $dup ( @doubled ) {
 437                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 438                                 next if $thisrank{$rdg};
 439                                 next unless exists $groupings->{$rdg};
 440                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 441                                 delete $groupings->{$rdg};
 442                                 delete $transposed->{$rdg};
 443                         }
 444                 }
 445                 # and put any now-orphaned readings into an 'omitted' reading.
 446                 foreach my $wit ( keys %seen_wits ) {
 447                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 448                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 449                                          unless exists $groupings->{'(omitted)'};
 450                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 451                         }
 452                 }
 453         }
 454 }
 455
 456 # For the given grouping, return its situation graph based on the stemma.
 457 sub _graph_for_grouping {
 458         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 459         my $acwits = [];
 460         my $extant = {};
 461         foreach my $gs ( values %$grouping ) {
 462                 map {
 463                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 464                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 465                         } else {
 466                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 467                         }
 468                 } $gs->members;
 469         }
 470         my $graph;
 471         try {
 472                 # contig contains all extant wits and all hypothetical wits
 473                 # needed to make up the groups.
 474                 $graph = $stemma->situation_graph( $extant, $acwits );
 475         } catch ( Text::Tradition::Error $e ) {
 476                 die "Could not extend graph with given extant and a.c. witnesses: "
 477                         . $e->message;
 478         } catch {
 479                 die "Could not extend graph with a.c. witnesses @$acwits";
 480         }
 481         return $graph;
 482 }
 483
 484 =head2 solve_variants( $calcdir, @groups )
 485
 486 Looks up the set of groups in the answers provided by the external graph solver
 487 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 488
 489 The JSON has the form
 490   { "graph": [ stemmagraph DOT string without newlines ],
 491     "groupings": [ array of arrays of groups, one per rank ] }
 492
 493 The answer has the form
 494   { "variants" => [ array of variant location structures ],
 495     "variant_count" => total,
 496     "conflict_count" => number of conflicts detected,
 497     "genealogical_count" => number of solutions found }
 498
 499 =cut
 500
 501 sub solve_variants {
 502         my( $dir, @groups ) = @_;
 503
 504         ## For each graph/group combo, look it up in the DB.
 505         ## Witness map is a HACK to get around limitations in node names from IDP
 506         my $witness_map = {};
 507         ## Variables to store answers as they come back
 508         my $variants = [];
 509         my $genealogical = 0; # counter
 510         foreach my $graphproblem ( @groups ) {
 511                 # Initialize the result structure for this graph problem
 512                 my $vstruct = { readings => [] };
 513                 push( @$variants, $vstruct );
 514
 515                 # Construct the calc result key and look up its answer
 516                 my $reqkey = _get_calc_key( $graphproblem );
 517                 my $scope = $dir->new_scope;
 518                 my $answer = $dir->lookup( $reqkey );
 519                 unless( $answer ) {
 520                         #warn "No answer found for graph problem $reqkey, moving on";
 521                         # Record the unsolved problem so that we can go get a solution
 522                         _save_problem( $graphproblem );
 523                         # Put just the request, with no real result, into vstruct
 524                         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 525                                 push( @{$vstruct->{readings}}, { readingid => $rid,
 526                                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 527                         }
 528                         next;
 529                 }
 530
 531                 ## The answer is a Text::Tradition::Analysis::Result containing a bunch
 532                 ## of information about this variant set. Record the information therein.
 533
 534                 # 1. Did the group evaluate as genealogical?
 535                 $vstruct->{genealogical} = $answer->is_genealogical;
 536                 $genealogical++ if $answer->is_genealogical;
 537
 538                 # 2. What are the calculated minimum groupings for each variant loc?
 539                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 540                         my $inputset = $graphproblem->{grouping}->{$rid};
 541                         my $minset = $answer->minimum_grouping_for( $inputset );
 542                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 543                 }
 544
 545                 # 3. What are the sources and classes calculated for each witness?
 546                 $vstruct->{witcopy_types} = { $answer->classes };
 547                 $vstruct->{reading_roots} = {};
 548                 map { $vstruct->{reading_roots}->{$_} = 1 } $answer->sources;
 549
 550         }
 551
 552         # Spit out any unsolved problems we encountered
 553         _list_unsolved();
 554
 555         return { 'variants' => $variants,
 556                          'variant_count' => scalar @$variants,
 557                          'genealogical_count' => $genealogical };
 558 }
 559
 560 sub _get_calc_key {
 561         my( $graphproblem ) = @_;
 562         my $graph = $graphproblem->{graph};
 563         my $grouping = [ values %{$graphproblem->{grouping}} ];
 564         my $key = Text::Tradition::Analysis::Result::string_from_graph_problem(
 565                 $graph, $grouping );
 566         return md5_hex( encode_utf8( $key ) );
 567 }
 568
 569 sub _save_problem {
 570         my( $graphproblem ) = @_;
 571         my $graphstr = Text::Tradition::Stemma::editable_graph(
 572                 $graphproblem->{graph}, { 'linesep' => ' ' } );
 573         unless( exists $unsolved_problems->{$graphstr} ) {
 574                 $unsolved_problems->{$graphstr} = {};
 575         }
 576         my $grouping = [];
 577         foreach my $set ( sort { Text::Tradition::Analysis::Result::by_size_and_alpha( $a, $b ) } values %{$graphproblem->{grouping}} ) {
 578                 push( @$grouping, [ sort $set->members ] );
 579         }
 580         $unsolved_problems->{$graphstr}->{wit_stringify( $grouping )} = $grouping;
 581 }
 582
 583 sub _list_unsolved {
 584         #say STDERR "Problems needing a solution:";
 585         foreach my $graphstr ( keys %$unsolved_problems ) {
 586                 my $struct = { graph => $graphstr, groupings => [] };
 587                 foreach my $gp ( values %{$unsolved_problems->{$graphstr}} ) {
 588                         push( @{$struct->{groupings}}, $gp );
 589                 }
 590                 my $json = to_json( $struct );
 591                 say STDERR "$json";
 592         }
 593 }
 594
 595 =head2 analyze_location ( $tradition, $graph, $location_hash )
 596
 597 Given the tradition, its stemma graph, and the solution from the graph solver,
 598 work out the rest of the information we want.  For each reading we need missing,
 599 conflict, reading_parents, independent_occurrence, followed, not_followed,
 600 and follow_unknown.  Alters the location_hash in place.
 601
 602 =cut
 603
 604 sub analyze_location {
 605         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 606         my $c = $tradition->collation;
 607
 608         # Make a hash of all known node memberships, and make the subgraphs.
 609         my $contig = {};
 610         my $reading_roots = {};
 611         my $subgraph = {};
 612         my $acstr = $c->ac_label;
 613         my @acwits;
 614
 615         my $NO_IDP;
 616         if( exists $variant_row->{'reading_roots'} ) {
 617                 $reading_roots = delete $variant_row->{'reading_roots'};
 618         } else {
 619                 warn "No reading source information from IDP - proceed at your own risk";
 620                 $NO_IDP = 1;
 621         }
 622         my $classinfo = delete $variant_row->{'witcopy_types'};
 623
 624         # Note which witnesses positively belong to which group. This information
 625         # comes ultimately from the IDP solver.
 626         # Also make a note of the reading's roots.
 627     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 628         my $rid = $rdghash->{'readingid'};
 629         my @roots;
 630         foreach my $wit ( @{$rdghash->{'group'}} ) {
 631                 $contig->{$wit} = $rid;
 632             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 633                 push( @acwits, $1 );
 634             }
 635             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 636                 push( @roots, $wit );
 637             }
 638         }
 639                 $rdghash->{'independent_occurrence'} = \@roots;
 640         }
 641
 642         # Now that we have all the node group memberships, calculate followed/
 643     # non-followed/unknown values for each reading.  Also figure out the
 644     # reading's evident parent(s).
 645     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 646         my $rid = $rdghash->{'readingid'};
 647         my $rdg = $c->reading( $rid );
 648         my @roots = @{$rdghash->{'independent_occurrence'}};
 649         my @reversions;
 650         if( $classinfo ) {
 651                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 652                         $rdghash->{'group'}->members;
 653         }
 654         my @group = @{$rdghash->{'group'}};
 655
 656         # Start figuring things out.
 657         $rdghash->{'followed'} = scalar( @group )
 658                 - ( scalar( @roots ) + scalar( @reversions ) );
 659         # Find the parent readings, if any, of this reading.
 660         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 661         my $revertparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 662                 # Work out relationships between readings and their non-followed parent.
 663                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 664                 _resolve_parent_relationships( $c, $rid, $rdg, $revertparents );
 665
 666                 $rdghash->{'reading_parents'} = $sourceparents;
 667                 $rdghash->{'reversion_parents'} = $revertparents;
 668
 669                 # Find the number of times this reading was altered, and the number of
 670                 # times we're not sure.
 671                 my( %nofollow, %unknownfollow );
 672                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 673                         foreach my $wchild ( $graph->successors( $wit ) ) {
 674                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 675                                         && $contig->{$wchild} ne $rid ) {
 676                                         # It definitely changed here.
 677                                         $nofollow{$wchild} = 1;
 678                                 } elsif( !($contig->{$wchild}) ) {
 679                                         # The child is a hypothetical node not definitely in
 680                                         # any group. Answer is unknown.
 681                                         $unknownfollow{$wchild} = 1;
 682                                 } # else it is either in our group, or it is a non-root node in a
 683                                   # known group and therefore is presumed to have its reading from
 684                                   # its group, not this link.
 685                         }
 686                 }
 687                 $rdghash->{'not_followed'} = keys %nofollow;
 688                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 689
 690                 # Now say whether this reading represents a conflict.
 691                 unless( $variant_row->{'genealogical'} ) {
 692                         my @reversions;
 693                         if( $classinfo ) {
 694                                 # We have tested for reversions. Use the information.
 695                                 @reversions =
 696                                 $rdghash->{'reversions'} = \@reversions if @reversions;
 697                         }
 698                         $rdghash->{'is_conflict'} = @roots != 1;
 699                         $rdghash->{'is_reverted'} = !!@reversions;
 700                 }
 701     }
 702 }
 703
 704 sub _find_reading_parents {
 705         my( $rid, $graph, $contig, @list ) = @_;
 706         my $parenthash = {};
 707         foreach my $wit ( @list ) {
 708                 # Look in the stemma graph to find this witness's extant or known-reading
 709                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 710                 my @check = $graph->predecessors( $wit );
 711                 while( @check ) {
 712                         my @next;
 713                         foreach my $wparent( @check ) {
 714                                 my $preading = $contig->{$wparent};
 715                                 if( $preading && $preading ne $rid ) {
 716                                         $parenthash->{$preading} = 1;
 717                                 } else {
 718                                         push( @next, $graph->predecessors( $wparent ) );
 719                                 }
 720                         }
 721                         @check = @next;
 722                 }
 723         }
 724         return $parenthash;
 725 }
 726
 727 sub _resolve_parent_relationships {
 728         my( $c, $rid, $rdg, $rdgparents ) = @_;
 729         foreach my $p ( keys %$rdgparents ) {
 730                 # Resolve the relationship of the parent to the reading, and
 731                 # save it in our hash.
 732                 my $pobj = $c->reading( $p );
 733                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 734                 my $phash = { 'label' => $prep };
 735                 if( $pobj ) {
 736                         my $rel = $c->get_relationship( $p, $rid );
 737                         if( $rel ) {
 738                                 _add_to_hash( $rel, $phash );
 739                         } elsif( $rdg ) {
 740                                 # First check for a transposed relationship
 741                                 if( $rdg->rank != $pobj->rank ) {
 742                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 743                                                 next unless $ti->text eq $rdg->text;
 744                                                 $rel = $c->get_relationship( $ti, $pobj );
 745                                                 if( $rel ) {
 746                                                         _add_to_hash( $rel, $phash, 1 );
 747                                                         last;
 748                                                 }
 749                                         }
 750                                         unless( $rel ) {
 751                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 752                                                         next unless $ti->text eq $pobj->text;
 753                                                         $rel = $c->get_relationship( $ti, $rdg );
 754                                                         if( $rel ) {
 755                                                                 _add_to_hash( $rel, $phash, 1 );
 756                                                                 last;
 757                                                         }
 758                                                 }
 759                                         }
 760                                 }
 761                                 unless( $rel ) {
 762                                         # and then check for sheer word similarity.
 763                                         my $rtext = $rdg->text;
 764                                         my $ptext = $pobj->text;
 765                                         if( similar( $rtext, $ptext ) ) {
 766                                                 # say STDERR "Words $rtext and $ptext judged similar";
 767                                                 $phash->{relation} = { type => 'wordsimilar' };
 768                                         }
 769                                 }
 770                         } else {
 771                                 $phash->{relation} = { type => 'deletion' };
 772                         }
 773                         # Get the attributes of the parent object while we are here
 774                         $phash->{'text'} = $pobj->text if $pobj;
 775                         $phash->{'is_nonsense'} = $pobj->is_nonsense;
 776                         $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 777                 } elsif( $p eq '(omitted)' ) {
 778                         $phash->{relation} = { type => 'addition' };
 779                 }
 780                 # Save it
 781                 $rdgparents->{$p} = $phash;
 782         }
 783 }
 784
 785 sub _add_to_hash {
 786         my( $rel, $phash, $is_transposed ) = @_;
 787         $phash->{relation} = { type => $rel->type };
 788         $phash->{relation}->{transposed} = 1 if $is_transposed;
 789         $phash->{relation}->{annotation} = $rel->annotation
 790                 if $rel->has_annotation;
 791 }
 792
 793 =head2 similar( $word1, $word2 )
 794
 795 Use Algorithm::Diff to get a sense of how close the words are to each other.
 796 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 797
 798 =cut
 799
 800 #!/usr/bin/env perl
 801
 802 sub similar {
 803         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 804         my @let1 = split( '', lc( $word1 ) );
 805         my @let2 = split( '', lc( $word2 ) );
 806         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 807         my $mag = 0;
 808         while( $diff->Next ) {
 809                 if( $diff->Same ) {
 810                         # Take off points for longer strings
 811                         my $cs = $diff->Range(1) - 2;
 812                         $cs = 0 if $cs < 0;
 813                         $mag -= $cs;
 814                 } elsif( !$diff->Items(1) ) {
 815                         $mag += $diff->Range(2);
 816                 } elsif( !$diff->Items(2) ) {
 817                         $mag += $diff->Range(1);
 818                 } else {
 819                         # Split the difference for substitutions
 820                         my $c1 = $diff->Range(1) || 1;
 821                         my $c2 = $diff->Range(2) || 1;
 822                         my $cd = ( $c1 + $c2 ) / 2;
 823                         $mag += $cd;
 824                 }
 825         }
 826         return ( $mag <= length( $word1 ) / 2 );
 827 }
 828
 829 sub _prune_group {
 830         my( $group, $graph ) = @_;
 831         my $relevant = {};
 832         # Record the existence of the vertices in the group
 833         map { $relevant->{$_} = 1 } @$group;
 834         # Make our subgraph
 835         my $subgraph = $graph->deep_copy;
 836         map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
 837                 $subgraph->vertices;
 838         # Now prune and return the remaining vertices.
 839         _prune_subtree( $subgraph );
 840         # Return the list of vertices and the list of roots.
 841         my $pruned_group = [ sort $subgraph->vertices ];
 842         my $pruned_roots = [ $subgraph->predecessorless_vertices ];
 843         return( $pruned_group, $pruned_roots );
 844 }
 845
 846 sub _prune_subtree {
 847         my( $tree ) = @_;
 848
 849         # Delete lacunose witnesses that have no successors
 850         my @orphan_hypotheticals;
 851         my $ctr = 0;
 852         do {
 853                 die "Infinite loop on leaves" if $ctr > 100;
 854                 @orphan_hypotheticals =
 855                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
 856                                 $tree->successorless_vertices;
 857                 $tree->delete_vertices( @orphan_hypotheticals );
 858                 $ctr++;
 859         } while( @orphan_hypotheticals );
 860
 861         # Delete lacunose roots that have a single successor
 862         my @redundant_root;
 863         $ctr = 0;
 864         do {
 865                 die "Infinite loop on roots" if $ctr > 100;
 866                 @redundant_root =
 867                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
 868                                    && $tree->successors( $_ ) == 1 }
 869                                 $tree->predecessorless_vertices;
 870                 $tree->delete_vertices( @redundant_root );
 871                 $ctr++;
 872         } while( @redundant_root );
 873 }
 874
 875 sub _useful_variant {
 876         my( $rankgroup, $rankgraph, $acstr ) = @_;
 877
 878         # Sort by group size and return
 879         my $is_useful = 0;
 880         foreach my $rdg ( keys %$rankgroup ) {
 881                 my @wits = $rankgroup->{$rdg}->members;
 882                 if( @wits > 1 ) {
 883                         $is_useful++;
 884                 } else {
 885                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 886                                 || $wits[0] =~ /\Q$acstr\E$/ );
 887                 }
 888         }
 889         return $is_useful > 1;
 890 }
 891
 892 =head2 wit_stringify( $groups )
 893
 894 Takes an array of witness groupings and produces a string like
 895 ['A','B'] / ['C','D','E'] / ['F']
 896
 897 =cut
 898
 899 sub wit_stringify {
 900     my $groups = shift;
 901     my @gst;
 902     # If we were passed an array of witnesses instead of an array of
 903     # groupings, then "group" the witnesses first.
 904     unless( ref( $groups->[0] ) ) {
 905         my $mkgrp = [ $groups ];
 906         $groups = $mkgrp;
 907     }
 908     foreach my $g ( @$groups ) {
 909         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 910     }
 911     return join( ' / ', @gst );
 912 }
 913
 914 1;
 915
 916 =head1 LICENSE
 917
 918 This package is free software and is provided "as is" without express
 919 or implied warranty.  You can redistribute it and/or modify it under
 920 the same terms as Perl itself.
 921
 922 =head1 AUTHOR
 923
 924 Tara L Andrews E<lt>aurum@cpan.orgE<gt>