analysis/lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Encode qw/ decode_utf8 encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ to_json decode_json /;
  10 use LWP::UserAgent;
  11 use Set::Scalar;
  12 use Text::Tradition::Analysis::Result;
  13 use Text::Tradition::Stemma;
  14 use TryCatch;
  15
  16 use vars qw/ @EXPORT_OK $VERSION /;
  17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  18 $VERSION = "1.3";
  19
  20
  21 my $DEFAULT_SOLVER_URL = 'http://perf.byzantini.st/cgi-bin/graphcalc.cgi';
  22 my $unsolved_problems = {};
  23
  24 =head1 NAME
  25
  26 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  27
  28 =head1 DESCRIPTION
  29
  30 Text::Tradition is a library for representation and analysis of collated
  31 texts, particularly medieval ones.  Where the Collation is the central
  32 feature of a Tradition, it may also have one or more stemmata associated
  33 with it, and these stemmata may be analyzed. This package provides the
  34 following modules:
  35
  36 =over 4
  37
  38 =item * L<Text::Tradition::HasStemma> - a role that will be composed into
  39 Text::Tradition objects, providing the ability for Text::Tradition::Stemma
  40 objects to be associated with them.
  41
  42 =item * L<Text::Tradition::Stemma> - an object class that represents stemma
  43 hypotheses, both rooted (with a single archetype) and unrooted (e.g.
  44 phylogenetic trees).
  45
  46 =item * Text::Tradition::Analysis (this package). Provides functions for
  47 the analysis of a given stemma against the collation within a given
  48 Tradition.
  49
  50 =back
  51
  52 =head1 SYNOPSIS
  53
  54   use Text::Tradition;
  55   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  56   my $t = Text::Tradition->new(
  57     'name' => 'this is a text',
  58     'input' => 'TEI',
  59     'file' => '/path/to/tei_parallel_seg_file.xml' );
  60   $t->add_stemma( 'dotfile' => $stemmafile );
  61
  62   my $variant_data = run_analysis( $tradition );
  63
  64 =head1 SUBROUTINES
  65
  66 =head2 run_analysis( $tradition, %opts )
  67
  68 Runs the analysis described in analyze_variant_location on every location in the
  69 collation of the given tradition, with the given options. These include:
  70
  71 =over 4
  72
  73 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  74 is 0 (i.e. the first).
  75
  76 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  77
  78 =item * merge_types - Specify a list of relationship types, where related readings
  79 should be treated as identical for the purposes of analysis.
  80
  81 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  82
  83 =back
  84
  85 =begin testing
  86
  87 use Text::Tradition;
  88 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  89
  90 my $datafile = 't/data/florilegium_tei_ps.xml';
  91 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  92                                       'name' => 'test0',
  93                                       'file' => $datafile );
  94 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  95 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  96
  97 my %expected_genealogical = (
  98         1 => 0,
  99         2 => 1,
 100         3 =>  0,
 101         5 =>  0,
 102         7 =>  0,
 103         8 =>  0,
 104         10 => 0,
 105         13 => 1,
 106         33 => 0,
 107         34 => 0,
 108         37 => 0,
 109         60 => 0,
 110         81 => 1,
 111         84 => 0,
 112         87 => 0,
 113         101 => 0,
 114         102 => 0,
 115         122 => 1,
 116         157 => 0,
 117         166 => 1,
 118         169 => 1,
 119         200 => 0,
 120         216 => 1,
 121         217 => 1,
 122         219 => 1,
 123         241 => 1,
 124         242 => 1,
 125         243 => 1,
 126 );
 127
 128 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 129 my $c = $tradition->collation;
 130 foreach my $row ( @{$data->{'variants'}} ) {
 131         # Account for rows that used to be "not useful"
 132         unless( exists $expected_genealogical{$row->{'id'}} ) {
 133                 $expected_genealogical{$row->{'id'}} = 1;
 134         }
 135         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 136         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 137                 "Got correct genealogical flag for row " . $row->{'id'} );
 138         # Check that we have the right row with the right groups
 139         my $rank = $row->{'id'};
 140         foreach my $rdghash ( @{$row->{'readings'}} ) {
 141                 # Skip 'readings' that aren't really
 142                 next unless $c->reading( $rdghash->{'readingid'} );
 143                 # Check the rank
 144                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 145                         "Got correct reading rank" );
 146                 # Check the witnesses
 147                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 148                 my @sgrp = sort @{$rdghash->{'group'}};
 149                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 150         }
 151 }
 152 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 153 # TODO Make something meaningful of conflict count, maybe test other bits
 154
 155 =end testing
 156
 157 =cut
 158
 159 sub run_analysis {
 160         my( $tradition, %opts ) = @_;
 161         my $c = $tradition->collation;
 162         my $aclabel = $c->ac_label;
 163
 164         my $stemma_id = $opts{'stemma_id'} || 0;
 165         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 166         my $collapse = Set::Scalar->new();
 167         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 168                 $collapse->insert( @{$opts{'merge_types'}} );
 169         } elsif( $opts{'merge_types'} ) {
 170                 $collapse->insert( $opts{'merge_types'} );
 171         }
 172
 173         # If we have specified a local lookup DB for graph calculation results,
 174         # make sure it exists and connect to it.
 175         my $dir;
 176         if ( exists $opts{'calcdsn'} ) {
 177                 eval { require Text::Tradition::Directory };
 178                 if( $@ ) {
 179                         throw( "Could not instantiate a directory for " . $opts{'calcdsn'}
 180                                 . ": $@" );
 181                 }
 182                 $opts{'dir'} = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 183         } elsif( !exists $opts{'solver_url'} ) {
 184                 $opts{'solver_url'} = $DEFAULT_SOLVER_URL;
 185         }
 186
 187         # Get the stemma
 188         my $stemma = $tradition->stemma( $stemma_id );
 189
 190         # Figure out which witnesses we are working with - that is, the ones that
 191         # appear both in the stemma and in the tradition. All others are 'lacunose'
 192         # for our purposes.
 193         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 194         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 195         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 196         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 197
 198         # Find and mark 'common' ranks for exclusion, unless they were
 199         # explicitly specified.
 200         unless( @ranks ) {
 201                 my %common_rank;
 202                 foreach my $rdg ( $c->common_readings ) {
 203                         $common_rank{$rdg->rank} = 1;
 204                 }
 205                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 206         }
 207
 208         # Group the variants to send to the solver
 209         my @groups;
 210         my @use_ranks;
 211         my %lacunae;
 212         my $moved = {};
 213         foreach my $rank ( @ranks ) {
 214                 my $missing = $lacunose->clone();
 215                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 216                 # Filter out any empty rankgroups
 217                 # (e.g. from the later rank for a transposition)
 218                 next unless keys %$rankgroup;
 219                 # Get the graph for this rankgroup
 220                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 221                 if( $opts{'exclude_type1'} ) {
 222                         # Check to see whether this is a "useful" group.
 223                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 224                 }
 225                 push( @use_ranks, $rank );
 226                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 227                 $lacunae{$rank} = $missing;
 228         }
 229         # Run the solver
 230         my $answer;
 231         try {
 232                 $answer = solve_variants( \%opts, @groups );
 233         } catch ( Text::Tradition::Error $e ) {
 234                 if( $e->message =~ /IDP/ ) {
 235                         # Something is wrong with the solver; make the variants table anyway
 236                         $answer->{'variants'} = [];
 237                         map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
 238                                 @groups;
 239                 } else {
 240                         # Something else is wrong; error out.
 241                         $e->throw;
 242                 }
 243         }
 244
 245         # Do further analysis on the answer
 246         my $conflict_count = 0;
 247         my $reversion_count = 0;
 248         foreach my $idx ( 0 .. $#use_ranks ) {
 249                 my $location = $answer->{'variants'}->[$idx];
 250                 # Add the rank back in
 251                 my $rank = $use_ranks[$idx];
 252                 $location->{'id'} = $rank;
 253                 # Note what our lacunae are
 254                 my %lmiss;
 255                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 256                 $location->{'missing'} = [ keys %lmiss ];
 257
 258                 # Run the extra analysis we need.
 259                 ## TODO We run through all the variants in this call, so
 260                 ## why not add the reading data there instead of here below?
 261                 my $graph = $groups[$idx]->{graph};
 262                 analyze_location( $tradition, $graph, $location, \%lmiss );
 263
 264                 my @layerwits;
 265                 # Do the final post-analysis tidying up of the data.
 266                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 267                         $conflict_count++ if $rdghash->{'is_conflict'};
 268                         $reversion_count++ if $rdghash->{'is_reverted'};
 269                         # Add the reading text back in, setting display value as needed
 270                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 271                         if( $rdg ) {
 272                                 $rdghash->{'text'} = $rdg->text .
 273                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 274                                 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
 275                                         $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 276                                         $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 277                                 }
 278                         }
 279                         # Remove lacunose witnesses from this reading's list now that the
 280                         # analysis is done
 281                         my @realgroup;
 282                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 283                         $rdghash->{'group'} = \@realgroup;
 284                         # Note any layered witnesses that appear in this group
 285                         foreach( @realgroup ) {
 286                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 287                                         push( @layerwits, $1 );
 288                                 }
 289                         }
 290                 }
 291                 $location->{'layerwits'} = \@layerwits if @layerwits;
 292         }
 293         $answer->{'conflict_count'} = $conflict_count;
 294         $answer->{'reversion_count'} = $reversion_count;
 295
 296         return $answer;
 297 }
 298
 299 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 300
 301 Groups the variants at the given $rank of the collation, treating any
 302 relationships in the set $merge_relationship_types as equivalent.
 303 $lacunose should be a reference to an array, to which the sigla of lacunose
 304 witnesses at this rank will be appended; $transposed should be a reference
 305 to a hash, wherein the identities of transposed readings and their
 306 relatives will be stored.
 307
 308 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 309 in $group_readings->{$rdg}.
 310
 311 =cut
 312
 313 # Return group_readings, groups, lacunose
 314 sub group_variants {
 315         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 316         my $c = $tradition->collation;
 317         my $aclabel = $c->ac_label;
 318         my $table = $c->alignment_table;
 319         # Get the alignment table readings
 320         my %readings_at_rank;
 321         my $check_for_gaps = Set::Scalar->new();
 322         my %moved_wits;
 323         my $has_transposition;
 324         my @transp_acgap;
 325         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 326                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 327                 my $wit = $tablewit->{'witness'};
 328                 # Exclude the witness if it is "lacunose" which if we got here
 329                 # means "not in the stemma".
 330                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 331                 # Note if the witness is actually in a lacuna
 332                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 333                         _add_to_witlist( $wit, $lacunose, $aclabel );
 334                 # Otherwise the witness either has a positive reading...
 335                 } elsif( $rdg ) {
 336                         # If the reading has been counted elsewhere as a transposition, ignore it.
 337                         if( $transposed->{$rdg->{'t'}->id} ) {
 338                                 # TODO Does this cope with three-way transpositions?
 339                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 340                                 next;
 341                         }
 342                         # Otherwise, record it...
 343                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 344                         # ...and grab any transpositions, and their relations.
 345                         my @transp = grep { $_->rank != $rank } _all_related( $rdg->{'t'} );
 346                         foreach my $trdg ( @transp ) {
 347                                 next if exists $readings_at_rank{$trdg->id};
 348                                 $has_transposition = 1;
 349                                 my @affected_wits = _table_witnesses(
 350                                         $table, $trdg->rank, $trdg, $lacunose, $aclabel );
 351                                 next unless @affected_wits;
 352                                 map { $moved_wits{$_} = 1 } @affected_wits;
 353                                 my @thisloc_wits = _table_witnesses( $table, $rank, $rdg->{'t'},
 354                                         $lacunose, $aclabel );
 355                                 # Check to see if our affected wits have layers that do something
 356                                 # wacky.
 357                                 my %transploc_gaps;
 358                                 map { $transploc_gaps{$_} = 1 }
 359                                         _table_witnesses( $table, $trdg->rank, undef, $lacunose, $aclabel );
 360                                 foreach my $aw ( @affected_wits ) {
 361                                         if( $transploc_gaps{$aw.$aclabel} ) {
 362                                                 push( @thisloc_wits, $aw.$aclabel );
 363                                                 push( @transp_acgap, $aw.$aclabel );
 364                                         }
 365                                 }
 366                                 # Record which witnesses we should count as already analyzed when we
 367                                 # get to the transposed reading's own rank.
 368                                 $transposed->{$trdg->id} = \@thisloc_wits;
 369                                 $readings_at_rank{$trdg->id} = $trdg;
 370                         }
 371                 # ...or it is empty, ergo a gap.
 372                 } else {
 373                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 374                 }
 375         }
 376         # Push all the transposition layer gaps onto our list
 377         $check_for_gaps->insert( @transp_acgap );
 378         # Now remove from our 'gaps' any witnesses known to have been dealt with elsewhere.
 379         my $gap_wits = Set::Scalar->new();
 380         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 381                 unless $moved_wits{$_} } $check_for_gaps->members;
 382
 383         # Group the readings, collapsing groups by relationship if needed.
 384         my $grouped_readings = {};
 385         foreach my $rdg ( values %readings_at_rank ) {
 386                 # Skip readings that have been collapsed into others.
 387                 next if exists $grouped_readings->{$rdg->id}
 388                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 389                 # Get the witness list, including from readings collapsed into this one.
 390                 my @wits = _table_witnesses( $table, $rdg->rank, $rdg, $lacunose, $aclabel );
 391                 if( $collapse && $collapse->size ) {
 392                         my $filter = sub { $collapse->has( $_[0]->type ) };
 393                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 394                                 my @otherwits = _table_witnesses( $table, $other->rank, $other, $lacunose, $aclabel );
 395                                 push( @wits, @otherwits );
 396                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 397                         }
 398                 }
 399                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 400         }
 401         if( $gap_wits->members ) {
 402                 $grouped_readings->{'(omitted)'} = $gap_wits;
 403         }
 404
 405         # Get rid of our collapsed readings
 406         map { delete $grouped_readings->{$_} if(
 407                          $grouped_readings->{$_} eq 'COLLAPSE'
 408                          || $grouped_readings->{$_}->is_empty ) }
 409                 keys %$grouped_readings;
 410
 411         # If something was transposed, check the groups for doubled-up readings
 412         if( $has_transposition ) {
 413                 # print STDERR "Group for rank $rank:\n";
 414                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 415                 #       keys %$grouped_readings;
 416                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 417         }
 418
 419         # Return the result
 420         return $grouped_readings;
 421 }
 422
 423 sub _all_related {
 424         # Except by repetition
 425         my $rdg = shift;
 426         my $c = $rdg->collation;
 427         my @check = ( $rdg );
 428         my %seen;
 429         while( @check ) {
 430                 my @next;
 431                 foreach my $ck ( @check ) {
 432                         $seen{"$ck"} = 1;
 433                         push( @next, grep { !$seen{"$_"} }
 434                                 $ck->related_readings( sub { $_[0]->type ne 'repetition' } ) );
 435                 }
 436                 @check = @next;
 437         }
 438
 439
 440         my @all = map { $c->reading( $_ ) } keys %seen;
 441         return @all;
 442 }
 443
 444
 445 # Helper function to query the alignment table for all witnesses (a.c. included)
 446 # that have a given reading at its rank.
 447 sub _table_witnesses {
 448         my( $table, $rank, $trdg, $lacunose, $aclabel ) = @_;
 449         my $tableidx = $rank - 1;
 450         my $has_reading = Set::Scalar->new();
 451         foreach my $row ( @{$table->{'alignment'}} ) {
 452                 my $wit = $row->{'witness'};
 453                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 454                 my $rdg = $row->{'tokens'}->[$tableidx];
 455                 if( $trdg ) {
 456                         # We have some positive reading we want.
 457                         next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 458                         if( $trdg->is_lacuna ) {
 459                                 _add_to_witlist( $wit, $has_reading, $aclabel )
 460                                 if $rdg->{'t'}->is_lacuna;
 461                         } else {
 462                                 _add_to_witlist( $wit, $has_reading, $aclabel )
 463                                         if $rdg->{'t'}->id eq $trdg->id;
 464                         }
 465                 } else {
 466                         # We want the omissions.
 467                         next if exists $rdg->{'t'} && defined $rdg->{'t'};
 468                         _add_to_witlist( $wit, $has_reading, $aclabel )
 469                 }
 470         }
 471         return $has_reading->members;
 472 }
 473
 474 # Helper function to see if a witness is lacunose even if we are asking about
 475 # the a.c. version
 476 sub _is_lacunose {
 477         my ( $wit, $lac, $acstr ) = @_;
 478         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 479                 $wit = $1;
 480         }
 481         return $lac->has( $wit );
 482 }
 483
 484 # Helper function to ensure that X and X a.c. never appear in the same list.
 485 sub _add_to_witlist {
 486         my( $wit, $list, $acstr ) = @_;
 487         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 488                 # Don't add X a.c. if we already have X
 489                 return if $list->has( $1 );
 490         } else {
 491                 # Delete X a.c. if we are about to add X
 492                 $list->delete( $wit.$acstr );
 493         }
 494         $list->insert( $wit );
 495 }
 496
 497 sub _check_transposed_consistency {
 498         my( $c, $rank, $transposed, $groupings ) = @_;
 499         my %seen_wits;
 500         my %thisrank;
 501         # Note which readings are actually at this rank, and which witnesses
 502         # belong to which reading.
 503         foreach my $rdg ( keys %$groupings ) {
 504                 my $rdgobj = $c->reading( $rdg );
 505                 # Count '(omitted)' as a reading at this rank
 506                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 507                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 508         }
 509         # Our work is done if we have no witness belonging to more than one
 510         # reading.
 511         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 512         return unless @doubled;
 513         # If we have a symmetric related transposition, drop the non-rank readings.
 514         if( @doubled == scalar keys %seen_wits ) {
 515                 foreach my $rdg ( keys %$groupings ) {
 516                         if( !$thisrank{$rdg} ) {
 517                                 # Groupings are Set::Scalar objects so we can compare them outright.
 518                                 my ( $matched ) = grep { $groupings->{$rdg} == $groupings->{$_} }
 519                                         keys %thisrank;
 520                                 delete $groupings->{$rdg};
 521                                 # If we found a group match, assume there is a symmetry happening.
 522                                 # TODO think more about this
 523                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 524                                 unless( $matched ) {
 525                                         delete $transposed->{$rdg};
 526                                         warn "Found problem in evident symmetry with reading $rdg";
 527                                 }
 528                         }
 529                 }
 530         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 531         } else {
 532                 foreach my $dup ( @doubled ) {
 533                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 534                                 next if $thisrank{$rdg};
 535                                 next unless exists $groupings->{$rdg};
 536                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 537                                 delete $groupings->{$rdg};
 538                                 delete $transposed->{$rdg};
 539                         }
 540                 }
 541                 # and put any now-orphaned readings into an 'omitted' reading.
 542                 foreach my $wit ( keys %seen_wits ) {
 543                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 544                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 545                                          unless exists $groupings->{'(omitted)'};
 546                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 547                         }
 548                 }
 549         }
 550 }
 551
 552 # For the given grouping, return its situation graph based on the stemma.
 553 sub _graph_for_grouping {
 554         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 555         my $acwits = [];
 556         my $extant = {};
 557         foreach my $gs ( values %$grouping ) {
 558                 map {
 559                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 560                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 561                         } else {
 562                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 563                         }
 564                 } $gs->members;
 565         }
 566         my $graph;
 567         try {
 568                 # contig contains all extant wits and all hypothetical wits
 569                 # needed to make up the groups.
 570                 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
 571         } catch ( Text::Tradition::Error $e ) {
 572                 throw( "Could not extend graph with given extant and a.c. witnesses: "
 573                         . $e->message );
 574         } catch {
 575                 throw( "Could not extend graph with a.c. witnesses @$acwits" );
 576         }
 577         return $graph;
 578 }
 579
 580 =head2 solve_variants( $calcdir, @groups )
 581
 582 Looks up the set of groups in the answers provided by the external graph solver
 583 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 584
 585 The answer has the form
 586   { "variants" => [ array of variant location structures ],
 587     "variant_count" => total,
 588     "conflict_count" => number of conflicts detected,
 589     "genealogical_count" => number of solutions found }
 590
 591 =cut
 592
 593 sub solve_variants {
 594         my( $opts, @groups ) = @_;
 595
 596         # Are we using a local result directory?
 597         my $dir = $opts->{dir};
 598
 599         ## For each graph/group combo, make a Text::Tradition::Analysis::Result
 600         ## object so that we can send it off for IDP lookup.
 601         my $variants = [];
 602         my $genealogical = 0; # counter
 603         # TODO Optimize for unique graph problems
 604         my %problems;
 605         foreach my $graphproblem ( @groups ) {
 606                 # Construct the calc result key and look up its answer
 607                 my $problem = Text::Tradition::Analysis::Result->new(
 608                         graph => $graphproblem->{'graph'},
 609                         setlist => [ values %{$graphproblem->{'grouping'}} ] );
 610                 if( exists $problems{$problem->object_key} ) {
 611                         $problem = $problems{$problem->object_key};
 612                 } else {
 613                         $problems{$problem->object_key} = $problem;
 614                 }
 615                 $graphproblem->{'object'} = $problem;
 616         }
 617
 618         my %results;
 619         if( $dir ) {
 620                 my $scope = $dir->new_scope;
 621                 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
 622         } else {
 623                 # print STDERR "Using solver at " . $opts->{solver_url} . "\n";
 624                 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
 625                         [ values %problems ] );
 626                 # Send it off and get the result
 627                 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
 628                 my $ua = LWP::UserAgent->new();
 629                 my $resp = $ua->post( $opts->{solver_url}, 'Content-Type' => 'application/json',
 630                                                           'Content' => $json );
 631                 my $answer;
 632                 if( $resp->is_success ) {
 633                         $answer = decode_json( $resp->content );
 634                         throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
 635                 } else {
 636                         throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
 637                                 . "; cannot run graph analysis" );
 638                 }
 639                 # One more sanity check
 640                 throw( "Something went wrong with answer symmetricity" )
 641                         unless keys( %problems ) == @$answer;
 642                 # Convert the results
 643                 foreach my $a ( @$answer ) {
 644                         my $r = Text::Tradition::Analysis::Result->new( $a );
 645                         $results{$r->object_key} = $r;
 646                 }
 647         }
 648
 649         # We now have a single JSON-encoded Result object per problem sent. Fold its
 650         # answers into our variant info structure.
 651         foreach my $graphproblem ( @groups ) {
 652                 my $result = $results{$graphproblem->{'object'}->object_key}
 653                         || $graphproblem->{'object'};
 654
 655                 # Initialize the result structure for this graph problem
 656                 my $vstruct;
 657                 if( $result->status eq 'OK' ) {
 658                         $vstruct = { readings => [] };
 659                         push( @$variants, $vstruct );
 660                 } else {
 661                         push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
 662                         next;
 663                 }
 664
 665                 # 1. Did the group evaluate as genealogical?
 666                 $vstruct->{genealogical} = $result->is_genealogical;
 667                 $genealogical++ if $result->is_genealogical;
 668
 669                 # 2. What are the calculated minimum groupings for each variant loc?
 670                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 671                         my $inputset = $graphproblem->{grouping}->{$rid};
 672                         my $minset = $result->minimum_grouping_for( $inputset );
 673                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 674                 }
 675
 676                 # 3. What are the sources and classes calculated for each witness?
 677                 $vstruct->{witcopy_types} = { $result->classes };
 678                 $vstruct->{reading_roots} = {};
 679                 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
 680
 681         }
 682
 683         return { 'variants' => $variants,
 684                          'variant_count' => scalar @$variants,
 685                          'genealogical_count' => $genealogical };
 686 }
 687
 688 sub _init_unsolved {
 689         my( $graphproblem, $status ) = @_;
 690         my $vstruct = { 'readings' => [] };
 691         $vstruct->{'unsolved'} = $status;
 692         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 693                 push( @{$vstruct->{readings}}, { readingid => $rid,
 694                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 695         }
 696         return $vstruct;
 697 }
 698
 699 =head2 analyze_location ( $tradition, $graph, $location_hash )
 700
 701 Given the tradition, its stemma graph, and the solution from the graph solver,
 702 work out the rest of the information we want.  For each reading we need missing,
 703 conflict, reading_parents, independent_occurrence, followed, not_followed,
 704 and follow_unknown.  Alters the location_hash in place.
 705
 706 =cut
 707
 708 sub analyze_location {
 709         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 710         my $c = $tradition->collation;
 711
 712         if( exists $variant_row->{'unsolved'} ) {
 713                 return;
 714         }
 715         my $reading_roots = delete $variant_row->{'reading_roots'};
 716         my $classinfo = delete $variant_row->{'witcopy_types'};
 717
 718         # Make a hash of all known node memberships, and make the subgraphs.
 719         my $contig = {};
 720         my $subgraph = {};
 721         my $acstr = $c->ac_label;
 722         my @acwits;
 723
 724         # Note which witnesses positively belong to which group. This information
 725         # comes ultimately from the IDP solver.
 726         # Also make a note of the reading's roots.
 727     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 728         my $rid = $rdghash->{'readingid'};
 729         my @roots;
 730         foreach my $wit ( @{$rdghash->{'group'}} ) {
 731                 $contig->{$wit} = $rid;
 732             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 733                 push( @acwits, $1 );
 734             }
 735             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 736                 push( @roots, $wit );
 737             }
 738         }
 739                 $rdghash->{'independent_occurrence'} = \@roots;
 740         }
 741
 742         # Now that we have all the node group memberships, calculate followed/
 743     # non-followed/unknown values for each reading.  Also figure out the
 744     # reading's evident parent(s).
 745     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 746         my $rid = $rdghash->{'readingid'};
 747         my $rdg = $c->reading( $rid );
 748         my @roots = @{$rdghash->{'independent_occurrence'}};
 749         my @reversions;
 750         if( $classinfo ) {
 751                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 752                         $rdghash->{'group'}->members;
 753                 $rdghash->{'reversions'} = \@reversions;
 754         }
 755         my @group = @{$rdghash->{'group'}};
 756
 757         # Start figuring things out.
 758         $rdghash->{'followed'} = scalar( @group )
 759                 - ( scalar( @roots ) + scalar( @reversions ) );
 760         # Find the parent readings, if any, of this reading.
 761         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 762                 # Work out relationships between readings and their non-followed parent.
 763                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 764                 $rdghash->{'source_parents'} = $sourceparents;
 765
 766                 if( @reversions ) {
 767                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 768                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 769                         $rdghash->{'reversion_parents'} = $revparents;
 770                 }
 771
 772                 # Find the number of times this reading was altered, and the number of
 773                 # times we're not sure.
 774                 my( %nofollow, %unknownfollow );
 775                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 776                         foreach my $wchild ( $graph->successors( $wit ) ) {
 777                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 778                                         && $contig->{$wchild} ne $rid ) {
 779                                         # It definitely changed here.
 780                                         $nofollow{$wchild} = 1;
 781                                 } elsif( !($contig->{$wchild}) ) {
 782                                         # The child is a hypothetical node not definitely in
 783                                         # any group. Answer is unknown.
 784                                         $unknownfollow{$wchild} = 1;
 785                                 } # else it is either in our group, or it is a non-root node in a
 786                                   # known group and therefore is presumed to have its reading from
 787                                   # its group, not this link.
 788                         }
 789                 }
 790                 $rdghash->{'not_followed'} = keys %nofollow;
 791                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 792
 793                 # Now say whether this reading represents a conflict.
 794                 unless( $variant_row->{'genealogical'} ) {
 795                         $rdghash->{'is_conflict'} = @roots != 1;
 796                         $rdghash->{'is_reverted'} = scalar @reversions;
 797                 }
 798     }
 799 }
 800
 801 sub _find_reading_parents {
 802         my( $rid, $graph, $contig, @list ) = @_;
 803         my $parenthash = {};
 804         foreach my $wit ( @list ) {
 805                 # Look in the stemma graph to find this witness's extant or known-reading
 806                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 807                 my @check = $graph->predecessors( $wit );
 808                 while( @check ) {
 809                         my @next;
 810                         foreach my $wparent( @check ) {
 811                                 my $preading = $contig->{$wparent};
 812                                 if( $preading && $preading ne $rid ) {
 813                                         $parenthash->{$preading} = 1;
 814                                 } else {
 815                                         push( @next, $graph->predecessors( $wparent ) );
 816                                 }
 817                         }
 818                         @check = @next;
 819                 }
 820         }
 821         return $parenthash;
 822 }
 823
 824 sub _resolve_parent_relationships {
 825         my( $c, $rid, $rdg, $rdgparents ) = @_;
 826         foreach my $p ( keys %$rdgparents ) {
 827                 # Resolve the relationship of the parent to the reading, and
 828                 # save it in our hash.
 829                 my $pobj = $c->reading( $p );
 830                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 831                 my $phash = { 'label' => $prep };
 832                 if( $pobj ) {
 833                         # Get the attributes of the parent object while we are here
 834                         $phash->{'text'} = $pobj->text if $pobj;
 835                         if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
 836                                 $phash->{'is_nonsense'} = $pobj->is_nonsense;
 837                                 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 838                         }
 839                         # Now look at the relationship
 840                         my $rel = $c->get_relationship( $p, $rid );
 841                         if( $rel && $rel->type eq 'collated' ) {
 842                                 $rel = undef;
 843                         }
 844                         if( $rel ) {
 845                                 _add_to_hash( $rel, $phash );
 846                         } elsif( $rdg ) {
 847                                 # First check for a transposed relationship
 848                                 if( $rdg->rank != $pobj->rank ) {
 849                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 850                                                 next unless $ti->text eq $rdg->text;
 851                                                 $rel = $c->get_relationship( $ti, $pobj );
 852                                                 if( $rel ) {
 853                                                         _add_to_hash( $rel, $phash, 1 );
 854                                                         last;
 855                                                 }
 856                                         }
 857                                         unless( $rel ) {
 858                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 859                                                         next unless $ti->text eq $pobj->text;
 860                                                         $rel = $c->get_relationship( $ti, $rdg );
 861                                                         if( $rel ) {
 862                                                                 _add_to_hash( $rel, $phash, 1 );
 863                                                                 last;
 864                                                         }
 865                                                 }
 866                                         }
 867                                 }
 868                                 unless( $rel ) {
 869                                         # and then check for sheer word similarity.
 870                                         my $rtext = $rdg->text;
 871                                         my $ptext = $pobj->text;
 872                                         if( similar( $rtext, $ptext ) ) {
 873                                                 # say STDERR "Words $rtext and $ptext judged similar";
 874                                                 $phash->{relation} = { type => 'wordsimilar' };
 875                                         }
 876                                 }
 877                         } else {
 878                                 $phash->{relation} = { type => 'deletion' };
 879                         }
 880                 } elsif( $p eq '(omitted)' ) {
 881                         # Check to see if the reading in question is a repetition.
 882                         my @reps = $rdg->related_readings( 'repetition' );
 883                         if( @reps ) {
 884                                 $phash->{relation} = { type => 'repetition',
 885                                         annotation => "of reading @reps" };
 886                         } else {
 887                                 $phash->{relation} = { type => 'addition' };
 888                         }
 889                 }
 890                 # Save it
 891                 $rdgparents->{$p} = $phash;
 892         }
 893 }
 894
 895 sub _add_to_hash {
 896         my( $rel, $phash, $is_transposed ) = @_;
 897         $phash->{relation} = { type => $rel->type };
 898         $phash->{relation}->{transposed} = 1 if $is_transposed;
 899         $phash->{relation}->{annotation} = $rel->annotation
 900                 if $rel->has_annotation;
 901         # Get all the relevant relationship info.
 902         foreach my $prop ( qw/ non_independent is_significant / ) {
 903                 $phash->{relation}->{$prop} = $rel->$prop;
 904         }
 905         # Figure out if the variant was judged revertible.
 906         my $is_a = $rel->reading_a eq $phash->{text};
 907         $phash->{revertible} = $is_a
 908                 ? $rel->a_derivable_from_b : $rel->b_derivable_from_a;
 909 }
 910
 911 =head2 similar( $word1, $word2 )
 912
 913 Use Algorithm::Diff to get a sense of how close the words are to each other.
 914 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 915
 916 =cut
 917
 918 #!/usr/bin/env perl
 919
 920 sub similar {
 921         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 922         my @let1 = split( '', lc( $word1 ) );
 923         my @let2 = split( '', lc( $word2 ) );
 924         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 925         my $mag = 0;
 926         while( $diff->Next ) {
 927                 if( $diff->Same ) {
 928                         # Take off points for longer strings
 929                         my $cs = $diff->Range(1) - 2;
 930                         $cs = 0 if $cs < 0;
 931                         $mag -= $cs;
 932                 } elsif( !$diff->Items(1) ) {
 933                         $mag += $diff->Range(2);
 934                 } elsif( !$diff->Items(2) ) {
 935                         $mag += $diff->Range(1);
 936                 } else {
 937                         # Split the difference for substitutions
 938                         my $c1 = $diff->Range(1) || 1;
 939                         my $c2 = $diff->Range(2) || 1;
 940                         my $cd = ( $c1 + $c2 ) / 2;
 941                         $mag += $cd;
 942                 }
 943         }
 944         return ( $mag <= length( $word1 ) / 2 );
 945 }
 946
 947 sub _useful_variant {
 948         my( $rankgroup, $rankgraph, $acstr ) = @_;
 949
 950         # Sort by group size and return
 951         my $is_useful = 0;
 952         foreach my $rdg ( keys %$rankgroup ) {
 953                 my @wits = $rankgroup->{$rdg}->members;
 954                 if( @wits > 1 ) {
 955                         $is_useful++;
 956                 } else {
 957                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 958                                 || $wits[0] =~ /\Q$acstr\E$/ );
 959                 }
 960         }
 961         return $is_useful > 1;
 962 }
 963
 964 =head2 wit_stringify( $groups )
 965
 966 Takes an array of witness groupings and produces a string like
 967 ['A','B'] / ['C','D','E'] / ['F']
 968
 969 =cut
 970
 971 sub wit_stringify {
 972     my $groups = shift;
 973     my @gst;
 974     # If we were passed an array of witnesses instead of an array of
 975     # groupings, then "group" the witnesses first.
 976     unless( ref( $groups->[0] ) ) {
 977         my $mkgrp = [ $groups ];
 978         $groups = $mkgrp;
 979     }
 980     foreach my $g ( @$groups ) {
 981         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 982     }
 983     return join( ' / ', @gst );
 984 }
 985
 986 1;
 987
 988 sub throw {
 989         Text::Tradition::Error->throw(
 990                 'ident' => 'Analysis error',
 991                 'message' => $_[0],
 992         );
 993 }
 994
 995 =head1 LICENSE
 996
 997 This package is free software and is provided "as is" without express
 998 or implied warranty.  You can redistribute it and/or modify it under
 999 the same terms as Perl itself.
1000
1001 =head1 AUTHOR
1002
1003 Tara L Andrews E<lt>aurum@cpan.orgE<gt>