analysis/lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Encode qw/ decode_utf8 encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ to_json decode_json /;
  10 use LWP::UserAgent;
  11 use Set::Scalar;
  12 use Text::Tradition::Analysis::Result;
  13 use Text::Tradition::Directory;
  14 use Text::Tradition::Stemma;
  15 use TryCatch;
  16
  17 use vars qw/ @EXPORT_OK $VERSION /;
  18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  19 $VERSION = "1.0";
  20
  21
  22 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  23 my $unsolved_problems = {};
  24
  25 =head1 NAME
  26
  27 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  28
  29 =head1 SYNOPSIS
  30
  31   use Text::Tradition;
  32   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  33   my $t = Text::Tradition->new(
  34     'name' => 'this is a text',
  35     'input' => 'TEI',
  36     'file' => '/path/to/tei_parallel_seg_file.xml' );
  37   $t->enable_stemmata;
  38   $t->add_stemma( 'dotfile' => $stemmafile );
  39
  40   my $variant_data = run_analysis( $tradition );
  41   # Recalculate rank $n treating all orthographic variants as equivalent
  42   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  43
  44 =head1 DESCRIPTION
  45
  46 Text::Tradition is a library for representation and analysis of collated
  47 texts, particularly medieval ones.  Where the Collation is the central feature of
  48 a Tradition, it may also have one or more temmata associated with it, and these stemmata may be analyzed. This package provides the following modules:
  49
  50 =over 4
  51
  52 =item * L<Text::Tradition::HasStemma> - a role that can be composed into Text::Tradition objects, providing the ability for Text::Tradition::Stemma objects to be associated with them.
  53
  54 =item * L<Text::Tradition::Stemma> - an object class that represents stemma hypotheses, both rooted (with a single archetype) and unrooted (e.g. phylogenetic trees).
  55
  56 =item * Text::Tradition::Analysis (this package). Provides functions for the analysis of a given stemma against the collation within a given Tradition.
  57
  58 =back
  59
  60 =head1 SUBROUTINES
  61
  62 =head2 run_analysis( $tradition, %opts )
  63
  64 Runs the analysis described in analyze_variant_location on every location in the
  65 collation of the given tradition, with the given options. These include:
  66
  67 =over 4
  68
  69 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  70 is 0 (i.e. the first).
  71
  72 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  73
  74 =item * merge_types - Specify a list of relationship types, where related readings
  75 should be treated as identical for the purposes of analysis.
  76
  77 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  78
  79 =back
  80
  81 =begin testing
  82
  83 use Text::Tradition;
  84 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  85
  86 my $datafile = 't/data/florilegium_tei_ps.xml';
  87 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  88                                       'name' => 'test0',
  89                                       'file' => $datafile );
  90 $tradition->enable_stemmata;
  91 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  92 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  93
  94 my %expected_genealogical = (
  95         1 => 0,
  96         2 => 1,
  97         3 =>  0,
  98         5 =>  0,
  99         7 =>  0,
 100         8 =>  0,
 101         10 => 0,
 102         13 => 1,
 103         33 => 0,
 104         34 => 0,
 105         37 => 0,
 106         60 => 0,
 107         81 => 1,
 108         84 => 0,
 109         87 => 0,
 110         101 => 0,
 111         102 => 0,
 112         122 => 1,
 113         157 => 0,
 114         166 => 1,
 115         169 => 1,
 116         200 => 0,
 117         216 => 1,
 118         217 => 1,
 119         219 => 1,
 120         241 => 1,
 121         242 => 1,
 122         243 => 1,
 123 );
 124
 125 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 126 my $c = $tradition->collation;
 127 foreach my $row ( @{$data->{'variants'}} ) {
 128         # Account for rows that used to be "not useful"
 129         unless( exists $expected_genealogical{$row->{'id'}} ) {
 130                 $expected_genealogical{$row->{'id'}} = 1;
 131         }
 132         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 133         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 134                 "Got correct genealogical flag for row " . $row->{'id'} );
 135         # Check that we have the right row with the right groups
 136         my $rank = $row->{'id'};
 137         foreach my $rdghash ( @{$row->{'readings'}} ) {
 138                 # Skip 'readings' that aren't really
 139                 next unless $c->reading( $rdghash->{'readingid'} );
 140                 # Check the rank
 141                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 142                         "Got correct reading rank" );
 143                 # Check the witnesses
 144                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 145                 my @sgrp = sort @{$rdghash->{'group'}};
 146                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 147         }
 148 }
 149 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 150 # TODO Make something meaningful of conflict count, maybe test other bits
 151
 152 =end testing
 153
 154 =cut
 155
 156 sub run_analysis {
 157         my( $tradition, %opts ) = @_;
 158         my $c = $tradition->collation;
 159         my $aclabel = $c->ac_label;
 160
 161         my $stemma_id = $opts{'stemma_id'} || 0;
 162         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 163         my $collapse = Set::Scalar->new();
 164         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 165                 $collapse->insert( @{$opts{'merge_types'}} );
 166         } elsif( $opts{'merge_types'} ) {
 167                 $collapse->insert( $opts{'merge_types'} );
 168         }
 169
 170         # Make sure we have a lookup DB for graph calculation results; this will die
 171         # if calcdir or calcdsn isn't passed.
 172         my $dir;
 173         if( exists $opts{'calcdir'} ) {
 174                 $dir = delete $opts{'calcdir'}
 175         } elsif ( exists $opts{'calcdsn'} ) {
 176                 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 177         }
 178
 179         # Get the stemma
 180         my $stemma = $tradition->stemma( $stemma_id );
 181
 182         # Figure out which witnesses we are working with - that is, the ones that
 183         # appear both in the stemma and in the tradition. All others are 'lacunose'
 184         # for our purposes.
 185         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 186         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 187         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 188         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 189
 190         # Find and mark 'common' ranks for exclusion, unless they were
 191         # explicitly specified.
 192         unless( @ranks ) {
 193                 my %common_rank;
 194                 foreach my $rdg ( $c->common_readings ) {
 195                         $common_rank{$rdg->rank} = 1;
 196                 }
 197                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 198         }
 199
 200         # Group the variants to send to the solver
 201         my @groups;
 202         my @use_ranks;
 203         my %lacunae;
 204         my $moved = {};
 205         foreach my $rank ( @ranks ) {
 206                 my $missing = $lacunose->clone();
 207                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 208                 # Filter out any empty rankgroups
 209                 # (e.g. from the later rank for a transposition)
 210                 next unless keys %$rankgroup;
 211                 # Get the graph for this rankgroup
 212                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 213                 if( $opts{'exclude_type1'} ) {
 214                         # Check to see whether this is a "useful" group.
 215                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 216                 }
 217                 push( @use_ranks, $rank );
 218                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 219                 $lacunae{$rank} = $missing;
 220         }
 221         # Run the solver
 222         my $answer;
 223         try {
 224                 $answer = solve_variants( $dir, @groups );
 225         } catch ( Text::Tradition::Error $e ) {
 226                 if( $e->message =~ /IDP/ ) {
 227                         # Something is wrong with the solver; make the variants table anyway
 228                         $answer->{'variants'} = [];
 229                         map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
 230                                 @groups;
 231                 } else {
 232                         # Something else is wrong; error out.
 233                         $e->throw;
 234                 }
 235         }
 236
 237         # Do further analysis on the answer
 238         my $conflict_count = 0;
 239         my $reversion_count = 0;
 240         foreach my $idx ( 0 .. $#use_ranks ) {
 241                 my $location = $answer->{'variants'}->[$idx];
 242                 # Add the rank back in
 243                 my $rank = $use_ranks[$idx];
 244                 $location->{'id'} = $rank;
 245                 # Note what our lacunae are
 246                 my %lmiss;
 247                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 248                 $location->{'missing'} = [ keys %lmiss ];
 249
 250                 # Run the extra analysis we need.
 251                 ## TODO We run through all the variants in this call, so
 252                 ## why not add the reading data there instead of here below?
 253                 my $graph = $groups[$idx]->{graph};
 254                 analyze_location( $tradition, $graph, $location, \%lmiss );
 255
 256                 my @layerwits;
 257                 # Do the final post-analysis tidying up of the data.
 258                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 259                         $conflict_count++ if $rdghash->{'is_conflict'};
 260                         $reversion_count++ if $rdghash->{'is_reverted'};
 261                         # Add the reading text back in, setting display value as needed
 262                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 263                         if( $rdg ) {
 264                                 $rdghash->{'text'} = $rdg->text .
 265                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 266                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 267                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 268                         }
 269                         # Remove lacunose witnesses from this reading's list now that the
 270                         # analysis is done
 271                         my @realgroup;
 272                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 273                         $rdghash->{'group'} = \@realgroup;
 274                         # Note any layered witnesses that appear in this group
 275                         foreach( @realgroup ) {
 276                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 277                                         push( @layerwits, $1 );
 278                                 }
 279                         }
 280                 }
 281                 $location->{'layerwits'} = \@layerwits if @layerwits;
 282         }
 283         $answer->{'conflict_count'} = $conflict_count;
 284         $answer->{'reversion_count'} = $reversion_count;
 285
 286         return $answer;
 287 }
 288
 289 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 290
 291 Groups the variants at the given $rank of the collation, treating any
 292 relationships in the set $merge_relationship_types as equivalent.
 293 $lacunose should be a reference to an array, to which the sigla of lacunose
 294 witnesses at this rank will be appended; $transposed should be a reference
 295 to a hash, wherein the identities of transposed readings and their
 296 relatives will be stored.
 297
 298 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 299 in $group_readings->{$rdg}.
 300
 301 =cut
 302
 303 # Return group_readings, groups, lacunose
 304 sub group_variants {
 305         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 306         my $c = $tradition->collation;
 307         my $aclabel = $c->ac_label;
 308         my $table = $c->alignment_table;
 309         # Get the alignment table readings
 310         my %readings_at_rank;
 311         my $check_for_gaps = Set::Scalar->new();
 312         my %moved_wits;
 313         my $has_transposition;
 314         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 315                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 316                 my $wit = $tablewit->{'witness'};
 317                 # Exclude the witness if it is "lacunose" which if we got here
 318                 # means "not in the stemma".
 319                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 320                 # Note if the witness is actually in a lacuna
 321                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 322                         _add_to_witlist( $wit, $lacunose, $aclabel );
 323                 # Otherwise the witness either has a positive reading...
 324                 } elsif( $rdg ) {
 325                         # If the reading has been counted elsewhere as a transposition, ignore it.
 326                         if( $transposed->{$rdg->{'t'}->id} ) {
 327                                 # TODO Does this cope with three-way transpositions?
 328                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 329                                 next;
 330                         }
 331                         # Otherwise, record it...
 332                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 333                         # ...and grab any transpositions, and their relations.
 334                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 335                         foreach my $trdg ( @transp ) {
 336                                 next if exists $readings_at_rank{$trdg->id};
 337                                 $has_transposition = 1;
 338                                 my @affected_wits = _table_witnesses(
 339                                         $table, $trdg, $lacunose, $aclabel );
 340                                 next unless @affected_wits;
 341                                 map { $moved_wits{$_} = 1 } @affected_wits;
 342                                 $transposed->{$trdg->id} =
 343                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 344                                 $readings_at_rank{$trdg->id} = $trdg;
 345                         }
 346                 # ...or it is empty, ergo a gap.
 347                 } else {
 348                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 349                 }
 350         }
 351         my $gap_wits = Set::Scalar->new();
 352         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 353                 unless $moved_wits{$_} } $check_for_gaps->members;
 354
 355         # Group the readings, collapsing groups by relationship if needed.
 356         my $grouped_readings = {};
 357         foreach my $rdg ( values %readings_at_rank ) {
 358                 # Skip readings that have been collapsed into others.
 359                 next if exists $grouped_readings->{$rdg->id}
 360                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 361                 # Get the witness list, including from readings collapsed into this one.
 362                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 363                 if( $collapse && $collapse->size ) {
 364                         my $filter = sub { $collapse->has( $_[0]->type ) };
 365                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 366                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 367                                 push( @wits, @otherwits );
 368                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 369                         }
 370                 }
 371                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 372         }
 373         if( $gap_wits->members ) {
 374                 $grouped_readings->{'(omitted)'} = $gap_wits;
 375         }
 376
 377         # Get rid of our collapsed readings
 378         map { delete $grouped_readings->{$_} if(
 379                          $grouped_readings->{$_} eq 'COLLAPSE'
 380                          || $grouped_readings->{$_}->is_empty ) }
 381                 keys %$grouped_readings;
 382
 383         # If something was transposed, check the groups for doubled-up readings
 384         if( $has_transposition ) {
 385                 # print STDERR "Group for rank $rank:\n";
 386                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 387                 #       keys %$grouped_readings;
 388                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 389         }
 390
 391         # Return the result
 392         return $grouped_readings;
 393 }
 394
 395 # Helper function to query the alignment table for all witnesses (a.c. included)
 396 # that have a given reading at its rank.
 397 sub _table_witnesses {
 398         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 399         my $tableidx = $trdg->rank - 1;
 400         my $has_reading = Set::Scalar->new();
 401         foreach my $row ( @{$table->{'alignment'}} ) {
 402                 my $wit = $row->{'witness'};
 403                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 404                 my $rdg = $row->{'tokens'}->[$tableidx];
 405                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 406                 _add_to_witlist( $wit, $has_reading, $aclabel )
 407                         if $rdg->{'t'}->id eq $trdg->id;
 408         }
 409         return $has_reading->members;
 410 }
 411
 412 # Helper function to see if a witness is lacunose even if we are asking about
 413 # the a.c. version
 414 sub _is_lacunose {
 415         my ( $wit, $lac, $acstr ) = @_;
 416         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 417                 $wit = $1;
 418         }
 419         return $lac->has( $wit );
 420 }
 421
 422 # Helper function to ensure that X and X a.c. never appear in the same list.
 423 sub _add_to_witlist {
 424         my( $wit, $list, $acstr ) = @_;
 425         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 426                 # Don't add X a.c. if we already have X
 427                 return if $list->has( $1 );
 428         } else {
 429                 # Delete X a.c. if we are about to add X
 430                 $list->delete( $wit.$acstr );
 431         }
 432         $list->insert( $wit );
 433 }
 434
 435 sub _check_transposed_consistency {
 436         my( $c, $rank, $transposed, $groupings ) = @_;
 437         my %seen_wits;
 438         my %thisrank;
 439         # Note which readings are actually at this rank, and which witnesses
 440         # belong to which reading.
 441         foreach my $rdg ( keys %$groupings ) {
 442                 my $rdgobj = $c->reading( $rdg );
 443                 # Count '(omitted)' as a reading at this rank
 444                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 445                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 446         }
 447         # Our work is done if we have no witness belonging to more than one
 448         # reading.
 449         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 450         return unless @doubled;
 451         # If we have a symmetric related transposition, drop the non-rank readings.
 452         if( @doubled == scalar keys %seen_wits ) {
 453                 foreach my $rdg ( keys %$groupings ) {
 454                         if( !$thisrank{$rdg} ) {
 455                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 456                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 457                                         keys %thisrank;
 458                                 delete $groupings->{$rdg};
 459                                 # If we found a group match, assume there is a symmetry happening.
 460                                 # TODO think more about this
 461                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 462                                 unless( $matched ) {
 463                                         delete $transposed->{$rdg};
 464                                         warn "Found problem in evident symmetry with reading $rdg";
 465                                 }
 466                         }
 467                 }
 468         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 469         } else {
 470                 foreach my $dup ( @doubled ) {
 471                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 472                                 next if $thisrank{$rdg};
 473                                 next unless exists $groupings->{$rdg};
 474                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 475                                 delete $groupings->{$rdg};
 476                                 delete $transposed->{$rdg};
 477                         }
 478                 }
 479                 # and put any now-orphaned readings into an 'omitted' reading.
 480                 foreach my $wit ( keys %seen_wits ) {
 481                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 482                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 483                                          unless exists $groupings->{'(omitted)'};
 484                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 485                         }
 486                 }
 487         }
 488 }
 489
 490 # For the given grouping, return its situation graph based on the stemma.
 491 sub _graph_for_grouping {
 492         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 493         my $acwits = [];
 494         my $extant = {};
 495         foreach my $gs ( values %$grouping ) {
 496                 map {
 497                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 498                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 499                         } else {
 500                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 501                         }
 502                 } $gs->members;
 503         }
 504         my $graph;
 505         try {
 506                 # contig contains all extant wits and all hypothetical wits
 507                 # needed to make up the groups.
 508                 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
 509         } catch ( Text::Tradition::Error $e ) {
 510                 throw( "Could not extend graph with given extant and a.c. witnesses: "
 511                         . $e->message );
 512         } catch {
 513                 throw( "Could not extend graph with a.c. witnesses @$acwits" );
 514         }
 515         return $graph;
 516 }
 517
 518 =head2 solve_variants( $calcdir, @groups )
 519
 520 Looks up the set of groups in the answers provided by the external graph solver
 521 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 522
 523 The answer has the form
 524   { "variants" => [ array of variant location structures ],
 525     "variant_count" => total,
 526     "conflict_count" => number of conflicts detected,
 527     "genealogical_count" => number of solutions found }
 528
 529 =cut
 530
 531 sub solve_variants {
 532         my( @groups ) = @_;
 533
 534         # Are we using a local result directory, or did we pass an empty value
 535         # for the directory?
 536         my $dir;
 537         unless( ref( $groups[0] ) eq 'HASH' ) {
 538                 $dir = shift @groups;
 539         }
 540
 541         ## For each graph/group combo, make a Text::Tradition::Analysis::Result
 542         ## object so that we can send it off for IDP lookup.
 543         my $variants = [];
 544         my $genealogical = 0; # counter
 545         # TODO Optimize for unique graph problems
 546         my %problems;
 547         foreach my $graphproblem ( @groups ) {
 548                 # Construct the calc result key and look up its answer
 549                 my $problem = Text::Tradition::Analysis::Result->new(
 550                         graph => $graphproblem->{'graph'},
 551                         setlist => [ values %{$graphproblem->{'grouping'}} ] );
 552                 if( exists $problems{$problem->object_key} ) {
 553                         $problem = $problems{$problem->object_key};
 554                 } else {
 555                         $problems{$problem->object_key} = $problem;
 556                 }
 557                 $graphproblem->{'object'} = $problem;
 558         }
 559
 560         my %results;
 561         if( $dir ) {
 562                 my $scope = $dir->new_scope;
 563                 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
 564         } else {
 565                 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
 566                         [ values %problems ] );
 567                 # Send it off and get the result
 568                 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
 569                 my $ua = LWP::UserAgent->new();
 570                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 571                                                           'Content' => $json );
 572                 my $answer;
 573                 if( $resp->is_success ) {
 574                         $answer = decode_json( $resp->content );
 575                         throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
 576                 } else {
 577                         throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
 578                                 . "; cannot run graph analysis" );
 579                 }
 580                 # One more sanity check
 581                 throw( "Something went wrong with answer symmetricity" )
 582                         unless keys( %problems ) == @$answer;
 583                 # Convert the results
 584                 foreach my $a ( @$answer ) {
 585                         my $r = Text::Tradition::Analysis::Result->new( $a );
 586                         $results{$r->object_key} = $r;
 587                 }
 588         }
 589
 590         # We now have a single JSON-encoded Result object per problem sent. Fold its
 591         # answers into our variant info structure.
 592         foreach my $graphproblem ( @groups ) {
 593                 my $result = $results{$graphproblem->{'object'}->object_key}
 594                         || $graphproblem->{'object'};
 595
 596                 # Initialize the result structure for this graph problem
 597                 my $vstruct;
 598                 if( $result->status eq 'OK' ) {
 599                         $vstruct = { readings => [] };
 600                         push( @$variants, $vstruct );
 601                 } else {
 602                         push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
 603                         next;
 604                 }
 605
 606                 # 1. Did the group evaluate as genealogical?
 607                 $vstruct->{genealogical} = $result->is_genealogical;
 608                 $genealogical++ if $result->is_genealogical;
 609
 610                 # 2. What are the calculated minimum groupings for each variant loc?
 611                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 612                         my $inputset = $graphproblem->{grouping}->{$rid};
 613                         my $minset = $result->minimum_grouping_for( $inputset );
 614                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 615                 }
 616
 617                 # 3. What are the sources and classes calculated for each witness?
 618                 $vstruct->{witcopy_types} = { $result->classes };
 619                 $vstruct->{reading_roots} = {};
 620                 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
 621
 622         }
 623
 624         return { 'variants' => $variants,
 625                          'variant_count' => scalar @$variants,
 626                          'genealogical_count' => $genealogical };
 627 }
 628
 629 sub _init_unsolved {
 630         my( $graphproblem, $status ) = @_;
 631         my $vstruct = { 'readings' => [] };
 632         $vstruct->{'unsolved'} = $status;
 633         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 634                 push( @{$vstruct->{readings}}, { readingid => $rid,
 635                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 636         }
 637         return $vstruct;
 638 }
 639
 640 =head2 analyze_location ( $tradition, $graph, $location_hash )
 641
 642 Given the tradition, its stemma graph, and the solution from the graph solver,
 643 work out the rest of the information we want.  For each reading we need missing,
 644 conflict, reading_parents, independent_occurrence, followed, not_followed,
 645 and follow_unknown.  Alters the location_hash in place.
 646
 647 =cut
 648
 649 sub analyze_location {
 650         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 651         my $c = $tradition->collation;
 652
 653         if( exists $variant_row->{'unsolved'} ) {
 654                 return;
 655         }
 656         my $reading_roots = delete $variant_row->{'reading_roots'};
 657         my $classinfo = delete $variant_row->{'witcopy_types'};
 658
 659         # Make a hash of all known node memberships, and make the subgraphs.
 660         my $contig = {};
 661         my $subgraph = {};
 662         my $acstr = $c->ac_label;
 663         my @acwits;
 664
 665         # Note which witnesses positively belong to which group. This information
 666         # comes ultimately from the IDP solver.
 667         # Also make a note of the reading's roots.
 668     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 669         my $rid = $rdghash->{'readingid'};
 670         my @roots;
 671         foreach my $wit ( @{$rdghash->{'group'}} ) {
 672                 $contig->{$wit} = $rid;
 673             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 674                 push( @acwits, $1 );
 675             }
 676             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 677                 push( @roots, $wit );
 678             }
 679         }
 680                 $rdghash->{'independent_occurrence'} = \@roots;
 681         }
 682
 683         # Now that we have all the node group memberships, calculate followed/
 684     # non-followed/unknown values for each reading.  Also figure out the
 685     # reading's evident parent(s).
 686     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 687         my $rid = $rdghash->{'readingid'};
 688         my $rdg = $c->reading( $rid );
 689         my @roots = @{$rdghash->{'independent_occurrence'}};
 690         my @reversions;
 691         if( $classinfo ) {
 692                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 693                         $rdghash->{'group'}->members;
 694                 $rdghash->{'reversions'} = \@reversions;
 695         }
 696         my @group = @{$rdghash->{'group'}};
 697
 698         # Start figuring things out.
 699         $rdghash->{'followed'} = scalar( @group )
 700                 - ( scalar( @roots ) + scalar( @reversions ) );
 701         # Find the parent readings, if any, of this reading.
 702         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 703                 # Work out relationships between readings and their non-followed parent.
 704                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 705                 $rdghash->{'source_parents'} = $sourceparents;
 706
 707                 if( @reversions ) {
 708                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 709                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 710                         $rdghash->{'reversion_parents'} = $revparents;
 711                 }
 712
 713                 # Find the number of times this reading was altered, and the number of
 714                 # times we're not sure.
 715                 my( %nofollow, %unknownfollow );
 716                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 717                         foreach my $wchild ( $graph->successors( $wit ) ) {
 718                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 719                                         && $contig->{$wchild} ne $rid ) {
 720                                         # It definitely changed here.
 721                                         $nofollow{$wchild} = 1;
 722                                 } elsif( !($contig->{$wchild}) ) {
 723                                         # The child is a hypothetical node not definitely in
 724                                         # any group. Answer is unknown.
 725                                         $unknownfollow{$wchild} = 1;
 726                                 } # else it is either in our group, or it is a non-root node in a
 727                                   # known group and therefore is presumed to have its reading from
 728                                   # its group, not this link.
 729                         }
 730                 }
 731                 $rdghash->{'not_followed'} = keys %nofollow;
 732                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 733
 734                 # Now say whether this reading represents a conflict.
 735                 unless( $variant_row->{'genealogical'} ) {
 736                         $rdghash->{'is_conflict'} = @roots != 1;
 737                         $rdghash->{'is_reverted'} = scalar @reversions;
 738                 }
 739     }
 740 }
 741
 742 sub _find_reading_parents {
 743         my( $rid, $graph, $contig, @list ) = @_;
 744         my $parenthash = {};
 745         foreach my $wit ( @list ) {
 746                 # Look in the stemma graph to find this witness's extant or known-reading
 747                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 748                 my @check = $graph->predecessors( $wit );
 749                 while( @check ) {
 750                         my @next;
 751                         foreach my $wparent( @check ) {
 752                                 my $preading = $contig->{$wparent};
 753                                 if( $preading && $preading ne $rid ) {
 754                                         $parenthash->{$preading} = 1;
 755                                 } else {
 756                                         push( @next, $graph->predecessors( $wparent ) );
 757                                 }
 758                         }
 759                         @check = @next;
 760                 }
 761         }
 762         return $parenthash;
 763 }
 764
 765 sub _resolve_parent_relationships {
 766         my( $c, $rid, $rdg, $rdgparents ) = @_;
 767         foreach my $p ( keys %$rdgparents ) {
 768                 # Resolve the relationship of the parent to the reading, and
 769                 # save it in our hash.
 770                 my $pobj = $c->reading( $p );
 771                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 772                 my $phash = { 'label' => $prep };
 773                 if( $pobj ) {
 774                         my $rel = $c->get_relationship( $p, $rid );
 775                         if( $rel ) {
 776                                 _add_to_hash( $rel, $phash );
 777                         } elsif( $rdg ) {
 778                                 # First check for a transposed relationship
 779                                 if( $rdg->rank != $pobj->rank ) {
 780                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 781                                                 next unless $ti->text eq $rdg->text;
 782                                                 $rel = $c->get_relationship( $ti, $pobj );
 783                                                 if( $rel ) {
 784                                                         _add_to_hash( $rel, $phash, 1 );
 785                                                         last;
 786                                                 }
 787                                         }
 788                                         unless( $rel ) {
 789                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 790                                                         next unless $ti->text eq $pobj->text;
 791                                                         $rel = $c->get_relationship( $ti, $rdg );
 792                                                         if( $rel ) {
 793                                                                 _add_to_hash( $rel, $phash, 1 );
 794                                                                 last;
 795                                                         }
 796                                                 }
 797                                         }
 798                                 }
 799                                 unless( $rel ) {
 800                                         # and then check for sheer word similarity.
 801                                         my $rtext = $rdg->text;
 802                                         my $ptext = $pobj->text;
 803                                         if( similar( $rtext, $ptext ) ) {
 804                                                 # say STDERR "Words $rtext and $ptext judged similar";
 805                                                 $phash->{relation} = { type => 'wordsimilar' };
 806                                         }
 807                                 }
 808                         } else {
 809                                 $phash->{relation} = { type => 'deletion' };
 810                         }
 811                         # Get the attributes of the parent object while we are here
 812                         $phash->{'text'} = $pobj->text if $pobj;
 813                         $phash->{'is_nonsense'} = $pobj->is_nonsense;
 814                         $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 815                 } elsif( $p eq '(omitted)' ) {
 816                         $phash->{relation} = { type => 'addition' };
 817                 }
 818                 # Save it
 819                 $rdgparents->{$p} = $phash;
 820         }
 821 }
 822
 823 sub _add_to_hash {
 824         my( $rel, $phash, $is_transposed ) = @_;
 825         $phash->{relation} = { type => $rel->type };
 826         $phash->{relation}->{transposed} = 1 if $is_transposed;
 827         $phash->{relation}->{annotation} = $rel->annotation
 828                 if $rel->has_annotation;
 829 }
 830
 831 =head2 similar( $word1, $word2 )
 832
 833 Use Algorithm::Diff to get a sense of how close the words are to each other.
 834 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 835
 836 =cut
 837
 838 #!/usr/bin/env perl
 839
 840 sub similar {
 841         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 842         my @let1 = split( '', lc( $word1 ) );
 843         my @let2 = split( '', lc( $word2 ) );
 844         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 845         my $mag = 0;
 846         while( $diff->Next ) {
 847                 if( $diff->Same ) {
 848                         # Take off points for longer strings
 849                         my $cs = $diff->Range(1) - 2;
 850                         $cs = 0 if $cs < 0;
 851                         $mag -= $cs;
 852                 } elsif( !$diff->Items(1) ) {
 853                         $mag += $diff->Range(2);
 854                 } elsif( !$diff->Items(2) ) {
 855                         $mag += $diff->Range(1);
 856                 } else {
 857                         # Split the difference for substitutions
 858                         my $c1 = $diff->Range(1) || 1;
 859                         my $c2 = $diff->Range(2) || 1;
 860                         my $cd = ( $c1 + $c2 ) / 2;
 861                         $mag += $cd;
 862                 }
 863         }
 864         return ( $mag <= length( $word1 ) / 2 );
 865 }
 866
 867 sub _useful_variant {
 868         my( $rankgroup, $rankgraph, $acstr ) = @_;
 869
 870         # Sort by group size and return
 871         my $is_useful = 0;
 872         foreach my $rdg ( keys %$rankgroup ) {
 873                 my @wits = $rankgroup->{$rdg}->members;
 874                 if( @wits > 1 ) {
 875                         $is_useful++;
 876                 } else {
 877                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 878                                 || $wits[0] =~ /\Q$acstr\E$/ );
 879                 }
 880         }
 881         return $is_useful > 1;
 882 }
 883
 884 =head2 wit_stringify( $groups )
 885
 886 Takes an array of witness groupings and produces a string like
 887 ['A','B'] / ['C','D','E'] / ['F']
 888
 889 =cut
 890
 891 sub wit_stringify {
 892     my $groups = shift;
 893     my @gst;
 894     # If we were passed an array of witnesses instead of an array of
 895     # groupings, then "group" the witnesses first.
 896     unless( ref( $groups->[0] ) ) {
 897         my $mkgrp = [ $groups ];
 898         $groups = $mkgrp;
 899     }
 900     foreach my $g ( @$groups ) {
 901         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 902     }
 903     return join( ' / ', @gst );
 904 }
 905
 906 1;
 907
 908 sub throw {
 909         Text::Tradition::Error->throw(
 910                 'ident' => 'Analysis error',
 911                 'message' => $_[0],
 912         );
 913 }
 914
 915 =head1 LICENSE
 916
 917 This package is free software and is provided "as is" without express
 918 or implied warranty.  You can redistribute it and/or modify it under
 919 the same terms as Perl itself.
 920
 921 =head1 AUTHOR
 922
 923 Tara L Andrews E<lt>aurum@cpan.orgE<gt>