analysis/lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Encode qw/ decode_utf8 encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ to_json decode_json /;
  10 use LWP::UserAgent;
  11 use Set::Scalar;
  12 use Text::Tradition::Analysis::Result;
  13 use Text::Tradition::Directory;
  14 use Text::Tradition::Stemma;
  15 use TryCatch;
  16
  17 use vars qw/ @EXPORT_OK $VERSION /;
  18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  19 $VERSION = "1.1";
  20
  21
  22 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  23 my $unsolved_problems = {};
  24
  25 =head1 NAME
  26
  27 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  28
  29 =head1 SYNOPSIS
  30
  31   use Text::Tradition;
  32   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  33   my $t = Text::Tradition->new(
  34     'name' => 'this is a text',
  35     'input' => 'TEI',
  36     'file' => '/path/to/tei_parallel_seg_file.xml' );
  37   $t->add_stemma( 'dotfile' => $stemmafile );
  38
  39   my $variant_data = run_analysis( $tradition );
  40   # Recalculate rank $n treating all orthographic variants as equivalent
  41   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  42
  43 =head1 DESCRIPTION
  44
  45 Text::Tradition is a library for representation and analysis of collated
  46 texts, particularly medieval ones.  Where the Collation is the central feature of
  47 a Tradition, it may also have one or more temmata associated with it, and these stemmata may be analyzed. This package provides the following modules:
  48
  49 =over 4
  50
  51 =item * L<Text::Tradition::HasStemma> - a role that can be composed into Text::Tradition objects, providing the ability for Text::Tradition::Stemma objects to be associated with them.
  52
  53 =item * L<Text::Tradition::Stemma> - an object class that represents stemma hypotheses, both rooted (with a single archetype) and unrooted (e.g. phylogenetic trees).
  54
  55 =item * Text::Tradition::Analysis (this package). Provides functions for the analysis of a given stemma against the collation within a given Tradition.
  56
  57 =back
  58
  59 =head1 SUBROUTINES
  60
  61 =head2 run_analysis( $tradition, %opts )
  62
  63 Runs the analysis described in analyze_variant_location on every location in the
  64 collation of the given tradition, with the given options. These include:
  65
  66 =over 4
  67
  68 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  69 is 0 (i.e. the first).
  70
  71 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  72
  73 =item * merge_types - Specify a list of relationship types, where related readings
  74 should be treated as identical for the purposes of analysis.
  75
  76 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  77
  78 =back
  79
  80 =begin testing
  81
  82 use Text::Tradition;
  83 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  84
  85 my $datafile = 't/data/florilegium_tei_ps.xml';
  86 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  87                                       'name' => 'test0',
  88                                       'file' => $datafile );
  89 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  90 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  91
  92 my %expected_genealogical = (
  93         1 => 0,
  94         2 => 1,
  95         3 =>  0,
  96         5 =>  0,
  97         7 =>  0,
  98         8 =>  0,
  99         10 => 0,
 100         13 => 1,
 101         33 => 0,
 102         34 => 0,
 103         37 => 0,
 104         60 => 0,
 105         81 => 1,
 106         84 => 0,
 107         87 => 0,
 108         101 => 0,
 109         102 => 0,
 110         122 => 1,
 111         157 => 0,
 112         166 => 1,
 113         169 => 1,
 114         200 => 0,
 115         216 => 1,
 116         217 => 1,
 117         219 => 1,
 118         241 => 1,
 119         242 => 1,
 120         243 => 1,
 121 );
 122
 123 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 124 my $c = $tradition->collation;
 125 foreach my $row ( @{$data->{'variants'}} ) {
 126         # Account for rows that used to be "not useful"
 127         unless( exists $expected_genealogical{$row->{'id'}} ) {
 128                 $expected_genealogical{$row->{'id'}} = 1;
 129         }
 130         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 131         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 132                 "Got correct genealogical flag for row " . $row->{'id'} );
 133         # Check that we have the right row with the right groups
 134         my $rank = $row->{'id'};
 135         foreach my $rdghash ( @{$row->{'readings'}} ) {
 136                 # Skip 'readings' that aren't really
 137                 next unless $c->reading( $rdghash->{'readingid'} );
 138                 # Check the rank
 139                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 140                         "Got correct reading rank" );
 141                 # Check the witnesses
 142                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 143                 my @sgrp = sort @{$rdghash->{'group'}};
 144                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 145         }
 146 }
 147 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 148 # TODO Make something meaningful of conflict count, maybe test other bits
 149
 150 =end testing
 151
 152 =cut
 153
 154 sub run_analysis {
 155         my( $tradition, %opts ) = @_;
 156         my $c = $tradition->collation;
 157         my $aclabel = $c->ac_label;
 158
 159         my $stemma_id = $opts{'stemma_id'} || 0;
 160         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 161         my $collapse = Set::Scalar->new();
 162         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 163                 $collapse->insert( @{$opts{'merge_types'}} );
 164         } elsif( $opts{'merge_types'} ) {
 165                 $collapse->insert( $opts{'merge_types'} );
 166         }
 167
 168         # Make sure we have a lookup DB for graph calculation results; this will die
 169         # if calcdir or calcdsn isn't passed.
 170         my $dir;
 171         if( exists $opts{'calcdir'} ) {
 172                 $dir = delete $opts{'calcdir'}
 173         } elsif ( exists $opts{'calcdsn'} ) {
 174                 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 175         }
 176
 177         # Get the stemma
 178         my $stemma = $tradition->stemma( $stemma_id );
 179
 180         # Figure out which witnesses we are working with - that is, the ones that
 181         # appear both in the stemma and in the tradition. All others are 'lacunose'
 182         # for our purposes.
 183         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 184         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 185         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 186         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 187
 188         # Find and mark 'common' ranks for exclusion, unless they were
 189         # explicitly specified.
 190         unless( @ranks ) {
 191                 my %common_rank;
 192                 foreach my $rdg ( $c->common_readings ) {
 193                         $common_rank{$rdg->rank} = 1;
 194                 }
 195                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 196         }
 197
 198         # Group the variants to send to the solver
 199         my @groups;
 200         my @use_ranks;
 201         my %lacunae;
 202         my $moved = {};
 203         foreach my $rank ( @ranks ) {
 204                 my $missing = $lacunose->clone();
 205                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 206                 # Filter out any empty rankgroups
 207                 # (e.g. from the later rank for a transposition)
 208                 next unless keys %$rankgroup;
 209                 # Get the graph for this rankgroup
 210                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 211                 if( $opts{'exclude_type1'} ) {
 212                         # Check to see whether this is a "useful" group.
 213                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 214                 }
 215                 push( @use_ranks, $rank );
 216                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 217                 $lacunae{$rank} = $missing;
 218         }
 219         # Run the solver
 220         my $answer;
 221         try {
 222                 $answer = solve_variants( $dir, @groups );
 223         } catch ( Text::Tradition::Error $e ) {
 224                 if( $e->message =~ /IDP/ ) {
 225                         # Something is wrong with the solver; make the variants table anyway
 226                         $answer->{'variants'} = [];
 227                         map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
 228                                 @groups;
 229                 } else {
 230                         # Something else is wrong; error out.
 231                         $e->throw;
 232                 }
 233         }
 234
 235         # Do further analysis on the answer
 236         my $conflict_count = 0;
 237         my $reversion_count = 0;
 238         foreach my $idx ( 0 .. $#use_ranks ) {
 239                 my $location = $answer->{'variants'}->[$idx];
 240                 # Add the rank back in
 241                 my $rank = $use_ranks[$idx];
 242                 $location->{'id'} = $rank;
 243                 # Note what our lacunae are
 244                 my %lmiss;
 245                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 246                 $location->{'missing'} = [ keys %lmiss ];
 247
 248                 # Run the extra analysis we need.
 249                 ## TODO We run through all the variants in this call, so
 250                 ## why not add the reading data there instead of here below?
 251                 my $graph = $groups[$idx]->{graph};
 252                 analyze_location( $tradition, $graph, $location, \%lmiss );
 253
 254                 my @layerwits;
 255                 # Do the final post-analysis tidying up of the data.
 256                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 257                         $conflict_count++ if $rdghash->{'is_conflict'};
 258                         $reversion_count++ if $rdghash->{'is_reverted'};
 259                         # Add the reading text back in, setting display value as needed
 260                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 261                         if( $rdg ) {
 262                                 $rdghash->{'text'} = $rdg->text .
 263                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 264                                 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
 265                                         $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 266                                         $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 267                                 }
 268                         }
 269                         # Remove lacunose witnesses from this reading's list now that the
 270                         # analysis is done
 271                         my @realgroup;
 272                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 273                         $rdghash->{'group'} = \@realgroup;
 274                         # Note any layered witnesses that appear in this group
 275                         foreach( @realgroup ) {
 276                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 277                                         push( @layerwits, $1 );
 278                                 }
 279                         }
 280                 }
 281                 $location->{'layerwits'} = \@layerwits if @layerwits;
 282         }
 283         $answer->{'conflict_count'} = $conflict_count;
 284         $answer->{'reversion_count'} = $reversion_count;
 285
 286         return $answer;
 287 }
 288
 289 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 290
 291 Groups the variants at the given $rank of the collation, treating any
 292 relationships in the set $merge_relationship_types as equivalent.
 293 $lacunose should be a reference to an array, to which the sigla of lacunose
 294 witnesses at this rank will be appended; $transposed should be a reference
 295 to a hash, wherein the identities of transposed readings and their
 296 relatives will be stored.
 297
 298 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 299 in $group_readings->{$rdg}.
 300
 301 =cut
 302
 303 # Return group_readings, groups, lacunose
 304 sub group_variants {
 305         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 306         my $c = $tradition->collation;
 307         my $aclabel = $c->ac_label;
 308         my $table = $c->alignment_table;
 309         # Get the alignment table readings
 310         my %readings_at_rank;
 311         my $check_for_gaps = Set::Scalar->new();
 312         my %moved_wits;
 313         my $has_transposition;
 314         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 315                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 316                 my $wit = $tablewit->{'witness'};
 317                 # Exclude the witness if it is "lacunose" which if we got here
 318                 # means "not in the stemma".
 319                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 320                 # Note if the witness is actually in a lacuna
 321                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 322                         _add_to_witlist( $wit, $lacunose, $aclabel );
 323                 # Otherwise the witness either has a positive reading...
 324                 } elsif( $rdg ) {
 325                         # If the reading has been counted elsewhere as a transposition, ignore it.
 326                         if( $transposed->{$rdg->{'t'}->id} ) {
 327                                 # TODO Does this cope with three-way transpositions?
 328                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 329                                 next;
 330                         }
 331                         # Otherwise, record it...
 332                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 333                         # ...and grab any transpositions, and their relations.
 334                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 335                         foreach my $trdg ( @transp ) {
 336                                 next if exists $readings_at_rank{$trdg->id};
 337                                 $has_transposition = 1;
 338                                 my @affected_wits = _table_witnesses(
 339                                         $table, $trdg, $lacunose, $aclabel );
 340                                 next unless @affected_wits;
 341                                 map { $moved_wits{$_} = 1 } @affected_wits;
 342                                 $transposed->{$trdg->id} =
 343                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 344                                 $readings_at_rank{$trdg->id} = $trdg;
 345                         }
 346                 # ...or it is empty, ergo a gap.
 347                 } else {
 348                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 349                 }
 350         }
 351         my $gap_wits = Set::Scalar->new();
 352         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 353                 unless $moved_wits{$_} } $check_for_gaps->members;
 354
 355         # Group the readings, collapsing groups by relationship if needed.
 356         my $grouped_readings = {};
 357         foreach my $rdg ( values %readings_at_rank ) {
 358                 # Skip readings that have been collapsed into others.
 359                 next if exists $grouped_readings->{$rdg->id}
 360                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 361                 # Get the witness list, including from readings collapsed into this one.
 362                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 363                 if( $collapse && $collapse->size ) {
 364                         my $filter = sub { $collapse->has( $_[0]->type ) };
 365                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 366                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 367                                 push( @wits, @otherwits );
 368                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 369                         }
 370                 }
 371                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 372         }
 373         if( $gap_wits->members ) {
 374                 $grouped_readings->{'(omitted)'} = $gap_wits;
 375         }
 376
 377         # Get rid of our collapsed readings
 378         map { delete $grouped_readings->{$_} if(
 379                          $grouped_readings->{$_} eq 'COLLAPSE'
 380                          || $grouped_readings->{$_}->is_empty ) }
 381                 keys %$grouped_readings;
 382
 383         # If something was transposed, check the groups for doubled-up readings
 384         if( $has_transposition ) {
 385                 # print STDERR "Group for rank $rank:\n";
 386                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 387                 #       keys %$grouped_readings;
 388                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 389         }
 390
 391         # Return the result
 392         return $grouped_readings;
 393 }
 394
 395 # Helper function to query the alignment table for all witnesses (a.c. included)
 396 # that have a given reading at its rank.
 397 sub _table_witnesses {
 398         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 399         my $tableidx = $trdg->rank - 1;
 400         my $has_reading = Set::Scalar->new();
 401         foreach my $row ( @{$table->{'alignment'}} ) {
 402                 my $wit = $row->{'witness'};
 403                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 404                 my $rdg = $row->{'tokens'}->[$tableidx];
 405                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 406                 _add_to_witlist( $wit, $has_reading, $aclabel )
 407                         if $rdg->{'t'}->id eq $trdg->id;
 408         }
 409         return $has_reading->members;
 410 }
 411
 412 # Helper function to see if a witness is lacunose even if we are asking about
 413 # the a.c. version
 414 sub _is_lacunose {
 415         my ( $wit, $lac, $acstr ) = @_;
 416         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 417                 $wit = $1;
 418         }
 419         return $lac->has( $wit );
 420 }
 421
 422 # Helper function to ensure that X and X a.c. never appear in the same list.
 423 sub _add_to_witlist {
 424         my( $wit, $list, $acstr ) = @_;
 425         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 426                 # Don't add X a.c. if we already have X
 427                 return if $list->has( $1 );
 428         } else {
 429                 # Delete X a.c. if we are about to add X
 430                 $list->delete( $wit.$acstr );
 431         }
 432         $list->insert( $wit );
 433 }
 434
 435 sub _check_transposed_consistency {
 436         my( $c, $rank, $transposed, $groupings ) = @_;
 437         my %seen_wits;
 438         my %thisrank;
 439         # Note which readings are actually at this rank, and which witnesses
 440         # belong to which reading.
 441         foreach my $rdg ( keys %$groupings ) {
 442                 my $rdgobj = $c->reading( $rdg );
 443                 # Count '(omitted)' as a reading at this rank
 444                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 445                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 446         }
 447         # Our work is done if we have no witness belonging to more than one
 448         # reading.
 449         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 450         return unless @doubled;
 451         # If we have a symmetric related transposition, drop the non-rank readings.
 452         if( @doubled == scalar keys %seen_wits ) {
 453                 foreach my $rdg ( keys %$groupings ) {
 454                         if( !$thisrank{$rdg} ) {
 455                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 456                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 457                                         keys %thisrank;
 458                                 delete $groupings->{$rdg};
 459                                 # If we found a group match, assume there is a symmetry happening.
 460                                 # TODO think more about this
 461                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 462                                 unless( $matched ) {
 463                                         delete $transposed->{$rdg};
 464                                         warn "Found problem in evident symmetry with reading $rdg";
 465                                 }
 466                         }
 467                 }
 468         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 469         } else {
 470                 foreach my $dup ( @doubled ) {
 471                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 472                                 next if $thisrank{$rdg};
 473                                 next unless exists $groupings->{$rdg};
 474                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 475                                 delete $groupings->{$rdg};
 476                                 delete $transposed->{$rdg};
 477                         }
 478                 }
 479                 # and put any now-orphaned readings into an 'omitted' reading.
 480                 foreach my $wit ( keys %seen_wits ) {
 481                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 482                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 483                                          unless exists $groupings->{'(omitted)'};
 484                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 485                         }
 486                 }
 487         }
 488 }
 489
 490 # For the given grouping, return its situation graph based on the stemma.
 491 sub _graph_for_grouping {
 492         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 493         my $acwits = [];
 494         my $extant = {};
 495         foreach my $gs ( values %$grouping ) {
 496                 map {
 497                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 498                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 499                         } else {
 500                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 501                         }
 502                 } $gs->members;
 503         }
 504         my $graph;
 505         try {
 506                 # contig contains all extant wits and all hypothetical wits
 507                 # needed to make up the groups.
 508                 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
 509         } catch ( Text::Tradition::Error $e ) {
 510                 throw( "Could not extend graph with given extant and a.c. witnesses: "
 511                         . $e->message );
 512         } catch {
 513                 throw( "Could not extend graph with a.c. witnesses @$acwits" );
 514         }
 515         return $graph;
 516 }
 517
 518 =head2 solve_variants( $calcdir, @groups )
 519
 520 Looks up the set of groups in the answers provided by the external graph solver
 521 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 522
 523 The answer has the form
 524   { "variants" => [ array of variant location structures ],
 525     "variant_count" => total,
 526     "conflict_count" => number of conflicts detected,
 527     "genealogical_count" => number of solutions found }
 528
 529 =cut
 530
 531 sub solve_variants {
 532         my( @groups ) = @_;
 533
 534         # Are we using a local result directory, or did we pass an empty value
 535         # for the directory?
 536         my $dir;
 537         unless( ref( $groups[0] ) eq 'HASH' ) {
 538                 $dir = shift @groups;
 539         }
 540
 541         ## For each graph/group combo, make a Text::Tradition::Analysis::Result
 542         ## object so that we can send it off for IDP lookup.
 543         my $variants = [];
 544         my $genealogical = 0; # counter
 545         # TODO Optimize for unique graph problems
 546         my %problems;
 547         foreach my $graphproblem ( @groups ) {
 548                 # Construct the calc result key and look up its answer
 549                 my $problem = Text::Tradition::Analysis::Result->new(
 550                         graph => $graphproblem->{'graph'},
 551                         setlist => [ values %{$graphproblem->{'grouping'}} ] );
 552                 if( exists $problems{$problem->object_key} ) {
 553                         $problem = $problems{$problem->object_key};
 554                 } else {
 555                         $problems{$problem->object_key} = $problem;
 556                 }
 557                 $graphproblem->{'object'} = $problem;
 558         }
 559
 560         my %results;
 561         if( $dir ) {
 562                 my $scope = $dir->new_scope;
 563                 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
 564         } else {
 565                 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
 566                         [ values %problems ] );
 567                 # Send it off and get the result
 568                 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
 569                 my $ua = LWP::UserAgent->new();
 570                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 571                                                           'Content' => $json );
 572                 my $answer;
 573                 if( $resp->is_success ) {
 574                         $answer = decode_json( $resp->content );
 575                         throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
 576                 } else {
 577                         throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
 578                                 . "; cannot run graph analysis" );
 579                 }
 580                 # One more sanity check
 581                 throw( "Something went wrong with answer symmetricity" )
 582                         unless keys( %problems ) == @$answer;
 583                 # Convert the results
 584                 foreach my $a ( @$answer ) {
 585                         my $r = Text::Tradition::Analysis::Result->new( $a );
 586                         $results{$r->object_key} = $r;
 587                 }
 588         }
 589
 590         # We now have a single JSON-encoded Result object per problem sent. Fold its
 591         # answers into our variant info structure.
 592         foreach my $graphproblem ( @groups ) {
 593                 my $result = $results{$graphproblem->{'object'}->object_key}
 594                         || $graphproblem->{'object'};
 595
 596                 # Initialize the result structure for this graph problem
 597                 my $vstruct;
 598                 if( $result->status eq 'OK' ) {
 599                         $vstruct = { readings => [] };
 600                         push( @$variants, $vstruct );
 601                 } else {
 602                         push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
 603                         next;
 604                 }
 605
 606                 # 1. Did the group evaluate as genealogical?
 607                 $vstruct->{genealogical} = $result->is_genealogical;
 608                 $genealogical++ if $result->is_genealogical;
 609
 610                 # 2. What are the calculated minimum groupings for each variant loc?
 611                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 612                         my $inputset = $graphproblem->{grouping}->{$rid};
 613                         my $minset = $result->minimum_grouping_for( $inputset );
 614                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 615                 }
 616
 617                 # 3. What are the sources and classes calculated for each witness?
 618                 $vstruct->{witcopy_types} = { $result->classes };
 619                 $vstruct->{reading_roots} = {};
 620                 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
 621
 622         }
 623
 624         return { 'variants' => $variants,
 625                          'variant_count' => scalar @$variants,
 626                          'genealogical_count' => $genealogical };
 627 }
 628
 629 sub _init_unsolved {
 630         my( $graphproblem, $status ) = @_;
 631         my $vstruct = { 'readings' => [] };
 632         $vstruct->{'unsolved'} = $status;
 633         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 634                 push( @{$vstruct->{readings}}, { readingid => $rid,
 635                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 636         }
 637         return $vstruct;
 638 }
 639
 640 =head2 analyze_location ( $tradition, $graph, $location_hash )
 641
 642 Given the tradition, its stemma graph, and the solution from the graph solver,
 643 work out the rest of the information we want.  For each reading we need missing,
 644 conflict, reading_parents, independent_occurrence, followed, not_followed,
 645 and follow_unknown.  Alters the location_hash in place.
 646
 647 =cut
 648
 649 sub analyze_location {
 650         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 651         my $c = $tradition->collation;
 652
 653         if( exists $variant_row->{'unsolved'} ) {
 654                 return;
 655         }
 656         my $reading_roots = delete $variant_row->{'reading_roots'};
 657         my $classinfo = delete $variant_row->{'witcopy_types'};
 658
 659         # Make a hash of all known node memberships, and make the subgraphs.
 660         my $contig = {};
 661         my $subgraph = {};
 662         my $acstr = $c->ac_label;
 663         my @acwits;
 664
 665         # Note which witnesses positively belong to which group. This information
 666         # comes ultimately from the IDP solver.
 667         # Also make a note of the reading's roots.
 668     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 669         my $rid = $rdghash->{'readingid'};
 670         my @roots;
 671         foreach my $wit ( @{$rdghash->{'group'}} ) {
 672                 $contig->{$wit} = $rid;
 673             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 674                 push( @acwits, $1 );
 675             }
 676             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 677                 push( @roots, $wit );
 678             }
 679         }
 680                 $rdghash->{'independent_occurrence'} = \@roots;
 681         }
 682
 683         # Now that we have all the node group memberships, calculate followed/
 684     # non-followed/unknown values for each reading.  Also figure out the
 685     # reading's evident parent(s).
 686     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 687         my $rid = $rdghash->{'readingid'};
 688         my $rdg = $c->reading( $rid );
 689         my @roots = @{$rdghash->{'independent_occurrence'}};
 690         my @reversions;
 691         if( $classinfo ) {
 692                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 693                         $rdghash->{'group'}->members;
 694                 $rdghash->{'reversions'} = \@reversions;
 695         }
 696         my @group = @{$rdghash->{'group'}};
 697
 698         # Start figuring things out.
 699         $rdghash->{'followed'} = scalar( @group )
 700                 - ( scalar( @roots ) + scalar( @reversions ) );
 701         # Find the parent readings, if any, of this reading.
 702         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 703                 # Work out relationships between readings and their non-followed parent.
 704                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 705                 $rdghash->{'source_parents'} = $sourceparents;
 706
 707                 if( @reversions ) {
 708                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 709                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 710                         $rdghash->{'reversion_parents'} = $revparents;
 711                 }
 712
 713                 # Find the number of times this reading was altered, and the number of
 714                 # times we're not sure.
 715                 my( %nofollow, %unknownfollow );
 716                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 717                         foreach my $wchild ( $graph->successors( $wit ) ) {
 718                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 719                                         && $contig->{$wchild} ne $rid ) {
 720                                         # It definitely changed here.
 721                                         $nofollow{$wchild} = 1;
 722                                 } elsif( !($contig->{$wchild}) ) {
 723                                         # The child is a hypothetical node not definitely in
 724                                         # any group. Answer is unknown.
 725                                         $unknownfollow{$wchild} = 1;
 726                                 } # else it is either in our group, or it is a non-root node in a
 727                                   # known group and therefore is presumed to have its reading from
 728                                   # its group, not this link.
 729                         }
 730                 }
 731                 $rdghash->{'not_followed'} = keys %nofollow;
 732                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 733
 734                 # Now say whether this reading represents a conflict.
 735                 unless( $variant_row->{'genealogical'} ) {
 736                         $rdghash->{'is_conflict'} = @roots != 1;
 737                         $rdghash->{'is_reverted'} = scalar @reversions;
 738                 }
 739     }
 740 }
 741
 742 sub _find_reading_parents {
 743         my( $rid, $graph, $contig, @list ) = @_;
 744         my $parenthash = {};
 745         foreach my $wit ( @list ) {
 746                 # Look in the stemma graph to find this witness's extant or known-reading
 747                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 748                 my @check = $graph->predecessors( $wit );
 749                 while( @check ) {
 750                         my @next;
 751                         foreach my $wparent( @check ) {
 752                                 my $preading = $contig->{$wparent};
 753                                 if( $preading && $preading ne $rid ) {
 754                                         $parenthash->{$preading} = 1;
 755                                 } else {
 756                                         push( @next, $graph->predecessors( $wparent ) );
 757                                 }
 758                         }
 759                         @check = @next;
 760                 }
 761         }
 762         return $parenthash;
 763 }
 764
 765 sub _resolve_parent_relationships {
 766         my( $c, $rid, $rdg, $rdgparents ) = @_;
 767         foreach my $p ( keys %$rdgparents ) {
 768                 # Resolve the relationship of the parent to the reading, and
 769                 # save it in our hash.
 770                 my $pobj = $c->reading( $p );
 771                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 772                 my $phash = { 'label' => $prep };
 773                 if( $pobj ) {
 774                         my $rel = $c->get_relationship( $p, $rid );
 775                         if( $rel ) {
 776                                 _add_to_hash( $rel, $phash );
 777                         } elsif( $rdg ) {
 778                                 # First check for a transposed relationship
 779                                 if( $rdg->rank != $pobj->rank ) {
 780                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 781                                                 next unless $ti->text eq $rdg->text;
 782                                                 $rel = $c->get_relationship( $ti, $pobj );
 783                                                 if( $rel ) {
 784                                                         _add_to_hash( $rel, $phash, 1 );
 785                                                         last;
 786                                                 }
 787                                         }
 788                                         unless( $rel ) {
 789                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 790                                                         next unless $ti->text eq $pobj->text;
 791                                                         $rel = $c->get_relationship( $ti, $rdg );
 792                                                         if( $rel ) {
 793                                                                 _add_to_hash( $rel, $phash, 1 );
 794                                                                 last;
 795                                                         }
 796                                                 }
 797                                         }
 798                                 }
 799                                 unless( $rel ) {
 800                                         # and then check for sheer word similarity.
 801                                         my $rtext = $rdg->text;
 802                                         my $ptext = $pobj->text;
 803                                         if( similar( $rtext, $ptext ) ) {
 804                                                 # say STDERR "Words $rtext and $ptext judged similar";
 805                                                 $phash->{relation} = { type => 'wordsimilar' };
 806                                         }
 807                                 }
 808                         } else {
 809                                 $phash->{relation} = { type => 'deletion' };
 810                         }
 811                         # Get the attributes of the parent object while we are here
 812                         $phash->{'text'} = $pobj->text if $pobj;
 813                         if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
 814                                 $phash->{'is_nonsense'} = $pobj->is_nonsense;
 815                                 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 816                         }
 817                 } elsif( $p eq '(omitted)' ) {
 818                         $phash->{relation} = { type => 'addition' };
 819                 }
 820                 # Save it
 821                 $rdgparents->{$p} = $phash;
 822         }
 823 }
 824
 825 sub _add_to_hash {
 826         my( $rel, $phash, $is_transposed ) = @_;
 827         $phash->{relation} = { type => $rel->type };
 828         $phash->{relation}->{transposed} = 1 if $is_transposed;
 829         $phash->{relation}->{annotation} = $rel->annotation
 830                 if $rel->has_annotation;
 831 }
 832
 833 =head2 similar( $word1, $word2 )
 834
 835 Use Algorithm::Diff to get a sense of how close the words are to each other.
 836 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 837
 838 =cut
 839
 840 #!/usr/bin/env perl
 841
 842 sub similar {
 843         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 844         my @let1 = split( '', lc( $word1 ) );
 845         my @let2 = split( '', lc( $word2 ) );
 846         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 847         my $mag = 0;
 848         while( $diff->Next ) {
 849                 if( $diff->Same ) {
 850                         # Take off points for longer strings
 851                         my $cs = $diff->Range(1) - 2;
 852                         $cs = 0 if $cs < 0;
 853                         $mag -= $cs;
 854                 } elsif( !$diff->Items(1) ) {
 855                         $mag += $diff->Range(2);
 856                 } elsif( !$diff->Items(2) ) {
 857                         $mag += $diff->Range(1);
 858                 } else {
 859                         # Split the difference for substitutions
 860                         my $c1 = $diff->Range(1) || 1;
 861                         my $c2 = $diff->Range(2) || 1;
 862                         my $cd = ( $c1 + $c2 ) / 2;
 863                         $mag += $cd;
 864                 }
 865         }
 866         return ( $mag <= length( $word1 ) / 2 );
 867 }
 868
 869 sub _useful_variant {
 870         my( $rankgroup, $rankgraph, $acstr ) = @_;
 871
 872         # Sort by group size and return
 873         my $is_useful = 0;
 874         foreach my $rdg ( keys %$rankgroup ) {
 875                 my @wits = $rankgroup->{$rdg}->members;
 876                 if( @wits > 1 ) {
 877                         $is_useful++;
 878                 } else {
 879                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 880                                 || $wits[0] =~ /\Q$acstr\E$/ );
 881                 }
 882         }
 883         return $is_useful > 1;
 884 }
 885
 886 =head2 wit_stringify( $groups )
 887
 888 Takes an array of witness groupings and produces a string like
 889 ['A','B'] / ['C','D','E'] / ['F']
 890
 891 =cut
 892
 893 sub wit_stringify {
 894     my $groups = shift;
 895     my @gst;
 896     # If we were passed an array of witnesses instead of an array of
 897     # groupings, then "group" the witnesses first.
 898     unless( ref( $groups->[0] ) ) {
 899         my $mkgrp = [ $groups ];
 900         $groups = $mkgrp;
 901     }
 902     foreach my $g ( @$groups ) {
 903         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 904     }
 905     return join( ' / ', @gst );
 906 }
 907
 908 1;
 909
 910 sub throw {
 911         Text::Tradition::Error->throw(
 912                 'ident' => 'Analysis error',
 913                 'message' => $_[0],
 914         );
 915 }
 916
 917 =head1 LICENSE
 918
 919 This package is free software and is provided "as is" without express
 920 or implied warranty.  You can redistribute it and/or modify it under
 921 the same terms as Perl itself.
 922
 923 =head1 AUTHOR
 924
 925 Tara L Andrews E<lt>aurum@cpan.orgE<gt>