lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Encode qw/ decode_utf8 encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ to_json decode_json /;
  10 use LWP::UserAgent;
  11 use Set::Scalar;
  12 use Text::Tradition::Analysis::Result;
  13 use Text::Tradition::Directory;
  14 use Text::Tradition::Stemma;
  15 use TryCatch;
  16
  17 use vars qw/ @EXPORT_OK /;
  18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  19
  20 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  21 my $unsolved_problems = {};
  22
  23 =head1 NAME
  24
  25 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  26
  27 =head1 SYNOPSIS
  28
  29   use Text::Tradition;
  30   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  31   my $t = Text::Tradition->new(
  32     'name' => 'this is a text',
  33     'input' => 'TEI',
  34     'file' => '/path/to/tei_parallel_seg_file.xml' );
  35   $t->add_stemma( 'dotfile' => $stemmafile );
  36
  37   my $variant_data = run_analysis( $tradition );
  38   # Recalculate rank $n treating all orthographic variants as equivalent
  39   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  40
  41 =head1 DESCRIPTION
  42
  43 Text::Tradition is a library for representation and analysis of collated
  44 texts, particularly medieval ones.  The Collation is the central feature of
  45 a Tradition, where the text, its sequence of readings, and its relationships
  46 between readings are actually kept.
  47
  48 =head1 SUBROUTINES
  49
  50 =head2 run_analysis( $tradition, %opts )
  51
  52 Runs the analysis described in analyze_variant_location on every location in the
  53 collation of the given tradition, with the given options. These include:
  54
  55 =over 4
  56
  57 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  58 is 0 (i.e. the first).
  59
  60 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  61
  62 =item * merge_types - Specify a list of relationship types, where related readings
  63 should be treated as identical for the purposes of analysis.
  64
  65 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  66
  67 =back
  68
  69 =begin testing
  70
  71 use Text::Tradition;
  72 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  73
  74 my $datafile = 't/data/florilegium_tei_ps.xml';
  75 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  76                                       'name' => 'test0',
  77                                       'file' => $datafile );
  78 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  79 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  80
  81 my %expected_genealogical = (
  82         1 => 0,
  83         2 => 1,
  84         3 =>  0,
  85         5 =>  0,
  86         7 =>  0,
  87         8 =>  0,
  88         10 => 0,
  89         13 => 1,
  90         33 => 0,
  91         34 => 0,
  92         37 => 0,
  93         60 => 0,
  94         81 => 1,
  95         84 => 0,
  96         87 => 0,
  97         101 => 0,
  98         102 => 0,
  99         122 => 1,
 100         157 => 0,
 101         166 => 1,
 102         169 => 1,
 103         200 => 0,
 104         216 => 1,
 105         217 => 1,
 106         219 => 1,
 107         241 => 1,
 108         242 => 1,
 109         243 => 1,
 110 );
 111
 112 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 113 my $c = $tradition->collation;
 114 foreach my $row ( @{$data->{'variants'}} ) {
 115         # Account for rows that used to be "not useful"
 116         unless( exists $expected_genealogical{$row->{'id'}} ) {
 117                 $expected_genealogical{$row->{'id'}} = 1;
 118         }
 119         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 120         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 121                 "Got correct genealogical flag for row " . $row->{'id'} );
 122         # Check that we have the right row with the right groups
 123         my $rank = $row->{'id'};
 124         foreach my $rdghash ( @{$row->{'readings'}} ) {
 125                 # Skip 'readings' that aren't really
 126                 next unless $c->reading( $rdghash->{'readingid'} );
 127                 # Check the rank
 128                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 129                         "Got correct reading rank" );
 130                 # Check the witnesses
 131                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 132                 my @sgrp = sort @{$rdghash->{'group'}};
 133                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 134         }
 135 }
 136 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 137 # TODO Make something meaningful of conflict count, maybe test other bits
 138
 139 =end testing
 140
 141 =cut
 142
 143 sub run_analysis {
 144         my( $tradition, %opts ) = @_;
 145         my $c = $tradition->collation;
 146         my $aclabel = $c->ac_label;
 147
 148         my $stemma_id = $opts{'stemma_id'} || 0;
 149         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 150         my $collapse = Set::Scalar->new();
 151         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 152                 $collapse->insert( @{$opts{'merge_types'}} );
 153         } elsif( $opts{'merge_types'} ) {
 154                 $collapse->insert( $opts{'merge_types'} );
 155         }
 156
 157         # Make sure we have a lookup DB for graph calculation results; this will die
 158         # if calcdir or calcdsn isn't passed.
 159         my $dir;
 160         if( exists $opts{'calcdir'} ) {
 161                 $dir = delete $opts{'calcdir'}
 162         } elsif ( exists $opts{'calcdsn'} ) {
 163                 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 164         }
 165
 166         # Get the stemma
 167         my $stemma = $tradition->stemma( $stemma_id );
 168
 169         # Figure out which witnesses we are working with - that is, the ones that
 170         # appear both in the stemma and in the tradition. All others are 'lacunose'
 171         # for our purposes.
 172         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 173         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 174         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 175         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 176
 177         # Find and mark 'common' ranks for exclusion, unless they were
 178         # explicitly specified.
 179         unless( @ranks ) {
 180                 my %common_rank;
 181                 foreach my $rdg ( $c->common_readings ) {
 182                         $common_rank{$rdg->rank} = 1;
 183                 }
 184                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 185         }
 186
 187         # Group the variants to send to the solver
 188         my @groups;
 189         my @use_ranks;
 190         my %lacunae;
 191         my $moved = {};
 192         foreach my $rank ( @ranks ) {
 193                 my $missing = $lacunose->clone();
 194                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 195                 # Filter out any empty rankgroups
 196                 # (e.g. from the later rank for a transposition)
 197                 next unless keys %$rankgroup;
 198                 # Get the graph for this rankgroup
 199                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 200                 if( $opts{'exclude_type1'} ) {
 201                         # Check to see whether this is a "useful" group.
 202                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 203                 }
 204                 push( @use_ranks, $rank );
 205                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 206                 $lacunae{$rank} = $missing;
 207         }
 208         # Run the solver
 209         my $answer;
 210         try {
 211                 $answer = solve_variants( $dir, @groups );
 212         } catch ( Text::Tradition::Error $e ) {
 213                 if( $e->message =~ /IDP/ ) {
 214                         # Something is wrong with the solver; make the variants table anyway
 215                         $answer->{'variants'} = [];
 216                         map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
 217                                 @groups;
 218                 } else {
 219                         # Something else is wrong; error out.
 220                         $e->throw;
 221                 }
 222         }
 223
 224         # Do further analysis on the answer
 225         my $conflict_count = 0;
 226         my $reversion_count = 0;
 227         foreach my $idx ( 0 .. $#use_ranks ) {
 228                 my $location = $answer->{'variants'}->[$idx];
 229                 # Add the rank back in
 230                 my $rank = $use_ranks[$idx];
 231                 $location->{'id'} = $rank;
 232                 # Note what our lacunae are
 233                 my %lmiss;
 234                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 235                 $location->{'missing'} = [ keys %lmiss ];
 236
 237                 # Run the extra analysis we need.
 238                 ## TODO We run through all the variants in this call, so
 239                 ## why not add the reading data there instead of here below?
 240                 my $graph = $groups[$idx]->{graph};
 241                 analyze_location( $tradition, $graph, $location, \%lmiss );
 242
 243                 my @layerwits;
 244                 # Do the final post-analysis tidying up of the data.
 245                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 246                         $conflict_count++ if $rdghash->{'is_conflict'};
 247                         $reversion_count++ if $rdghash->{'is_reverted'};
 248                         # Add the reading text back in, setting display value as needed
 249                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 250                         if( $rdg ) {
 251                                 $rdghash->{'text'} = $rdg->text .
 252                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 253                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 254                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 255                         }
 256                         # Remove lacunose witnesses from this reading's list now that the
 257                         # analysis is done
 258                         my @realgroup;
 259                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 260                         $rdghash->{'group'} = \@realgroup;
 261                         # Note any layered witnesses that appear in this group
 262                         foreach( @realgroup ) {
 263                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 264                                         push( @layerwits, $1 );
 265                                 }
 266                         }
 267                 }
 268                 $location->{'layerwits'} = \@layerwits if @layerwits;
 269         }
 270         $answer->{'conflict_count'} = $conflict_count;
 271         $answer->{'reversion_count'} = $reversion_count;
 272
 273         return $answer;
 274 }
 275
 276 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 277
 278 Groups the variants at the given $rank of the collation, treating any
 279 relationships in the set $merge_relationship_types as equivalent.
 280 $lacunose should be a reference to an array, to which the sigla of lacunose
 281 witnesses at this rank will be appended; $transposed should be a reference
 282 to a hash, wherein the identities of transposed readings and their
 283 relatives will be stored.
 284
 285 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 286 in $group_readings->{$rdg}.
 287
 288 =cut
 289
 290 # Return group_readings, groups, lacunose
 291 sub group_variants {
 292         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 293         my $c = $tradition->collation;
 294         my $aclabel = $c->ac_label;
 295         my $table = $c->alignment_table;
 296         # Get the alignment table readings
 297         my %readings_at_rank;
 298         my $check_for_gaps = Set::Scalar->new();
 299         my %moved_wits;
 300         my $has_transposition;
 301         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 302                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 303                 my $wit = $tablewit->{'witness'};
 304                 # Exclude the witness if it is "lacunose" which if we got here
 305                 # means "not in the stemma".
 306                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 307                 # Note if the witness is actually in a lacuna
 308                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 309                         _add_to_witlist( $wit, $lacunose, $aclabel );
 310                 # Otherwise the witness either has a positive reading...
 311                 } elsif( $rdg ) {
 312                         # If the reading has been counted elsewhere as a transposition, ignore it.
 313                         if( $transposed->{$rdg->{'t'}->id} ) {
 314                                 # TODO Does this cope with three-way transpositions?
 315                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 316                                 next;
 317                         }
 318                         # Otherwise, record it...
 319                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 320                         # ...and grab any transpositions, and their relations.
 321                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 322                         foreach my $trdg ( @transp ) {
 323                                 next if exists $readings_at_rank{$trdg->id};
 324                                 $has_transposition = 1;
 325                                 my @affected_wits = _table_witnesses(
 326                                         $table, $trdg, $lacunose, $aclabel );
 327                                 next unless @affected_wits;
 328                                 map { $moved_wits{$_} = 1 } @affected_wits;
 329                                 $transposed->{$trdg->id} =
 330                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 331                                 $readings_at_rank{$trdg->id} = $trdg;
 332                         }
 333                 # ...or it is empty, ergo a gap.
 334                 } else {
 335                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 336                 }
 337         }
 338         my $gap_wits = Set::Scalar->new();
 339         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 340                 unless $moved_wits{$_} } $check_for_gaps->members;
 341
 342         # Group the readings, collapsing groups by relationship if needed.
 343         my $grouped_readings = {};
 344         foreach my $rdg ( values %readings_at_rank ) {
 345                 # Skip readings that have been collapsed into others.
 346                 next if exists $grouped_readings->{$rdg->id}
 347                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 348                 # Get the witness list, including from readings collapsed into this one.
 349                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 350                 if( $collapse && $collapse->size ) {
 351                         my $filter = sub { $collapse->has( $_[0]->type ) };
 352                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 353                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 354                                 push( @wits, @otherwits );
 355                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 356                         }
 357                 }
 358                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 359         }
 360         if( $gap_wits->members ) {
 361                 $grouped_readings->{'(omitted)'} = $gap_wits;
 362         }
 363
 364         # Get rid of our collapsed readings
 365         map { delete $grouped_readings->{$_} if(
 366                          $grouped_readings->{$_} eq 'COLLAPSE'
 367                          || $grouped_readings->{$_}->is_empty ) }
 368                 keys %$grouped_readings;
 369
 370         # If something was transposed, check the groups for doubled-up readings
 371         if( $has_transposition ) {
 372                 # print STDERR "Group for rank $rank:\n";
 373                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 374                 #       keys %$grouped_readings;
 375                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 376         }
 377
 378         # Return the result
 379         return $grouped_readings;
 380 }
 381
 382 # Helper function to query the alignment table for all witnesses (a.c. included)
 383 # that have a given reading at its rank.
 384 sub _table_witnesses {
 385         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 386         my $tableidx = $trdg->rank - 1;
 387         my $has_reading = Set::Scalar->new();
 388         foreach my $row ( @{$table->{'alignment'}} ) {
 389                 my $wit = $row->{'witness'};
 390                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 391                 my $rdg = $row->{'tokens'}->[$tableidx];
 392                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 393                 _add_to_witlist( $wit, $has_reading, $aclabel )
 394                         if $rdg->{'t'}->id eq $trdg->id;
 395         }
 396         return $has_reading->members;
 397 }
 398
 399 # Helper function to see if a witness is lacunose even if we are asking about
 400 # the a.c. version
 401 sub _is_lacunose {
 402         my ( $wit, $lac, $acstr ) = @_;
 403         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 404                 $wit = $1;
 405         }
 406         return $lac->has( $wit );
 407 }
 408
 409 # Helper function to ensure that X and X a.c. never appear in the same list.
 410 sub _add_to_witlist {
 411         my( $wit, $list, $acstr ) = @_;
 412         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 413                 # Don't add X a.c. if we already have X
 414                 return if $list->has( $1 );
 415         } else {
 416                 # Delete X a.c. if we are about to add X
 417                 $list->delete( $wit.$acstr );
 418         }
 419         $list->insert( $wit );
 420 }
 421
 422 sub _check_transposed_consistency {
 423         my( $c, $rank, $transposed, $groupings ) = @_;
 424         my %seen_wits;
 425         my %thisrank;
 426         # Note which readings are actually at this rank, and which witnesses
 427         # belong to which reading.
 428         foreach my $rdg ( keys %$groupings ) {
 429                 my $rdgobj = $c->reading( $rdg );
 430                 # Count '(omitted)' as a reading at this rank
 431                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 432                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 433         }
 434         # Our work is done if we have no witness belonging to more than one
 435         # reading.
 436         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 437         return unless @doubled;
 438         # If we have a symmetric related transposition, drop the non-rank readings.
 439         if( @doubled == scalar keys %seen_wits ) {
 440                 foreach my $rdg ( keys %$groupings ) {
 441                         if( !$thisrank{$rdg} ) {
 442                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 443                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 444                                         keys %thisrank;
 445                                 delete $groupings->{$rdg};
 446                                 # If we found a group match, assume there is a symmetry happening.
 447                                 # TODO think more about this
 448                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 449                                 unless( $matched ) {
 450                                         delete $transposed->{$rdg};
 451                                         warn "Found problem in evident symmetry with reading $rdg";
 452                                 }
 453                         }
 454                 }
 455         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 456         } else {
 457                 foreach my $dup ( @doubled ) {
 458                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 459                                 next if $thisrank{$rdg};
 460                                 next unless exists $groupings->{$rdg};
 461                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 462                                 delete $groupings->{$rdg};
 463                                 delete $transposed->{$rdg};
 464                         }
 465                 }
 466                 # and put any now-orphaned readings into an 'omitted' reading.
 467                 foreach my $wit ( keys %seen_wits ) {
 468                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 469                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 470                                          unless exists $groupings->{'(omitted)'};
 471                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 472                         }
 473                 }
 474         }
 475 }
 476
 477 # For the given grouping, return its situation graph based on the stemma.
 478 sub _graph_for_grouping {
 479         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 480         my $acwits = [];
 481         my $extant = {};
 482         foreach my $gs ( values %$grouping ) {
 483                 map {
 484                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 485                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 486                         } else {
 487                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 488                         }
 489                 } $gs->members;
 490         }
 491         my $graph;
 492         try {
 493                 # contig contains all extant wits and all hypothetical wits
 494                 # needed to make up the groups.
 495                 $graph = $stemma->situation_graph( $extant, $acwits );
 496         } catch ( Text::Tradition::Error $e ) {
 497                 throw( "Could not extend graph with given extant and a.c. witnesses: "
 498                         . $e->message );
 499         } catch {
 500                 throw( "Could not extend graph with a.c. witnesses @$acwits" );
 501         }
 502         return $graph;
 503 }
 504
 505 =head2 solve_variants( $calcdir, @groups )
 506
 507 Looks up the set of groups in the answers provided by the external graph solver
 508 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 509
 510 The answer has the form
 511   { "variants" => [ array of variant location structures ],
 512     "variant_count" => total,
 513     "conflict_count" => number of conflicts detected,
 514     "genealogical_count" => number of solutions found }
 515
 516 =cut
 517
 518 sub solve_variants {
 519         my( @groups ) = @_;
 520
 521         # Are we using a local result directory, or did we pass an empty value
 522         # for the directory?
 523         my $dir;
 524         unless( ref( $groups[0] ) eq 'HASH' ) {
 525                 $dir = shift @groups;
 526         }
 527
 528         ## For each graph/group combo, make a Text::Tradition::Analysis::Result
 529         ## object so that we can send it off for IDP lookup.
 530         my $variants = [];
 531         my $genealogical = 0; # counter
 532         # TODO Optimize for unique graph problems
 533         my %problems;
 534         foreach my $graphproblem ( @groups ) {
 535                 # Construct the calc result key and look up its answer
 536                 my $problem = Text::Tradition::Analysis::Result->new(
 537                         graph => $graphproblem->{'graph'},
 538                         setlist => [ values %{$graphproblem->{'grouping'}} ] );
 539                 if( exists $problems{$problem->object_key} ) {
 540                         $problem = $problems{$problem->object_key};
 541                 } else {
 542                         $problems{$problem->object_key} = $problem;
 543                 }
 544                 $graphproblem->{'object'} = $problem;
 545         }
 546
 547         my %results;
 548         if( $dir ) {
 549                 my $scope = $dir->new_scope;
 550                 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
 551         } else {
 552                 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
 553                         [ values %problems ] );
 554                 # Send it off and get the result
 555                 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
 556                 my $ua = LWP::UserAgent->new();
 557                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 558                                                           'Content' => $json );
 559                 my $answer;
 560                 if( $resp->is_success ) {
 561                         $answer = decode_json( $resp->content );
 562                         throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
 563                 } else {
 564                         throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
 565                                 . "; cannot run graph analysis" );
 566                 }
 567                 # One more sanity check
 568                 throw( "Something went wrong with answer symmetricity" )
 569                         unless keys( %problems ) == @$answer;
 570                 # Convert the results
 571                 foreach my $a ( @$answer ) {
 572                         my $r = Text::Tradition::Analysis::Result->new( $a );
 573                         $results{$r->object_key} = $r;
 574                 }
 575         }
 576
 577         # We now have a single JSON-encoded Result object per problem sent. Fold its
 578         # answers into our variant info structure.
 579         foreach my $graphproblem ( @groups ) {
 580                 my $result = $results{$graphproblem->{'object'}->object_key}
 581                         || $graphproblem->{'object'};
 582
 583                 # Initialize the result structure for this graph problem
 584                 my $vstruct;
 585                 if( $result->status eq 'OK' ) {
 586                         $vstruct = { readings => [] };
 587                         push( @$variants, $vstruct );
 588                 } else {
 589                         push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
 590                         next;
 591                 }
 592
 593                 # 1. Did the group evaluate as genealogical?
 594                 $vstruct->{genealogical} = $result->is_genealogical;
 595                 $genealogical++ if $result->is_genealogical;
 596
 597                 # 2. What are the calculated minimum groupings for each variant loc?
 598                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 599                         my $inputset = $graphproblem->{grouping}->{$rid};
 600                         my $minset = $result->minimum_grouping_for( $inputset );
 601                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 602                 }
 603
 604                 # 3. What are the sources and classes calculated for each witness?
 605                 $vstruct->{witcopy_types} = { $result->classes };
 606                 $vstruct->{reading_roots} = {};
 607                 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
 608
 609         }
 610
 611         return { 'variants' => $variants,
 612                          'variant_count' => scalar @$variants,
 613                          'genealogical_count' => $genealogical };
 614 }
 615
 616 sub _init_unsolved {
 617         my( $graphproblem, $status ) = @_;
 618         my $vstruct = { 'readings' => [] };
 619         $vstruct->{'unsolved'} = $status;
 620         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 621                 push( @{$vstruct->{readings}}, { readingid => $rid,
 622                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 623         }
 624         return $vstruct;
 625 }
 626
 627 =head2 analyze_location ( $tradition, $graph, $location_hash )
 628
 629 Given the tradition, its stemma graph, and the solution from the graph solver,
 630 work out the rest of the information we want.  For each reading we need missing,
 631 conflict, reading_parents, independent_occurrence, followed, not_followed,
 632 and follow_unknown.  Alters the location_hash in place.
 633
 634 =cut
 635
 636 sub analyze_location {
 637         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 638         my $c = $tradition->collation;
 639
 640         if( exists $variant_row->{'unsolved'} ) {
 641                 return;
 642         }
 643         my $reading_roots = delete $variant_row->{'reading_roots'};
 644         my $classinfo = delete $variant_row->{'witcopy_types'};
 645
 646         # Make a hash of all known node memberships, and make the subgraphs.
 647         my $contig = {};
 648         my $subgraph = {};
 649         my $acstr = $c->ac_label;
 650         my @acwits;
 651
 652         # Note which witnesses positively belong to which group. This information
 653         # comes ultimately from the IDP solver.
 654         # Also make a note of the reading's roots.
 655     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 656         my $rid = $rdghash->{'readingid'};
 657         my @roots;
 658         foreach my $wit ( @{$rdghash->{'group'}} ) {
 659                 $contig->{$wit} = $rid;
 660             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 661                 push( @acwits, $1 );
 662             }
 663             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 664                 push( @roots, $wit );
 665             }
 666         }
 667                 $rdghash->{'independent_occurrence'} = \@roots;
 668         }
 669
 670         # Now that we have all the node group memberships, calculate followed/
 671     # non-followed/unknown values for each reading.  Also figure out the
 672     # reading's evident parent(s).
 673     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 674         my $rid = $rdghash->{'readingid'};
 675         my $rdg = $c->reading( $rid );
 676         my @roots = @{$rdghash->{'independent_occurrence'}};
 677         my @reversions;
 678         if( $classinfo ) {
 679                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 680                         $rdghash->{'group'}->members;
 681                 $rdghash->{'reversions'} = \@reversions;
 682         }
 683         my @group = @{$rdghash->{'group'}};
 684
 685         # Start figuring things out.
 686         $rdghash->{'followed'} = scalar( @group )
 687                 - ( scalar( @roots ) + scalar( @reversions ) );
 688         # Find the parent readings, if any, of this reading.
 689         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 690                 # Work out relationships between readings and their non-followed parent.
 691                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 692                 $rdghash->{'source_parents'} = $sourceparents;
 693
 694                 if( @reversions ) {
 695                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 696                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 697                         $rdghash->{'reversion_parents'} = $revparents;
 698                 }
 699
 700                 # Find the number of times this reading was altered, and the number of
 701                 # times we're not sure.
 702                 my( %nofollow, %unknownfollow );
 703                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 704                         foreach my $wchild ( $graph->successors( $wit ) ) {
 705                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 706                                         && $contig->{$wchild} ne $rid ) {
 707                                         # It definitely changed here.
 708                                         $nofollow{$wchild} = 1;
 709                                 } elsif( !($contig->{$wchild}) ) {
 710                                         # The child is a hypothetical node not definitely in
 711                                         # any group. Answer is unknown.
 712                                         $unknownfollow{$wchild} = 1;
 713                                 } # else it is either in our group, or it is a non-root node in a
 714                                   # known group and therefore is presumed to have its reading from
 715                                   # its group, not this link.
 716                         }
 717                 }
 718                 $rdghash->{'not_followed'} = keys %nofollow;
 719                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 720
 721                 # Now say whether this reading represents a conflict.
 722                 unless( $variant_row->{'genealogical'} ) {
 723                         $rdghash->{'is_conflict'} = @roots != 1;
 724                         $rdghash->{'is_reverted'} = scalar @reversions;
 725                 }
 726     }
 727 }
 728
 729 sub _find_reading_parents {
 730         my( $rid, $graph, $contig, @list ) = @_;
 731         my $parenthash = {};
 732         foreach my $wit ( @list ) {
 733                 # Look in the stemma graph to find this witness's extant or known-reading
 734                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 735                 my @check = $graph->predecessors( $wit );
 736                 while( @check ) {
 737                         my @next;
 738                         foreach my $wparent( @check ) {
 739                                 my $preading = $contig->{$wparent};
 740                                 if( $preading && $preading ne $rid ) {
 741                                         $parenthash->{$preading} = 1;
 742                                 } else {
 743                                         push( @next, $graph->predecessors( $wparent ) );
 744                                 }
 745                         }
 746                         @check = @next;
 747                 }
 748         }
 749         return $parenthash;
 750 }
 751
 752 sub _resolve_parent_relationships {
 753         my( $c, $rid, $rdg, $rdgparents ) = @_;
 754         foreach my $p ( keys %$rdgparents ) {
 755                 # Resolve the relationship of the parent to the reading, and
 756                 # save it in our hash.
 757                 my $pobj = $c->reading( $p );
 758                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 759                 my $phash = { 'label' => $prep };
 760                 if( $pobj ) {
 761                         my $rel = $c->get_relationship( $p, $rid );
 762                         if( $rel ) {
 763                                 _add_to_hash( $rel, $phash );
 764                         } elsif( $rdg ) {
 765                                 # First check for a transposed relationship
 766                                 if( $rdg->rank != $pobj->rank ) {
 767                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 768                                                 next unless $ti->text eq $rdg->text;
 769                                                 $rel = $c->get_relationship( $ti, $pobj );
 770                                                 if( $rel ) {
 771                                                         _add_to_hash( $rel, $phash, 1 );
 772                                                         last;
 773                                                 }
 774                                         }
 775                                         unless( $rel ) {
 776                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 777                                                         next unless $ti->text eq $pobj->text;
 778                                                         $rel = $c->get_relationship( $ti, $rdg );
 779                                                         if( $rel ) {
 780                                                                 _add_to_hash( $rel, $phash, 1 );
 781                                                                 last;
 782                                                         }
 783                                                 }
 784                                         }
 785                                 }
 786                                 unless( $rel ) {
 787                                         # and then check for sheer word similarity.
 788                                         my $rtext = $rdg->text;
 789                                         my $ptext = $pobj->text;
 790                                         if( similar( $rtext, $ptext ) ) {
 791                                                 # say STDERR "Words $rtext and $ptext judged similar";
 792                                                 $phash->{relation} = { type => 'wordsimilar' };
 793                                         }
 794                                 }
 795                         } else {
 796                                 $phash->{relation} = { type => 'deletion' };
 797                         }
 798                         # Get the attributes of the parent object while we are here
 799                         $phash->{'text'} = $pobj->text if $pobj;
 800                         $phash->{'is_nonsense'} = $pobj->is_nonsense;
 801                         $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 802                 } elsif( $p eq '(omitted)' ) {
 803                         $phash->{relation} = { type => 'addition' };
 804                 }
 805                 # Save it
 806                 $rdgparents->{$p} = $phash;
 807         }
 808 }
 809
 810 sub _add_to_hash {
 811         my( $rel, $phash, $is_transposed ) = @_;
 812         $phash->{relation} = { type => $rel->type };
 813         $phash->{relation}->{transposed} = 1 if $is_transposed;
 814         $phash->{relation}->{annotation} = $rel->annotation
 815                 if $rel->has_annotation;
 816 }
 817
 818 =head2 similar( $word1, $word2 )
 819
 820 Use Algorithm::Diff to get a sense of how close the words are to each other.
 821 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 822
 823 =cut
 824
 825 #!/usr/bin/env perl
 826
 827 sub similar {
 828         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 829         my @let1 = split( '', lc( $word1 ) );
 830         my @let2 = split( '', lc( $word2 ) );
 831         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 832         my $mag = 0;
 833         while( $diff->Next ) {
 834                 if( $diff->Same ) {
 835                         # Take off points for longer strings
 836                         my $cs = $diff->Range(1) - 2;
 837                         $cs = 0 if $cs < 0;
 838                         $mag -= $cs;
 839                 } elsif( !$diff->Items(1) ) {
 840                         $mag += $diff->Range(2);
 841                 } elsif( !$diff->Items(2) ) {
 842                         $mag += $diff->Range(1);
 843                 } else {
 844                         # Split the difference for substitutions
 845                         my $c1 = $diff->Range(1) || 1;
 846                         my $c2 = $diff->Range(2) || 1;
 847                         my $cd = ( $c1 + $c2 ) / 2;
 848                         $mag += $cd;
 849                 }
 850         }
 851         return ( $mag <= length( $word1 ) / 2 );
 852 }
 853
 854 sub _useful_variant {
 855         my( $rankgroup, $rankgraph, $acstr ) = @_;
 856
 857         # Sort by group size and return
 858         my $is_useful = 0;
 859         foreach my $rdg ( keys %$rankgroup ) {
 860                 my @wits = $rankgroup->{$rdg}->members;
 861                 if( @wits > 1 ) {
 862                         $is_useful++;
 863                 } else {
 864                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 865                                 || $wits[0] =~ /\Q$acstr\E$/ );
 866                 }
 867         }
 868         return $is_useful > 1;
 869 }
 870
 871 =head2 wit_stringify( $groups )
 872
 873 Takes an array of witness groupings and produces a string like
 874 ['A','B'] / ['C','D','E'] / ['F']
 875
 876 =cut
 877
 878 sub wit_stringify {
 879     my $groups = shift;
 880     my @gst;
 881     # If we were passed an array of witnesses instead of an array of
 882     # groupings, then "group" the witnesses first.
 883     unless( ref( $groups->[0] ) ) {
 884         my $mkgrp = [ $groups ];
 885         $groups = $mkgrp;
 886     }
 887     foreach my $g ( @$groups ) {
 888         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 889     }
 890     return join( ' / ', @gst );
 891 }
 892
 893 1;
 894
 895 sub throw {
 896         Text::Tradition::Error->throw(
 897                 'ident' => 'Analysis error',
 898                 'message' => $_[0],
 899         );
 900 }
 901
 902 =head1 LICENSE
 903
 904 This package is free software and is provided "as is" without express
 905 or implied warranty.  You can redistribute it and/or modify it under
 906 the same terms as Perl itself.
 907
 908 =head1 AUTHOR
 909
 910 Tara L Andrews E<lt>aurum@cpan.orgE<gt>