analysis/lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Encode qw/ decode_utf8 encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ to_json decode_json /;
  10 use LWP::UserAgent;
  11 use Set::Scalar;
  12 use Text::Tradition::Analysis::Result;
  13 use Text::Tradition::Directory;
  14 use Text::Tradition::Stemma;
  15 use TryCatch;
  16
  17 use vars qw/ @EXPORT_OK $VERSION /;
  18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  19 $VERSION = "1.0";
  20
  21
  22 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  23 my $unsolved_problems = {};
  24
  25 =head1 NAME
  26
  27 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  28
  29 =head1 SYNOPSIS
  30
  31   use Text::Tradition;
  32   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  33   my $t = Text::Tradition->new(
  34     'name' => 'this is a text',
  35     'input' => 'TEI',
  36     'file' => '/path/to/tei_parallel_seg_file.xml' );
  37   $t->enable_stemmata;
  38   $t->add_stemma( 'dotfile' => $stemmafile );
  39
  40   my $variant_data = run_analysis( $tradition );
  41   # Recalculate rank $n treating all orthographic variants as equivalent
  42   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  43
  44 =head1 DESCRIPTION
  45
  46 Text::Tradition is a library for representation and analysis of collated
  47 texts, particularly medieval ones.  Where the Collation is the central feature of
  48 a Tradition, it may also have one or more temmata associated with it, and these stemmata may be analyzed. This package provides the following modules:
  49
  50 =over 4
  51
  52 =item * L<Text::Tradition::HasStemma> - a role that can be composed into Text::Tradition objects, providing the ability for Text::Tradition::Stemma objects to be associated with them.
  53
  54 =item * L<Text::Tradition::Stemma> - an object class that represents stemma hypotheses, both rooted (with a single archetype) and unrooted (e.g. phylogenetic trees).
  55
  56 =item * Text::Tradition::Analysis (this package). Provides functions for the analysis of a given stemma against the collation within a given Tradition.
  57
  58 =back
  59
  60 =head1 SUBROUTINES
  61
  62 =head2 run_analysis( $tradition, %opts )
  63
  64 Runs the analysis described in analyze_variant_location on every location in the
  65 collation of the given tradition, with the given options. These include:
  66
  67 =over 4
  68
  69 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  70 is 0 (i.e. the first).
  71
  72 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  73
  74 =item * merge_types - Specify a list of relationship types, where related readings
  75 should be treated as identical for the purposes of analysis.
  76
  77 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  78
  79 =back
  80
  81 =begin testing
  82
  83 use Text::Tradition;
  84 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  85
  86 my $datafile = 't/data/florilegium_tei_ps.xml';
  87 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  88                                       'name' => 'test0',
  89                                       'file' => $datafile );
  90 $tradition->enable_stemmata;
  91 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  92 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  93
  94 my %expected_genealogical = (
  95         1 => 0,
  96         2 => 1,
  97         3 =>  0,
  98         5 =>  0,
  99         7 =>  0,
 100         8 =>  0,
 101         10 => 0,
 102         13 => 1,
 103         33 => 0,
 104         34 => 0,
 105         37 => 0,
 106         60 => 0,
 107         81 => 1,
 108         84 => 0,
 109         87 => 0,
 110         101 => 0,
 111         102 => 0,
 112         122 => 1,
 113         157 => 0,
 114         166 => 1,
 115         169 => 1,
 116         200 => 0,
 117         216 => 1,
 118         217 => 1,
 119         219 => 1,
 120         241 => 1,
 121         242 => 1,
 122         243 => 1,
 123 );
 124
 125 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 126 my $c = $tradition->collation;
 127 foreach my $row ( @{$data->{'variants'}} ) {
 128         # Account for rows that used to be "not useful"
 129         unless( exists $expected_genealogical{$row->{'id'}} ) {
 130                 $expected_genealogical{$row->{'id'}} = 1;
 131         }
 132         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 133         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 134                 "Got correct genealogical flag for row " . $row->{'id'} );
 135         # Check that we have the right row with the right groups
 136         my $rank = $row->{'id'};
 137         foreach my $rdghash ( @{$row->{'readings'}} ) {
 138                 # Skip 'readings' that aren't really
 139                 next unless $c->reading( $rdghash->{'readingid'} );
 140                 # Check the rank
 141                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 142                         "Got correct reading rank" );
 143                 # Check the witnesses
 144                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 145                 my @sgrp = sort @{$rdghash->{'group'}};
 146                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 147         }
 148 }
 149 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 150 # TODO Make something meaningful of conflict count, maybe test other bits
 151
 152 =end testing
 153
 154 =cut
 155
 156 sub run_analysis {
 157         my( $tradition, %opts ) = @_;
 158         my $c = $tradition->collation;
 159         my $aclabel = $c->ac_label;
 160
 161         my $stemma_id = $opts{'stemma_id'} || 0;
 162         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 163         my $collapse = Set::Scalar->new();
 164         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 165                 $collapse->insert( @{$opts{'merge_types'}} );
 166         } elsif( $opts{'merge_types'} ) {
 167                 $collapse->insert( $opts{'merge_types'} );
 168         }
 169
 170         # Make sure we have a lookup DB for graph calculation results; this will die
 171         # if calcdir or calcdsn isn't passed.
 172         my $dir;
 173         if( exists $opts{'calcdir'} ) {
 174                 $dir = delete $opts{'calcdir'}
 175         } elsif ( exists $opts{'calcdsn'} ) {
 176                 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 177         }
 178
 179         # Get the stemma
 180         my $stemma = $tradition->stemma( $stemma_id );
 181
 182         # Figure out which witnesses we are working with - that is, the ones that
 183         # appear both in the stemma and in the tradition. All others are 'lacunose'
 184         # for our purposes.
 185         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 186         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 187         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 188         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 189
 190         # Find and mark 'common' ranks for exclusion, unless they were
 191         # explicitly specified.
 192         unless( @ranks ) {
 193                 my %common_rank;
 194                 foreach my $rdg ( $c->common_readings ) {
 195                         $common_rank{$rdg->rank} = 1;
 196                 }
 197                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 198         }
 199
 200         # Group the variants to send to the solver
 201         my @groups;
 202         my @use_ranks;
 203         my %lacunae;
 204         my $moved = {};
 205         foreach my $rank ( @ranks ) {
 206                 my $missing = $lacunose->clone();
 207                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 208                 # Filter out any empty rankgroups
 209                 # (e.g. from the later rank for a transposition)
 210                 next unless keys %$rankgroup;
 211                 # Get the graph for this rankgroup
 212                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 213                 if( $opts{'exclude_type1'} ) {
 214                         # Check to see whether this is a "useful" group.
 215                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 216                 }
 217                 push( @use_ranks, $rank );
 218                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 219                 $lacunae{$rank} = $missing;
 220         }
 221         # Run the solver
 222         my $answer;
 223         try {
 224                 $answer = solve_variants( $dir, @groups );
 225         } catch ( Text::Tradition::Error $e ) {
 226                 if( $e->message =~ /IDP/ ) {
 227                         # Something is wrong with the solver; make the variants table anyway
 228                         $answer->{'variants'} = [];
 229                         map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
 230                                 @groups;
 231                 } else {
 232                         # Something else is wrong; error out.
 233                         $e->throw;
 234                 }
 235         }
 236
 237         # Do further analysis on the answer
 238         my $conflict_count = 0;
 239         my $reversion_count = 0;
 240         foreach my $idx ( 0 .. $#use_ranks ) {
 241                 my $location = $answer->{'variants'}->[$idx];
 242                 # Add the rank back in
 243                 my $rank = $use_ranks[$idx];
 244                 $location->{'id'} = $rank;
 245                 # Note what our lacunae are
 246                 my %lmiss;
 247                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 248                 $location->{'missing'} = [ keys %lmiss ];
 249
 250                 # Run the extra analysis we need.
 251                 ## TODO We run through all the variants in this call, so
 252                 ## why not add the reading data there instead of here below?
 253                 my $graph = $groups[$idx]->{graph};
 254                 analyze_location( $tradition, $graph, $location, \%lmiss );
 255
 256                 my @layerwits;
 257                 # Do the final post-analysis tidying up of the data.
 258                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 259                         $conflict_count++ if $rdghash->{'is_conflict'};
 260                         $reversion_count++ if $rdghash->{'is_reverted'};
 261                         # Add the reading text back in, setting display value as needed
 262                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 263                         if( $rdg ) {
 264                                 $rdghash->{'text'} = $rdg->text .
 265                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 266                                 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
 267                                         $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 268                                         $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 269                                 }
 270                         }
 271                         # Remove lacunose witnesses from this reading's list now that the
 272                         # analysis is done
 273                         my @realgroup;
 274                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 275                         $rdghash->{'group'} = \@realgroup;
 276                         # Note any layered witnesses that appear in this group
 277                         foreach( @realgroup ) {
 278                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 279                                         push( @layerwits, $1 );
 280                                 }
 281                         }
 282                 }
 283                 $location->{'layerwits'} = \@layerwits if @layerwits;
 284         }
 285         $answer->{'conflict_count'} = $conflict_count;
 286         $answer->{'reversion_count'} = $reversion_count;
 287
 288         return $answer;
 289 }
 290
 291 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 292
 293 Groups the variants at the given $rank of the collation, treating any
 294 relationships in the set $merge_relationship_types as equivalent.
 295 $lacunose should be a reference to an array, to which the sigla of lacunose
 296 witnesses at this rank will be appended; $transposed should be a reference
 297 to a hash, wherein the identities of transposed readings and their
 298 relatives will be stored.
 299
 300 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 301 in $group_readings->{$rdg}.
 302
 303 =cut
 304
 305 # Return group_readings, groups, lacunose
 306 sub group_variants {
 307         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 308         my $c = $tradition->collation;
 309         my $aclabel = $c->ac_label;
 310         my $table = $c->alignment_table;
 311         # Get the alignment table readings
 312         my %readings_at_rank;
 313         my $check_for_gaps = Set::Scalar->new();
 314         my %moved_wits;
 315         my $has_transposition;
 316         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 317                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 318                 my $wit = $tablewit->{'witness'};
 319                 # Exclude the witness if it is "lacunose" which if we got here
 320                 # means "not in the stemma".
 321                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 322                 # Note if the witness is actually in a lacuna
 323                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 324                         _add_to_witlist( $wit, $lacunose, $aclabel );
 325                 # Otherwise the witness either has a positive reading...
 326                 } elsif( $rdg ) {
 327                         # If the reading has been counted elsewhere as a transposition, ignore it.
 328                         if( $transposed->{$rdg->{'t'}->id} ) {
 329                                 # TODO Does this cope with three-way transpositions?
 330                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 331                                 next;
 332                         }
 333                         # Otherwise, record it...
 334                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 335                         # ...and grab any transpositions, and their relations.
 336                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 337                         foreach my $trdg ( @transp ) {
 338                                 next if exists $readings_at_rank{$trdg->id};
 339                                 $has_transposition = 1;
 340                                 my @affected_wits = _table_witnesses(
 341                                         $table, $trdg, $lacunose, $aclabel );
 342                                 next unless @affected_wits;
 343                                 map { $moved_wits{$_} = 1 } @affected_wits;
 344                                 $transposed->{$trdg->id} =
 345                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 346                                 $readings_at_rank{$trdg->id} = $trdg;
 347                         }
 348                 # ...or it is empty, ergo a gap.
 349                 } else {
 350                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 351                 }
 352         }
 353         my $gap_wits = Set::Scalar->new();
 354         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 355                 unless $moved_wits{$_} } $check_for_gaps->members;
 356
 357         # Group the readings, collapsing groups by relationship if needed.
 358         my $grouped_readings = {};
 359         foreach my $rdg ( values %readings_at_rank ) {
 360                 # Skip readings that have been collapsed into others.
 361                 next if exists $grouped_readings->{$rdg->id}
 362                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 363                 # Get the witness list, including from readings collapsed into this one.
 364                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 365                 if( $collapse && $collapse->size ) {
 366                         my $filter = sub { $collapse->has( $_[0]->type ) };
 367                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 368                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 369                                 push( @wits, @otherwits );
 370                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 371                         }
 372                 }
 373                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 374         }
 375         if( $gap_wits->members ) {
 376                 $grouped_readings->{'(omitted)'} = $gap_wits;
 377         }
 378
 379         # Get rid of our collapsed readings
 380         map { delete $grouped_readings->{$_} if(
 381                          $grouped_readings->{$_} eq 'COLLAPSE'
 382                          || $grouped_readings->{$_}->is_empty ) }
 383                 keys %$grouped_readings;
 384
 385         # If something was transposed, check the groups for doubled-up readings
 386         if( $has_transposition ) {
 387                 # print STDERR "Group for rank $rank:\n";
 388                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 389                 #       keys %$grouped_readings;
 390                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 391         }
 392
 393         # Return the result
 394         return $grouped_readings;
 395 }
 396
 397 # Helper function to query the alignment table for all witnesses (a.c. included)
 398 # that have a given reading at its rank.
 399 sub _table_witnesses {
 400         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 401         my $tableidx = $trdg->rank - 1;
 402         my $has_reading = Set::Scalar->new();
 403         foreach my $row ( @{$table->{'alignment'}} ) {
 404                 my $wit = $row->{'witness'};
 405                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 406                 my $rdg = $row->{'tokens'}->[$tableidx];
 407                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 408                 _add_to_witlist( $wit, $has_reading, $aclabel )
 409                         if $rdg->{'t'}->id eq $trdg->id;
 410         }
 411         return $has_reading->members;
 412 }
 413
 414 # Helper function to see if a witness is lacunose even if we are asking about
 415 # the a.c. version
 416 sub _is_lacunose {
 417         my ( $wit, $lac, $acstr ) = @_;
 418         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 419                 $wit = $1;
 420         }
 421         return $lac->has( $wit );
 422 }
 423
 424 # Helper function to ensure that X and X a.c. never appear in the same list.
 425 sub _add_to_witlist {
 426         my( $wit, $list, $acstr ) = @_;
 427         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 428                 # Don't add X a.c. if we already have X
 429                 return if $list->has( $1 );
 430         } else {
 431                 # Delete X a.c. if we are about to add X
 432                 $list->delete( $wit.$acstr );
 433         }
 434         $list->insert( $wit );
 435 }
 436
 437 sub _check_transposed_consistency {
 438         my( $c, $rank, $transposed, $groupings ) = @_;
 439         my %seen_wits;
 440         my %thisrank;
 441         # Note which readings are actually at this rank, and which witnesses
 442         # belong to which reading.
 443         foreach my $rdg ( keys %$groupings ) {
 444                 my $rdgobj = $c->reading( $rdg );
 445                 # Count '(omitted)' as a reading at this rank
 446                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 447                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 448         }
 449         # Our work is done if we have no witness belonging to more than one
 450         # reading.
 451         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 452         return unless @doubled;
 453         # If we have a symmetric related transposition, drop the non-rank readings.
 454         if( @doubled == scalar keys %seen_wits ) {
 455                 foreach my $rdg ( keys %$groupings ) {
 456                         if( !$thisrank{$rdg} ) {
 457                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 458                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 459                                         keys %thisrank;
 460                                 delete $groupings->{$rdg};
 461                                 # If we found a group match, assume there is a symmetry happening.
 462                                 # TODO think more about this
 463                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 464                                 unless( $matched ) {
 465                                         delete $transposed->{$rdg};
 466                                         warn "Found problem in evident symmetry with reading $rdg";
 467                                 }
 468                         }
 469                 }
 470         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 471         } else {
 472                 foreach my $dup ( @doubled ) {
 473                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 474                                 next if $thisrank{$rdg};
 475                                 next unless exists $groupings->{$rdg};
 476                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 477                                 delete $groupings->{$rdg};
 478                                 delete $transposed->{$rdg};
 479                         }
 480                 }
 481                 # and put any now-orphaned readings into an 'omitted' reading.
 482                 foreach my $wit ( keys %seen_wits ) {
 483                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 484                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 485                                          unless exists $groupings->{'(omitted)'};
 486                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 487                         }
 488                 }
 489         }
 490 }
 491
 492 # For the given grouping, return its situation graph based on the stemma.
 493 sub _graph_for_grouping {
 494         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 495         my $acwits = [];
 496         my $extant = {};
 497         foreach my $gs ( values %$grouping ) {
 498                 map {
 499                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 500                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 501                         } else {
 502                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 503                         }
 504                 } $gs->members;
 505         }
 506         my $graph;
 507         try {
 508                 # contig contains all extant wits and all hypothetical wits
 509                 # needed to make up the groups.
 510                 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
 511         } catch ( Text::Tradition::Error $e ) {
 512                 throw( "Could not extend graph with given extant and a.c. witnesses: "
 513                         . $e->message );
 514         } catch {
 515                 throw( "Could not extend graph with a.c. witnesses @$acwits" );
 516         }
 517         return $graph;
 518 }
 519
 520 =head2 solve_variants( $calcdir, @groups )
 521
 522 Looks up the set of groups in the answers provided by the external graph solver
 523 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 524
 525 The answer has the form
 526   { "variants" => [ array of variant location structures ],
 527     "variant_count" => total,
 528     "conflict_count" => number of conflicts detected,
 529     "genealogical_count" => number of solutions found }
 530
 531 =cut
 532
 533 sub solve_variants {
 534         my( @groups ) = @_;
 535
 536         # Are we using a local result directory, or did we pass an empty value
 537         # for the directory?
 538         my $dir;
 539         unless( ref( $groups[0] ) eq 'HASH' ) {
 540                 $dir = shift @groups;
 541         }
 542
 543         ## For each graph/group combo, make a Text::Tradition::Analysis::Result
 544         ## object so that we can send it off for IDP lookup.
 545         my $variants = [];
 546         my $genealogical = 0; # counter
 547         # TODO Optimize for unique graph problems
 548         my %problems;
 549         foreach my $graphproblem ( @groups ) {
 550                 # Construct the calc result key and look up its answer
 551                 my $problem = Text::Tradition::Analysis::Result->new(
 552                         graph => $graphproblem->{'graph'},
 553                         setlist => [ values %{$graphproblem->{'grouping'}} ] );
 554                 if( exists $problems{$problem->object_key} ) {
 555                         $problem = $problems{$problem->object_key};
 556                 } else {
 557                         $problems{$problem->object_key} = $problem;
 558                 }
 559                 $graphproblem->{'object'} = $problem;
 560         }
 561
 562         my %results;
 563         if( $dir ) {
 564                 my $scope = $dir->new_scope;
 565                 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
 566         } else {
 567                 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
 568                         [ values %problems ] );
 569                 # Send it off and get the result
 570                 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
 571                 my $ua = LWP::UserAgent->new();
 572                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 573                                                           'Content' => $json );
 574                 my $answer;
 575                 if( $resp->is_success ) {
 576                         $answer = decode_json( $resp->content );
 577                         throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
 578                 } else {
 579                         throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
 580                                 . "; cannot run graph analysis" );
 581                 }
 582                 # One more sanity check
 583                 throw( "Something went wrong with answer symmetricity" )
 584                         unless keys( %problems ) == @$answer;
 585                 # Convert the results
 586                 foreach my $a ( @$answer ) {
 587                         my $r = Text::Tradition::Analysis::Result->new( $a );
 588                         $results{$r->object_key} = $r;
 589                 }
 590         }
 591
 592         # We now have a single JSON-encoded Result object per problem sent. Fold its
 593         # answers into our variant info structure.
 594         foreach my $graphproblem ( @groups ) {
 595                 my $result = $results{$graphproblem->{'object'}->object_key}
 596                         || $graphproblem->{'object'};
 597
 598                 # Initialize the result structure for this graph problem
 599                 my $vstruct;
 600                 if( $result->status eq 'OK' ) {
 601                         $vstruct = { readings => [] };
 602                         push( @$variants, $vstruct );
 603                 } else {
 604                         push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
 605                         next;
 606                 }
 607
 608                 # 1. Did the group evaluate as genealogical?
 609                 $vstruct->{genealogical} = $result->is_genealogical;
 610                 $genealogical++ if $result->is_genealogical;
 611
 612                 # 2. What are the calculated minimum groupings for each variant loc?
 613                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 614                         my $inputset = $graphproblem->{grouping}->{$rid};
 615                         my $minset = $result->minimum_grouping_for( $inputset );
 616                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 617                 }
 618
 619                 # 3. What are the sources and classes calculated for each witness?
 620                 $vstruct->{witcopy_types} = { $result->classes };
 621                 $vstruct->{reading_roots} = {};
 622                 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
 623
 624         }
 625
 626         return { 'variants' => $variants,
 627                          'variant_count' => scalar @$variants,
 628                          'genealogical_count' => $genealogical };
 629 }
 630
 631 sub _init_unsolved {
 632         my( $graphproblem, $status ) = @_;
 633         my $vstruct = { 'readings' => [] };
 634         $vstruct->{'unsolved'} = $status;
 635         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 636                 push( @{$vstruct->{readings}}, { readingid => $rid,
 637                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 638         }
 639         return $vstruct;
 640 }
 641
 642 =head2 analyze_location ( $tradition, $graph, $location_hash )
 643
 644 Given the tradition, its stemma graph, and the solution from the graph solver,
 645 work out the rest of the information we want.  For each reading we need missing,
 646 conflict, reading_parents, independent_occurrence, followed, not_followed,
 647 and follow_unknown.  Alters the location_hash in place.
 648
 649 =cut
 650
 651 sub analyze_location {
 652         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 653         my $c = $tradition->collation;
 654
 655         if( exists $variant_row->{'unsolved'} ) {
 656                 return;
 657         }
 658         my $reading_roots = delete $variant_row->{'reading_roots'};
 659         my $classinfo = delete $variant_row->{'witcopy_types'};
 660
 661         # Make a hash of all known node memberships, and make the subgraphs.
 662         my $contig = {};
 663         my $subgraph = {};
 664         my $acstr = $c->ac_label;
 665         my @acwits;
 666
 667         # Note which witnesses positively belong to which group. This information
 668         # comes ultimately from the IDP solver.
 669         # Also make a note of the reading's roots.
 670     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 671         my $rid = $rdghash->{'readingid'};
 672         my @roots;
 673         foreach my $wit ( @{$rdghash->{'group'}} ) {
 674                 $contig->{$wit} = $rid;
 675             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 676                 push( @acwits, $1 );
 677             }
 678             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 679                 push( @roots, $wit );
 680             }
 681         }
 682                 $rdghash->{'independent_occurrence'} = \@roots;
 683         }
 684
 685         # Now that we have all the node group memberships, calculate followed/
 686     # non-followed/unknown values for each reading.  Also figure out the
 687     # reading's evident parent(s).
 688     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 689         my $rid = $rdghash->{'readingid'};
 690         my $rdg = $c->reading( $rid );
 691         my @roots = @{$rdghash->{'independent_occurrence'}};
 692         my @reversions;
 693         if( $classinfo ) {
 694                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 695                         $rdghash->{'group'}->members;
 696                 $rdghash->{'reversions'} = \@reversions;
 697         }
 698         my @group = @{$rdghash->{'group'}};
 699
 700         # Start figuring things out.
 701         $rdghash->{'followed'} = scalar( @group )
 702                 - ( scalar( @roots ) + scalar( @reversions ) );
 703         # Find the parent readings, if any, of this reading.
 704         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 705                 # Work out relationships between readings and their non-followed parent.
 706                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 707                 $rdghash->{'source_parents'} = $sourceparents;
 708
 709                 if( @reversions ) {
 710                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 711                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 712                         $rdghash->{'reversion_parents'} = $revparents;
 713                 }
 714
 715                 # Find the number of times this reading was altered, and the number of
 716                 # times we're not sure.
 717                 my( %nofollow, %unknownfollow );
 718                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 719                         foreach my $wchild ( $graph->successors( $wit ) ) {
 720                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 721                                         && $contig->{$wchild} ne $rid ) {
 722                                         # It definitely changed here.
 723                                         $nofollow{$wchild} = 1;
 724                                 } elsif( !($contig->{$wchild}) ) {
 725                                         # The child is a hypothetical node not definitely in
 726                                         # any group. Answer is unknown.
 727                                         $unknownfollow{$wchild} = 1;
 728                                 } # else it is either in our group, or it is a non-root node in a
 729                                   # known group and therefore is presumed to have its reading from
 730                                   # its group, not this link.
 731                         }
 732                 }
 733                 $rdghash->{'not_followed'} = keys %nofollow;
 734                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 735
 736                 # Now say whether this reading represents a conflict.
 737                 unless( $variant_row->{'genealogical'} ) {
 738                         $rdghash->{'is_conflict'} = @roots != 1;
 739                         $rdghash->{'is_reverted'} = scalar @reversions;
 740                 }
 741     }
 742 }
 743
 744 sub _find_reading_parents {
 745         my( $rid, $graph, $contig, @list ) = @_;
 746         my $parenthash = {};
 747         foreach my $wit ( @list ) {
 748                 # Look in the stemma graph to find this witness's extant or known-reading
 749                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 750                 my @check = $graph->predecessors( $wit );
 751                 while( @check ) {
 752                         my @next;
 753                         foreach my $wparent( @check ) {
 754                                 my $preading = $contig->{$wparent};
 755                                 if( $preading && $preading ne $rid ) {
 756                                         $parenthash->{$preading} = 1;
 757                                 } else {
 758                                         push( @next, $graph->predecessors( $wparent ) );
 759                                 }
 760                         }
 761                         @check = @next;
 762                 }
 763         }
 764         return $parenthash;
 765 }
 766
 767 sub _resolve_parent_relationships {
 768         my( $c, $rid, $rdg, $rdgparents ) = @_;
 769         foreach my $p ( keys %$rdgparents ) {
 770                 # Resolve the relationship of the parent to the reading, and
 771                 # save it in our hash.
 772                 my $pobj = $c->reading( $p );
 773                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 774                 my $phash = { 'label' => $prep };
 775                 if( $pobj ) {
 776                         my $rel = $c->get_relationship( $p, $rid );
 777                         if( $rel ) {
 778                                 _add_to_hash( $rel, $phash );
 779                         } elsif( $rdg ) {
 780                                 # First check for a transposed relationship
 781                                 if( $rdg->rank != $pobj->rank ) {
 782                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 783                                                 next unless $ti->text eq $rdg->text;
 784                                                 $rel = $c->get_relationship( $ti, $pobj );
 785                                                 if( $rel ) {
 786                                                         _add_to_hash( $rel, $phash, 1 );
 787                                                         last;
 788                                                 }
 789                                         }
 790                                         unless( $rel ) {
 791                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 792                                                         next unless $ti->text eq $pobj->text;
 793                                                         $rel = $c->get_relationship( $ti, $rdg );
 794                                                         if( $rel ) {
 795                                                                 _add_to_hash( $rel, $phash, 1 );
 796                                                                 last;
 797                                                         }
 798                                                 }
 799                                         }
 800                                 }
 801                                 unless( $rel ) {
 802                                         # and then check for sheer word similarity.
 803                                         my $rtext = $rdg->text;
 804                                         my $ptext = $pobj->text;
 805                                         if( similar( $rtext, $ptext ) ) {
 806                                                 # say STDERR "Words $rtext and $ptext judged similar";
 807                                                 $phash->{relation} = { type => 'wordsimilar' };
 808                                         }
 809                                 }
 810                         } else {
 811                                 $phash->{relation} = { type => 'deletion' };
 812                         }
 813                         # Get the attributes of the parent object while we are here
 814                         $phash->{'text'} = $pobj->text if $pobj;
 815                         if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
 816                                 $phash->{'is_nonsense'} = $pobj->is_nonsense;
 817                                 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 818                         }
 819                 } elsif( $p eq '(omitted)' ) {
 820                         $phash->{relation} = { type => 'addition' };
 821                 }
 822                 # Save it
 823                 $rdgparents->{$p} = $phash;
 824         }
 825 }
 826
 827 sub _add_to_hash {
 828         my( $rel, $phash, $is_transposed ) = @_;
 829         $phash->{relation} = { type => $rel->type };
 830         $phash->{relation}->{transposed} = 1 if $is_transposed;
 831         $phash->{relation}->{annotation} = $rel->annotation
 832                 if $rel->has_annotation;
 833 }
 834
 835 =head2 similar( $word1, $word2 )
 836
 837 Use Algorithm::Diff to get a sense of how close the words are to each other.
 838 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 839
 840 =cut
 841
 842 #!/usr/bin/env perl
 843
 844 sub similar {
 845         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 846         my @let1 = split( '', lc( $word1 ) );
 847         my @let2 = split( '', lc( $word2 ) );
 848         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 849         my $mag = 0;
 850         while( $diff->Next ) {
 851                 if( $diff->Same ) {
 852                         # Take off points for longer strings
 853                         my $cs = $diff->Range(1) - 2;
 854                         $cs = 0 if $cs < 0;
 855                         $mag -= $cs;
 856                 } elsif( !$diff->Items(1) ) {
 857                         $mag += $diff->Range(2);
 858                 } elsif( !$diff->Items(2) ) {
 859                         $mag += $diff->Range(1);
 860                 } else {
 861                         # Split the difference for substitutions
 862                         my $c1 = $diff->Range(1) || 1;
 863                         my $c2 = $diff->Range(2) || 1;
 864                         my $cd = ( $c1 + $c2 ) / 2;
 865                         $mag += $cd;
 866                 }
 867         }
 868         return ( $mag <= length( $word1 ) / 2 );
 869 }
 870
 871 sub _useful_variant {
 872         my( $rankgroup, $rankgraph, $acstr ) = @_;
 873
 874         # Sort by group size and return
 875         my $is_useful = 0;
 876         foreach my $rdg ( keys %$rankgroup ) {
 877                 my @wits = $rankgroup->{$rdg}->members;
 878                 if( @wits > 1 ) {
 879                         $is_useful++;
 880                 } else {
 881                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 882                                 || $wits[0] =~ /\Q$acstr\E$/ );
 883                 }
 884         }
 885         return $is_useful > 1;
 886 }
 887
 888 =head2 wit_stringify( $groups )
 889
 890 Takes an array of witness groupings and produces a string like
 891 ['A','B'] / ['C','D','E'] / ['F']
 892
 893 =cut
 894
 895 sub wit_stringify {
 896     my $groups = shift;
 897     my @gst;
 898     # If we were passed an array of witnesses instead of an array of
 899     # groupings, then "group" the witnesses first.
 900     unless( ref( $groups->[0] ) ) {
 901         my $mkgrp = [ $groups ];
 902         $groups = $mkgrp;
 903     }
 904     foreach my $g ( @$groups ) {
 905         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 906     }
 907     return join( ' / ', @gst );
 908 }
 909
 910 1;
 911
 912 sub throw {
 913         Text::Tradition::Error->throw(
 914                 'ident' => 'Analysis error',
 915                 'message' => $_[0],
 916         );
 917 }
 918
 919 =head1 LICENSE
 920
 921 This package is free software and is provided "as is" without express
 922 or implied warranty.  You can redistribute it and/or modify it under
 923 the same terms as Perl itself.
 924
 925 =head1 AUTHOR
 926
 927 Tara L Andrews E<lt>aurum@cpan.orgE<gt>