analysis/lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Encode qw/ decode_utf8 encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ to_json decode_json /;
  10 use LWP::UserAgent;
  11 use Set::Scalar;
  12 use Text::Tradition::Analysis::Result;
  13 use Text::Tradition::Stemma;
  14 use TryCatch;
  15
  16 use vars qw/ @EXPORT_OK $VERSION /;
  17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  18 $VERSION = "1.1";
  19
  20
  21 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  22 my $unsolved_problems = {};
  23
  24 =head1 NAME
  25
  26 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  27
  28 =head1 DESCRIPTION
  29
  30 Text::Tradition is a library for representation and analysis of collated
  31 texts, particularly medieval ones.  Where the Collation is the central
  32 feature of a Tradition, it may also have one or more stemmata associated
  33 with it, and these stemmata may be analyzed. This package provides the
  34 following modules:
  35
  36 =over 4
  37
  38 =item * L<Text::Tradition::HasStemma> - a role that will be composed into
  39 Text::Tradition objects, providing the ability for Text::Tradition::Stemma
  40 objects to be associated with them.
  41
  42 =item * L<Text::Tradition::Stemma> - an object class that represents stemma
  43 hypotheses, both rooted (with a single archetype) and unrooted (e.g.
  44 phylogenetic trees).
  45
  46 =item * Text::Tradition::Analysis (this package). Provides functions for
  47 the analysis of a given stemma against the collation within a given
  48 Tradition.
  49
  50 =back
  51
  52 =head1 SYNOPSIS
  53
  54   use Text::Tradition;
  55   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  56   my $t = Text::Tradition->new(
  57     'name' => 'this is a text',
  58     'input' => 'TEI',
  59     'file' => '/path/to/tei_parallel_seg_file.xml' );
  60   $t->add_stemma( 'dotfile' => $stemmafile );
  61
  62   my $variant_data = run_analysis( $tradition );
  63
  64 =head1 SUBROUTINES
  65
  66 =head2 run_analysis( $tradition, %opts )
  67
  68 Runs the analysis described in analyze_variant_location on every location in the
  69 collation of the given tradition, with the given options. These include:
  70
  71 =over 4
  72
  73 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  74 is 0 (i.e. the first).
  75
  76 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  77
  78 =item * merge_types - Specify a list of relationship types, where related readings
  79 should be treated as identical for the purposes of analysis.
  80
  81 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  82
  83 =back
  84
  85 =begin testing
  86
  87 use Text::Tradition;
  88 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  89
  90 my $datafile = 't/data/florilegium_tei_ps.xml';
  91 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  92                                       'name' => 'test0',
  93                                       'file' => $datafile );
  94 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  95 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  96
  97 my %expected_genealogical = (
  98         1 => 0,
  99         2 => 1,
 100         3 =>  0,
 101         5 =>  0,
 102         7 =>  0,
 103         8 =>  0,
 104         10 => 0,
 105         13 => 1,
 106         33 => 0,
 107         34 => 0,
 108         37 => 0,
 109         60 => 0,
 110         81 => 1,
 111         84 => 0,
 112         87 => 0,
 113         101 => 0,
 114         102 => 0,
 115         122 => 1,
 116         157 => 0,
 117         166 => 1,
 118         169 => 1,
 119         200 => 0,
 120         216 => 1,
 121         217 => 1,
 122         219 => 1,
 123         241 => 1,
 124         242 => 1,
 125         243 => 1,
 126 );
 127
 128 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 129 my $c = $tradition->collation;
 130 foreach my $row ( @{$data->{'variants'}} ) {
 131         # Account for rows that used to be "not useful"
 132         unless( exists $expected_genealogical{$row->{'id'}} ) {
 133                 $expected_genealogical{$row->{'id'}} = 1;
 134         }
 135         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 136         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 137                 "Got correct genealogical flag for row " . $row->{'id'} );
 138         # Check that we have the right row with the right groups
 139         my $rank = $row->{'id'};
 140         foreach my $rdghash ( @{$row->{'readings'}} ) {
 141                 # Skip 'readings' that aren't really
 142                 next unless $c->reading( $rdghash->{'readingid'} );
 143                 # Check the rank
 144                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 145                         "Got correct reading rank" );
 146                 # Check the witnesses
 147                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 148                 my @sgrp = sort @{$rdghash->{'group'}};
 149                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 150         }
 151 }
 152 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 153 # TODO Make something meaningful of conflict count, maybe test other bits
 154
 155 =end testing
 156
 157 =cut
 158
 159 sub run_analysis {
 160         my( $tradition, %opts ) = @_;
 161         my $c = $tradition->collation;
 162         my $aclabel = $c->ac_label;
 163
 164         my $stemma_id = $opts{'stemma_id'} || 0;
 165         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 166         my $collapse = Set::Scalar->new();
 167         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 168                 $collapse->insert( @{$opts{'merge_types'}} );
 169         } elsif( $opts{'merge_types'} ) {
 170                 $collapse->insert( $opts{'merge_types'} );
 171         }
 172
 173         # Make sure we have a lookup DB for graph calculation results; this will die
 174         # if calcdir or calcdsn isn't passed.
 175         my $dir;
 176         if( exists $opts{'calcdir'} ) {
 177                 $dir = delete $opts{'calcdir'}
 178         } elsif ( exists $opts{'calcdsn'} ) {
 179                 eval { require Text::Tradition::Directory };
 180                 if( $@ ) {
 181                         throw( "Could not instantiate a directory for " . $opts{'calcdsn'}
 182                                 . ": $@" );
 183                 }
 184                 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 185         }
 186
 187         # Get the stemma
 188         my $stemma = $tradition->stemma( $stemma_id );
 189
 190         # Figure out which witnesses we are working with - that is, the ones that
 191         # appear both in the stemma and in the tradition. All others are 'lacunose'
 192         # for our purposes.
 193         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 194         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 195         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 196         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 197
 198         # Find and mark 'common' ranks for exclusion, unless they were
 199         # explicitly specified.
 200         unless( @ranks ) {
 201                 my %common_rank;
 202                 foreach my $rdg ( $c->common_readings ) {
 203                         $common_rank{$rdg->rank} = 1;
 204                 }
 205                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 206         }
 207
 208         # Group the variants to send to the solver
 209         my @groups;
 210         my @use_ranks;
 211         my %lacunae;
 212         my $moved = {};
 213         foreach my $rank ( @ranks ) {
 214                 my $missing = $lacunose->clone();
 215                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 216                 # Filter out any empty rankgroups
 217                 # (e.g. from the later rank for a transposition)
 218                 next unless keys %$rankgroup;
 219                 # Get the graph for this rankgroup
 220                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 221                 if( $opts{'exclude_type1'} ) {
 222                         # Check to see whether this is a "useful" group.
 223                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 224                 }
 225                 push( @use_ranks, $rank );
 226                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 227                 $lacunae{$rank} = $missing;
 228         }
 229         # Run the solver
 230         my $answer;
 231         try {
 232                 $answer = solve_variants( $dir, @groups );
 233         } catch ( Text::Tradition::Error $e ) {
 234                 if( $e->message =~ /IDP/ ) {
 235                         # Something is wrong with the solver; make the variants table anyway
 236                         $answer->{'variants'} = [];
 237                         map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
 238                                 @groups;
 239                 } else {
 240                         # Something else is wrong; error out.
 241                         $e->throw;
 242                 }
 243         }
 244
 245         # Do further analysis on the answer
 246         my $conflict_count = 0;
 247         my $reversion_count = 0;
 248         foreach my $idx ( 0 .. $#use_ranks ) {
 249                 my $location = $answer->{'variants'}->[$idx];
 250                 # Add the rank back in
 251                 my $rank = $use_ranks[$idx];
 252                 $location->{'id'} = $rank;
 253                 # Note what our lacunae are
 254                 my %lmiss;
 255                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 256                 $location->{'missing'} = [ keys %lmiss ];
 257
 258                 # Run the extra analysis we need.
 259                 ## TODO We run through all the variants in this call, so
 260                 ## why not add the reading data there instead of here below?
 261                 my $graph = $groups[$idx]->{graph};
 262                 analyze_location( $tradition, $graph, $location, \%lmiss );
 263
 264                 my @layerwits;
 265                 # Do the final post-analysis tidying up of the data.
 266                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 267                         $conflict_count++ if $rdghash->{'is_conflict'};
 268                         $reversion_count++ if $rdghash->{'is_reverted'};
 269                         # Add the reading text back in, setting display value as needed
 270                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 271                         if( $rdg ) {
 272                                 $rdghash->{'text'} = $rdg->text .
 273                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 274                                 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
 275                                         $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 276                                         $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 277                                 }
 278                         }
 279                         # Remove lacunose witnesses from this reading's list now that the
 280                         # analysis is done
 281                         my @realgroup;
 282                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 283                         $rdghash->{'group'} = \@realgroup;
 284                         # Note any layered witnesses that appear in this group
 285                         foreach( @realgroup ) {
 286                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 287                                         push( @layerwits, $1 );
 288                                 }
 289                         }
 290                 }
 291                 $location->{'layerwits'} = \@layerwits if @layerwits;
 292         }
 293         $answer->{'conflict_count'} = $conflict_count;
 294         $answer->{'reversion_count'} = $reversion_count;
 295
 296         return $answer;
 297 }
 298
 299 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 300
 301 Groups the variants at the given $rank of the collation, treating any
 302 relationships in the set $merge_relationship_types as equivalent.
 303 $lacunose should be a reference to an array, to which the sigla of lacunose
 304 witnesses at this rank will be appended; $transposed should be a reference
 305 to a hash, wherein the identities of transposed readings and their
 306 relatives will be stored.
 307
 308 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 309 in $group_readings->{$rdg}.
 310
 311 =cut
 312
 313 # Return group_readings, groups, lacunose
 314 sub group_variants {
 315         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 316         my $c = $tradition->collation;
 317         my $aclabel = $c->ac_label;
 318         my $table = $c->alignment_table;
 319         # Get the alignment table readings
 320         my %readings_at_rank;
 321         my $check_for_gaps = Set::Scalar->new();
 322         my %moved_wits;
 323         my $has_transposition;
 324         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 325                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 326                 my $wit = $tablewit->{'witness'};
 327                 # Exclude the witness if it is "lacunose" which if we got here
 328                 # means "not in the stemma".
 329                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 330                 # Note if the witness is actually in a lacuna
 331                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 332                         _add_to_witlist( $wit, $lacunose, $aclabel );
 333                 # Otherwise the witness either has a positive reading...
 334                 } elsif( $rdg ) {
 335                         # If the reading has been counted elsewhere as a transposition, ignore it.
 336                         if( $transposed->{$rdg->{'t'}->id} ) {
 337                                 # TODO Does this cope with three-way transpositions?
 338                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 339                                 next;
 340                         }
 341                         # Otherwise, record it...
 342                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 343                         # ...and grab any transpositions, and their relations.
 344                         my @transp = grep { $_->rank != $rank } _all_related( $rdg->{'t'} );
 345                         foreach my $trdg ( @transp ) {
 346                                 next if exists $readings_at_rank{$trdg->id};
 347                                 $has_transposition = 1;
 348                                 my @affected_wits = _table_witnesses(
 349                                         $table, $trdg, $lacunose, $aclabel );
 350                                 next unless @affected_wits;
 351                                 map { $moved_wits{$_} = 1 } @affected_wits;
 352                                 $transposed->{$trdg->id} =
 353                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 354                                 $readings_at_rank{$trdg->id} = $trdg;
 355                         }
 356                 # ...or it is empty, ergo a gap.
 357                 } else {
 358                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 359                 }
 360         }
 361         my $gap_wits = Set::Scalar->new();
 362         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 363                 unless $moved_wits{$_} } $check_for_gaps->members;
 364
 365         # Group the readings, collapsing groups by relationship if needed.
 366         my $grouped_readings = {};
 367         foreach my $rdg ( values %readings_at_rank ) {
 368                 # Skip readings that have been collapsed into others.
 369                 next if exists $grouped_readings->{$rdg->id}
 370                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 371                 # Get the witness list, including from readings collapsed into this one.
 372                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 373                 if( $collapse && $collapse->size ) {
 374                         my $filter = sub { $collapse->has( $_[0]->type ) };
 375                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 376                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 377                                 push( @wits, @otherwits );
 378                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 379                         }
 380                 }
 381                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 382         }
 383         if( $gap_wits->members ) {
 384                 $grouped_readings->{'(omitted)'} = $gap_wits;
 385         }
 386
 387         # Get rid of our collapsed readings
 388         map { delete $grouped_readings->{$_} if(
 389                          $grouped_readings->{$_} eq 'COLLAPSE'
 390                          || $grouped_readings->{$_}->is_empty ) }
 391                 keys %$grouped_readings;
 392
 393         # If something was transposed, check the groups for doubled-up readings
 394         if( $has_transposition ) {
 395                 # print STDERR "Group for rank $rank:\n";
 396                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 397                 #       keys %$grouped_readings;
 398                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 399         }
 400
 401         # Return the result
 402         return $grouped_readings;
 403 }
 404
 405 sub _all_related {
 406         # Except by repetition
 407         my $rdg = shift;
 408         my $c = $rdg->collation;
 409         my @check = ( $rdg );
 410         my %seen;
 411         while( @check ) {
 412                 my @next;
 413                 foreach my $ck ( @check ) {
 414                         $seen{"$ck"} = 1;
 415                         push( @next, grep { !$seen{"$_"} }
 416                                 $ck->related_readings( sub { $_[0]->type ne 'repetition' } ) );
 417                 }
 418                 @check = @next;
 419         }
 420
 421
 422         my @all = map { $c->reading( $_ ) } keys %seen;
 423         return @all;
 424 }
 425
 426
 427 # Helper function to query the alignment table for all witnesses (a.c. included)
 428 # that have a given reading at its rank.
 429 sub _table_witnesses {
 430         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 431         my $tableidx = $trdg->rank - 1;
 432         my $has_reading = Set::Scalar->new();
 433         foreach my $row ( @{$table->{'alignment'}} ) {
 434                 my $wit = $row->{'witness'};
 435                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 436                 my $rdg = $row->{'tokens'}->[$tableidx];
 437                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 438                 _add_to_witlist( $wit, $has_reading, $aclabel )
 439                         if $rdg->{'t'}->id eq $trdg->id;
 440         }
 441         return $has_reading->members;
 442 }
 443
 444 # Helper function to see if a witness is lacunose even if we are asking about
 445 # the a.c. version
 446 sub _is_lacunose {
 447         my ( $wit, $lac, $acstr ) = @_;
 448         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 449                 $wit = $1;
 450         }
 451         return $lac->has( $wit );
 452 }
 453
 454 # Helper function to ensure that X and X a.c. never appear in the same list.
 455 sub _add_to_witlist {
 456         my( $wit, $list, $acstr ) = @_;
 457         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 458                 # Don't add X a.c. if we already have X
 459                 return if $list->has( $1 );
 460         } else {
 461                 # Delete X a.c. if we are about to add X
 462                 $list->delete( $wit.$acstr );
 463         }
 464         $list->insert( $wit );
 465 }
 466
 467 sub _check_transposed_consistency {
 468         my( $c, $rank, $transposed, $groupings ) = @_;
 469         my %seen_wits;
 470         my %thisrank;
 471         # Note which readings are actually at this rank, and which witnesses
 472         # belong to which reading.
 473         foreach my $rdg ( keys %$groupings ) {
 474                 my $rdgobj = $c->reading( $rdg );
 475                 # Count '(omitted)' as a reading at this rank
 476                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 477                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 478         }
 479         # Our work is done if we have no witness belonging to more than one
 480         # reading.
 481         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 482         return unless @doubled;
 483         # If we have a symmetric related transposition, drop the non-rank readings.
 484         if( @doubled == scalar keys %seen_wits ) {
 485                 foreach my $rdg ( keys %$groupings ) {
 486                         if( !$thisrank{$rdg} ) {
 487                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 488                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 489                                         keys %thisrank;
 490                                 delete $groupings->{$rdg};
 491                                 # If we found a group match, assume there is a symmetry happening.
 492                                 # TODO think more about this
 493                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 494                                 unless( $matched ) {
 495                                         delete $transposed->{$rdg};
 496                                         warn "Found problem in evident symmetry with reading $rdg";
 497                                 }
 498                         }
 499                 }
 500         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 501         } else {
 502                 foreach my $dup ( @doubled ) {
 503                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 504                                 next if $thisrank{$rdg};
 505                                 next unless exists $groupings->{$rdg};
 506                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 507                                 delete $groupings->{$rdg};
 508                                 delete $transposed->{$rdg};
 509                         }
 510                 }
 511                 # and put any now-orphaned readings into an 'omitted' reading.
 512                 foreach my $wit ( keys %seen_wits ) {
 513                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 514                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 515                                          unless exists $groupings->{'(omitted)'};
 516                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 517                         }
 518                 }
 519         }
 520 }
 521
 522 # For the given grouping, return its situation graph based on the stemma.
 523 sub _graph_for_grouping {
 524         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 525         my $acwits = [];
 526         my $extant = {};
 527         foreach my $gs ( values %$grouping ) {
 528                 map {
 529                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 530                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 531                         } else {
 532                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 533                         }
 534                 } $gs->members;
 535         }
 536         my $graph;
 537         try {
 538                 # contig contains all extant wits and all hypothetical wits
 539                 # needed to make up the groups.
 540                 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
 541         } catch ( Text::Tradition::Error $e ) {
 542                 $DB::single = 1;
 543                 throw( "Could not extend graph with given extant and a.c. witnesses: "
 544                         . $e->message );
 545         } catch {
 546                 throw( "Could not extend graph with a.c. witnesses @$acwits" );
 547         }
 548         return $graph;
 549 }
 550
 551 =head2 solve_variants( $calcdir, @groups )
 552
 553 Looks up the set of groups in the answers provided by the external graph solver
 554 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 555
 556 The answer has the form
 557   { "variants" => [ array of variant location structures ],
 558     "variant_count" => total,
 559     "conflict_count" => number of conflicts detected,
 560     "genealogical_count" => number of solutions found }
 561
 562 =cut
 563
 564 sub solve_variants {
 565         my( @groups ) = @_;
 566
 567         # Are we using a local result directory, or did we pass an empty value
 568         # for the directory?
 569         my $dir;
 570         unless( ref( $groups[0] ) eq 'HASH' ) {
 571                 $dir = shift @groups;
 572         }
 573
 574         ## For each graph/group combo, make a Text::Tradition::Analysis::Result
 575         ## object so that we can send it off for IDP lookup.
 576         my $variants = [];
 577         my $genealogical = 0; # counter
 578         # TODO Optimize for unique graph problems
 579         my %problems;
 580         foreach my $graphproblem ( @groups ) {
 581                 # Construct the calc result key and look up its answer
 582                 my $problem = Text::Tradition::Analysis::Result->new(
 583                         graph => $graphproblem->{'graph'},
 584                         setlist => [ values %{$graphproblem->{'grouping'}} ] );
 585                 if( exists $problems{$problem->object_key} ) {
 586                         $problem = $problems{$problem->object_key};
 587                 } else {
 588                         $problems{$problem->object_key} = $problem;
 589                 }
 590                 $graphproblem->{'object'} = $problem;
 591         }
 592
 593         my %results;
 594         if( $dir ) {
 595                 my $scope = $dir->new_scope;
 596                 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
 597         } else {
 598                 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
 599                         [ values %problems ] );
 600                 # Send it off and get the result
 601                 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
 602                 my $ua = LWP::UserAgent->new();
 603                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 604                                                           'Content' => $json );
 605                 my $answer;
 606                 if( $resp->is_success ) {
 607                         $answer = decode_json( $resp->content );
 608                         throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
 609                 } else {
 610                         throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
 611                                 . "; cannot run graph analysis" );
 612                 }
 613                 # One more sanity check
 614                 throw( "Something went wrong with answer symmetricity" )
 615                         unless keys( %problems ) == @$answer;
 616                 # Convert the results
 617                 foreach my $a ( @$answer ) {
 618                         my $r = Text::Tradition::Analysis::Result->new( $a );
 619                         $results{$r->object_key} = $r;
 620                 }
 621         }
 622
 623         # We now have a single JSON-encoded Result object per problem sent. Fold its
 624         # answers into our variant info structure.
 625         foreach my $graphproblem ( @groups ) {
 626                 my $result = $results{$graphproblem->{'object'}->object_key}
 627                         || $graphproblem->{'object'};
 628
 629                 # Initialize the result structure for this graph problem
 630                 my $vstruct;
 631                 if( $result->status eq 'OK' ) {
 632                         $vstruct = { readings => [] };
 633                         push( @$variants, $vstruct );
 634                 } else {
 635                         push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
 636                         next;
 637                 }
 638
 639                 # 1. Did the group evaluate as genealogical?
 640                 $vstruct->{genealogical} = $result->is_genealogical;
 641                 $genealogical++ if $result->is_genealogical;
 642
 643                 # 2. What are the calculated minimum groupings for each variant loc?
 644                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 645                         my $inputset = $graphproblem->{grouping}->{$rid};
 646                         my $minset = $result->minimum_grouping_for( $inputset );
 647                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 648                 }
 649
 650                 # 3. What are the sources and classes calculated for each witness?
 651                 $vstruct->{witcopy_types} = { $result->classes };
 652                 $vstruct->{reading_roots} = {};
 653                 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
 654
 655         }
 656
 657         return { 'variants' => $variants,
 658                          'variant_count' => scalar @$variants,
 659                          'genealogical_count' => $genealogical };
 660 }
 661
 662 sub _init_unsolved {
 663         my( $graphproblem, $status ) = @_;
 664         my $vstruct = { 'readings' => [] };
 665         $vstruct->{'unsolved'} = $status;
 666         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 667                 push( @{$vstruct->{readings}}, { readingid => $rid,
 668                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 669         }
 670         return $vstruct;
 671 }
 672
 673 =head2 analyze_location ( $tradition, $graph, $location_hash )
 674
 675 Given the tradition, its stemma graph, and the solution from the graph solver,
 676 work out the rest of the information we want.  For each reading we need missing,
 677 conflict, reading_parents, independent_occurrence, followed, not_followed,
 678 and follow_unknown.  Alters the location_hash in place.
 679
 680 =cut
 681
 682 sub analyze_location {
 683         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 684         my $c = $tradition->collation;
 685
 686         if( exists $variant_row->{'unsolved'} ) {
 687                 return;
 688         }
 689         my $reading_roots = delete $variant_row->{'reading_roots'};
 690         my $classinfo = delete $variant_row->{'witcopy_types'};
 691
 692         # Make a hash of all known node memberships, and make the subgraphs.
 693         my $contig = {};
 694         my $subgraph = {};
 695         my $acstr = $c->ac_label;
 696         my @acwits;
 697
 698         # Note which witnesses positively belong to which group. This information
 699         # comes ultimately from the IDP solver.
 700         # Also make a note of the reading's roots.
 701     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 702         my $rid = $rdghash->{'readingid'};
 703         my @roots;
 704         foreach my $wit ( @{$rdghash->{'group'}} ) {
 705                 $contig->{$wit} = $rid;
 706             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 707                 push( @acwits, $1 );
 708             }
 709             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 710                 push( @roots, $wit );
 711             }
 712         }
 713                 $rdghash->{'independent_occurrence'} = \@roots;
 714         }
 715
 716         # Now that we have all the node group memberships, calculate followed/
 717     # non-followed/unknown values for each reading.  Also figure out the
 718     # reading's evident parent(s).
 719     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 720         my $rid = $rdghash->{'readingid'};
 721         my $rdg = $c->reading( $rid );
 722         my @roots = @{$rdghash->{'independent_occurrence'}};
 723         my @reversions;
 724         if( $classinfo ) {
 725                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 726                         $rdghash->{'group'}->members;
 727                 $rdghash->{'reversions'} = \@reversions;
 728         }
 729         my @group = @{$rdghash->{'group'}};
 730
 731         # Start figuring things out.
 732         $rdghash->{'followed'} = scalar( @group )
 733                 - ( scalar( @roots ) + scalar( @reversions ) );
 734         # Find the parent readings, if any, of this reading.
 735         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 736                 # Work out relationships between readings and their non-followed parent.
 737                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 738                 $rdghash->{'source_parents'} = $sourceparents;
 739
 740                 if( @reversions ) {
 741                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 742                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 743                         $rdghash->{'reversion_parents'} = $revparents;
 744                 }
 745
 746                 # Find the number of times this reading was altered, and the number of
 747                 # times we're not sure.
 748                 my( %nofollow, %unknownfollow );
 749                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 750                         foreach my $wchild ( $graph->successors( $wit ) ) {
 751                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 752                                         && $contig->{$wchild} ne $rid ) {
 753                                         # It definitely changed here.
 754                                         $nofollow{$wchild} = 1;
 755                                 } elsif( !($contig->{$wchild}) ) {
 756                                         # The child is a hypothetical node not definitely in
 757                                         # any group. Answer is unknown.
 758                                         $unknownfollow{$wchild} = 1;
 759                                 } # else it is either in our group, or it is a non-root node in a
 760                                   # known group and therefore is presumed to have its reading from
 761                                   # its group, not this link.
 762                         }
 763                 }
 764                 $rdghash->{'not_followed'} = keys %nofollow;
 765                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 766
 767                 # Now say whether this reading represents a conflict.
 768                 unless( $variant_row->{'genealogical'} ) {
 769                         $rdghash->{'is_conflict'} = @roots != 1;
 770                         $rdghash->{'is_reverted'} = scalar @reversions;
 771                 }
 772     }
 773 }
 774
 775 sub _find_reading_parents {
 776         my( $rid, $graph, $contig, @list ) = @_;
 777         my $parenthash = {};
 778         foreach my $wit ( @list ) {
 779                 # Look in the stemma graph to find this witness's extant or known-reading
 780                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 781                 my @check = $graph->predecessors( $wit );
 782                 while( @check ) {
 783                         my @next;
 784                         foreach my $wparent( @check ) {
 785                                 my $preading = $contig->{$wparent};
 786                                 if( $preading && $preading ne $rid ) {
 787                                         $parenthash->{$preading} = 1;
 788                                 } else {
 789                                         push( @next, $graph->predecessors( $wparent ) );
 790                                 }
 791                         }
 792                         @check = @next;
 793                 }
 794         }
 795         return $parenthash;
 796 }
 797
 798 sub _resolve_parent_relationships {
 799         my( $c, $rid, $rdg, $rdgparents ) = @_;
 800         foreach my $p ( keys %$rdgparents ) {
 801                 # Resolve the relationship of the parent to the reading, and
 802                 # save it in our hash.
 803                 my $pobj = $c->reading( $p );
 804                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 805                 my $phash = { 'label' => $prep };
 806                 if( $pobj ) {
 807                         my $rel = $c->get_relationship( $p, $rid );
 808                         if( $rel ) {
 809                                 _add_to_hash( $rel, $phash );
 810                         } elsif( $rdg ) {
 811                                 # First check for a transposed relationship
 812                                 if( $rdg->rank != $pobj->rank ) {
 813                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 814                                                 next unless $ti->text eq $rdg->text;
 815                                                 $rel = $c->get_relationship( $ti, $pobj );
 816                                                 if( $rel ) {
 817                                                         _add_to_hash( $rel, $phash, 1 );
 818                                                         last;
 819                                                 }
 820                                         }
 821                                         unless( $rel ) {
 822                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 823                                                         next unless $ti->text eq $pobj->text;
 824                                                         $rel = $c->get_relationship( $ti, $rdg );
 825                                                         if( $rel ) {
 826                                                                 _add_to_hash( $rel, $phash, 1 );
 827                                                                 last;
 828                                                         }
 829                                                 }
 830                                         }
 831                                 }
 832                                 unless( $rel ) {
 833                                         # and then check for sheer word similarity.
 834                                         my $rtext = $rdg->text;
 835                                         my $ptext = $pobj->text;
 836                                         if( similar( $rtext, $ptext ) ) {
 837                                                 # say STDERR "Words $rtext and $ptext judged similar";
 838                                                 $phash->{relation} = { type => 'wordsimilar' };
 839                                         }
 840                                 }
 841                         } else {
 842                                 $phash->{relation} = { type => 'deletion' };
 843                         }
 844                         # Get the attributes of the parent object while we are here
 845                         $phash->{'text'} = $pobj->text if $pobj;
 846                         if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
 847                                 $phash->{'is_nonsense'} = $pobj->is_nonsense;
 848                                 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 849                         }
 850                 } elsif( $p eq '(omitted)' ) {
 851                         # Check to see if the reading in question is a repetition.
 852                         my @reps = $rdg->related_readings( 'repetition' );
 853                         if( @reps ) {
 854                                 $phash->{relation} = { type => 'repetition',
 855                                         annotation => "of reading @reps" };
 856                         } else {
 857                                 $phash->{relation} = { type => 'addition' };
 858                         }
 859                 }
 860                 # Save it
 861                 $rdgparents->{$p} = $phash;
 862         }
 863 }
 864
 865 sub _add_to_hash {
 866         my( $rel, $phash, $is_transposed ) = @_;
 867         $phash->{relation} = { type => $rel->type };
 868         $phash->{relation}->{transposed} = 1 if $is_transposed;
 869         $phash->{relation}->{annotation} = $rel->annotation
 870                 if $rel->has_annotation;
 871 }
 872
 873 =head2 similar( $word1, $word2 )
 874
 875 Use Algorithm::Diff to get a sense of how close the words are to each other.
 876 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 877
 878 =cut
 879
 880 #!/usr/bin/env perl
 881
 882 sub similar {
 883         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 884         my @let1 = split( '', lc( $word1 ) );
 885         my @let2 = split( '', lc( $word2 ) );
 886         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 887         my $mag = 0;
 888         while( $diff->Next ) {
 889                 if( $diff->Same ) {
 890                         # Take off points for longer strings
 891                         my $cs = $diff->Range(1) - 2;
 892                         $cs = 0 if $cs < 0;
 893                         $mag -= $cs;
 894                 } elsif( !$diff->Items(1) ) {
 895                         $mag += $diff->Range(2);
 896                 } elsif( !$diff->Items(2) ) {
 897                         $mag += $diff->Range(1);
 898                 } else {
 899                         # Split the difference for substitutions
 900                         my $c1 = $diff->Range(1) || 1;
 901                         my $c2 = $diff->Range(2) || 1;
 902                         my $cd = ( $c1 + $c2 ) / 2;
 903                         $mag += $cd;
 904                 }
 905         }
 906         return ( $mag <= length( $word1 ) / 2 );
 907 }
 908
 909 sub _useful_variant {
 910         my( $rankgroup, $rankgraph, $acstr ) = @_;
 911
 912         # Sort by group size and return
 913         my $is_useful = 0;
 914         foreach my $rdg ( keys %$rankgroup ) {
 915                 my @wits = $rankgroup->{$rdg}->members;
 916                 if( @wits > 1 ) {
 917                         $is_useful++;
 918                 } else {
 919                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 920                                 || $wits[0] =~ /\Q$acstr\E$/ );
 921                 }
 922         }
 923         return $is_useful > 1;
 924 }
 925
 926 =head2 wit_stringify( $groups )
 927
 928 Takes an array of witness groupings and produces a string like
 929 ['A','B'] / ['C','D','E'] / ['F']
 930
 931 =cut
 932
 933 sub wit_stringify {
 934     my $groups = shift;
 935     my @gst;
 936     # If we were passed an array of witnesses instead of an array of
 937     # groupings, then "group" the witnesses first.
 938     unless( ref( $groups->[0] ) ) {
 939         my $mkgrp = [ $groups ];
 940         $groups = $mkgrp;
 941     }
 942     foreach my $g ( @$groups ) {
 943         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 944     }
 945     return join( ' / ', @gst );
 946 }
 947
 948 1;
 949
 950 sub throw {
 951         Text::Tradition::Error->throw(
 952                 'ident' => 'Analysis error',
 953                 'message' => $_[0],
 954         );
 955 }
 956
 957 =head1 LICENSE
 958
 959 This package is free software and is provided "as is" without express
 960 or implied warranty.  You can redistribute it and/or modify it under
 961 the same terms as Perl itself.
 962
 963 =head1 AUTHOR
 964
 965 Tara L Andrews E<lt>aurum@cpan.orgE<gt>