lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Encode qw/ decode_utf8 encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ to_json decode_json /;
  10 use LWP::UserAgent;
  11 use Set::Scalar;
  12 use Text::Tradition::Analysis::Result;
  13 use Text::Tradition::Directory;
  14 use Text::Tradition::Stemma;
  15 use TryCatch;
  16
  17 use vars qw/ @EXPORT_OK /;
  18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  19
  20 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  21 my $unsolved_problems = {};
  22
  23 =head1 NAME
  24
  25 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  26
  27 =head1 SYNOPSIS
  28
  29   use Text::Tradition;
  30   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  31   my $t = Text::Tradition->new(
  32     'name' => 'this is a text',
  33     'input' => 'TEI',
  34     'file' => '/path/to/tei_parallel_seg_file.xml' );
  35   $t->add_stemma( 'dotfile' => $stemmafile );
  36
  37   my $variant_data = run_analysis( $tradition );
  38   # Recalculate rank $n treating all orthographic variants as equivalent
  39   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  40
  41 =head1 DESCRIPTION
  42
  43 Text::Tradition is a library for representation and analysis of collated
  44 texts, particularly medieval ones.  The Collation is the central feature of
  45 a Tradition, where the text, its sequence of readings, and its relationships
  46 between readings are actually kept.
  47
  48 =head1 SUBROUTINES
  49
  50 =head2 run_analysis( $tradition, %opts )
  51
  52 Runs the analysis described in analyze_variant_location on every location in the
  53 collation of the given tradition, with the given options. These include:
  54
  55 =over 4
  56
  57 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  58 is 0 (i.e. the first).
  59
  60 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  61
  62 =item * merge_types - Specify a list of relationship types, where related readings
  63 should be treated as identical for the purposes of analysis.
  64
  65 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  66
  67 =back
  68
  69 =begin testing
  70
  71 use Text::Tradition;
  72 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  73
  74 my $datafile = 't/data/florilegium_tei_ps.xml';
  75 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  76                                       'name' => 'test0',
  77                                       'file' => $datafile );
  78 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  79 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  80
  81 my %expected_genealogical = (
  82         1 => 0,
  83         2 => 1,
  84         3 =>  0,
  85         5 =>  0,
  86         7 =>  0,
  87         8 =>  0,
  88         10 => 0,
  89         13 => 1,
  90         33 => 0,
  91         34 => 0,
  92         37 => 0,
  93         60 => 0,
  94         81 => 1,
  95         84 => 0,
  96         87 => 0,
  97         101 => 0,
  98         102 => 0,
  99         122 => 1,
 100         157 => 0,
 101         166 => 1,
 102         169 => 1,
 103         200 => 0,
 104         216 => 1,
 105         217 => 1,
 106         219 => 1,
 107         241 => 1,
 108         242 => 1,
 109         243 => 1,
 110 );
 111
 112 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 113 my $c = $tradition->collation;
 114 foreach my $row ( @{$data->{'variants'}} ) {
 115         # Account for rows that used to be "not useful"
 116         unless( exists $expected_genealogical{$row->{'id'}} ) {
 117                 $expected_genealogical{$row->{'id'}} = 1;
 118         }
 119         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 120         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 121                 "Got correct genealogical flag for row " . $row->{'id'} );
 122         # Check that we have the right row with the right groups
 123         my $rank = $row->{'id'};
 124         foreach my $rdghash ( @{$row->{'readings'}} ) {
 125                 # Skip 'readings' that aren't really
 126                 next unless $c->reading( $rdghash->{'readingid'} );
 127                 # Check the rank
 128                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 129                         "Got correct reading rank" );
 130                 # Check the witnesses
 131                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 132                 my @sgrp = sort @{$rdghash->{'group'}};
 133                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 134         }
 135 }
 136 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 137 # TODO Make something meaningful of conflict count, maybe test other bits
 138
 139 =end testing
 140
 141 =cut
 142
 143 sub run_analysis {
 144         my( $tradition, %opts ) = @_;
 145         my $c = $tradition->collation;
 146         my $aclabel = $c->ac_label;
 147
 148         my $stemma_id = $opts{'stemma_id'} || 0;
 149         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 150         my $collapse = Set::Scalar->new();
 151         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 152                 $collapse->insert( @{$opts{'merge_types'}} );
 153         } elsif( $opts{'merge_types'} ) {
 154                 $collapse->insert( $opts{'merge_types'} );
 155         }
 156
 157         # Make sure we have a lookup DB for graph calculation results; this will die
 158         # if calcdir or calcdsn isn't passed.
 159         my $dir;
 160         if( exists $opts{'calcdir'} ) {
 161                 $dir = delete $opts{'calcdir'}
 162         } elsif ( exists $opts{'calcdsn'} ) {
 163                 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 164         }
 165
 166         # Get the stemma
 167         my $stemma = $tradition->stemma( $stemma_id );
 168
 169         # Figure out which witnesses we are working with - that is, the ones that
 170         # appear both in the stemma and in the tradition. All others are 'lacunose'
 171         # for our purposes.
 172         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 173         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 174         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 175         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 176
 177         # Find and mark 'common' ranks for exclusion, unless they were
 178         # explicitly specified.
 179         unless( @ranks ) {
 180                 my %common_rank;
 181                 foreach my $rdg ( $c->common_readings ) {
 182                         $common_rank{$rdg->rank} = 1;
 183                 }
 184                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 185         }
 186
 187         # Group the variants to send to the solver
 188         my @groups;
 189         my @use_ranks;
 190         my %lacunae;
 191         my $moved = {};
 192         foreach my $rank ( @ranks ) {
 193                 my $missing = $lacunose->clone();
 194                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 195                 # Filter out any empty rankgroups
 196                 # (e.g. from the later rank for a transposition)
 197                 next unless keys %$rankgroup;
 198                 # Get the graph for this rankgroup
 199                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 200                 if( $opts{'exclude_type1'} ) {
 201                         # Check to see whether this is a "useful" group.
 202                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 203                 }
 204                 push( @use_ranks, $rank );
 205                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 206                 $lacunae{$rank} = $missing;
 207         }
 208         # Run the solver
 209         my $answer = solve_variants( $dir, @groups );
 210
 211         # Do further analysis on the answer
 212         my $conflict_count = 0;
 213         my $reversion_count = 0;
 214         foreach my $idx ( 0 .. $#use_ranks ) {
 215                 my $location = $answer->{'variants'}->[$idx];
 216                 # Add the rank back in
 217                 my $rank = $use_ranks[$idx];
 218                 $location->{'id'} = $rank;
 219                 # Note what our lacunae are
 220                 my %lmiss;
 221                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 222                 $location->{'missing'} = [ keys %lmiss ];
 223
 224                 # Run the extra analysis we need.
 225                 ## TODO We run through all the variants in this call, so
 226                 ## why not add the reading data there instead of here below?
 227                 my $graph = $groups[$idx]->{graph};
 228                 analyze_location( $tradition, $graph, $location, \%lmiss );
 229
 230                 my @layerwits;
 231                 # Do the final post-analysis tidying up of the data.
 232                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 233                         $conflict_count++ if $rdghash->{'is_conflict'};
 234                         $reversion_count++ if $rdghash->{'is_reverted'};
 235                         # Add the reading text back in, setting display value as needed
 236                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 237                         if( $rdg ) {
 238                                 $rdghash->{'text'} = $rdg->text .
 239                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 240                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 241                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 242                         }
 243                         # Remove lacunose witnesses from this reading's list now that the
 244                         # analysis is done
 245                         my @realgroup;
 246                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 247                         $rdghash->{'group'} = \@realgroup;
 248                         # Note any layered witnesses that appear in this group
 249                         foreach( @realgroup ) {
 250                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 251                                         push( @layerwits, $1 );
 252                                 }
 253                         }
 254                 }
 255                 $location->{'layerwits'} = \@layerwits if @layerwits;
 256         }
 257         $answer->{'conflict_count'} = $conflict_count;
 258         $answer->{'reversion_count'} = $reversion_count;
 259
 260         return $answer;
 261 }
 262
 263 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 264
 265 Groups the variants at the given $rank of the collation, treating any
 266 relationships in the set $merge_relationship_types as equivalent.
 267 $lacunose should be a reference to an array, to which the sigla of lacunose
 268 witnesses at this rank will be appended; $transposed should be a reference
 269 to a hash, wherein the identities of transposed readings and their
 270 relatives will be stored.
 271
 272 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 273 in $group_readings->{$rdg}.
 274
 275 =cut
 276
 277 # Return group_readings, groups, lacunose
 278 sub group_variants {
 279         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 280         my $c = $tradition->collation;
 281         my $aclabel = $c->ac_label;
 282         my $table = $c->alignment_table;
 283         # Get the alignment table readings
 284         my %readings_at_rank;
 285         my $check_for_gaps = Set::Scalar->new();
 286         my %moved_wits;
 287         my $has_transposition;
 288         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 289                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 290                 my $wit = $tablewit->{'witness'};
 291                 # Exclude the witness if it is "lacunose" which if we got here
 292                 # means "not in the stemma".
 293                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 294                 # Note if the witness is actually in a lacuna
 295                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 296                         _add_to_witlist( $wit, $lacunose, $aclabel );
 297                 # Otherwise the witness either has a positive reading...
 298                 } elsif( $rdg ) {
 299                         # If the reading has been counted elsewhere as a transposition, ignore it.
 300                         if( $transposed->{$rdg->{'t'}->id} ) {
 301                                 # TODO Does this cope with three-way transpositions?
 302                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 303                                 next;
 304                         }
 305                         # Otherwise, record it...
 306                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 307                         # ...and grab any transpositions, and their relations.
 308                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 309                         foreach my $trdg ( @transp ) {
 310                                 next if exists $readings_at_rank{$trdg->id};
 311                                 $has_transposition = 1;
 312                                 my @affected_wits = _table_witnesses(
 313                                         $table, $trdg, $lacunose, $aclabel );
 314                                 next unless @affected_wits;
 315                                 map { $moved_wits{$_} = 1 } @affected_wits;
 316                                 $transposed->{$trdg->id} =
 317                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 318                                 $readings_at_rank{$trdg->id} = $trdg;
 319                         }
 320                 # ...or it is empty, ergo a gap.
 321                 } else {
 322                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 323                 }
 324         }
 325         my $gap_wits = Set::Scalar->new();
 326         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 327                 unless $moved_wits{$_} } $check_for_gaps->members;
 328
 329         # Group the readings, collapsing groups by relationship if needed.
 330         my $grouped_readings = {};
 331         foreach my $rdg ( values %readings_at_rank ) {
 332                 # Skip readings that have been collapsed into others.
 333                 next if exists $grouped_readings->{$rdg->id}
 334                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 335                 # Get the witness list, including from readings collapsed into this one.
 336                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 337                 if( $collapse && $collapse->size ) {
 338                         my $filter = sub { $collapse->has( $_[0]->type ) };
 339                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 340                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 341                                 push( @wits, @otherwits );
 342                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 343                         }
 344                 }
 345                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 346         }
 347         if( $gap_wits->members ) {
 348                 $grouped_readings->{'(omitted)'} = $gap_wits;
 349         }
 350
 351         # Get rid of our collapsed readings
 352         map { delete $grouped_readings->{$_} if(
 353                          $grouped_readings->{$_} eq 'COLLAPSE'
 354                          || $grouped_readings->{$_}->is_empty ) }
 355                 keys %$grouped_readings;
 356
 357         # If something was transposed, check the groups for doubled-up readings
 358         if( $has_transposition ) {
 359                 # print STDERR "Group for rank $rank:\n";
 360                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 361                 #       keys %$grouped_readings;
 362                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 363         }
 364
 365         # Return the result
 366         return $grouped_readings;
 367 }
 368
 369 # Helper function to query the alignment table for all witnesses (a.c. included)
 370 # that have a given reading at its rank.
 371 sub _table_witnesses {
 372         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 373         my $tableidx = $trdg->rank - 1;
 374         my $has_reading = Set::Scalar->new();
 375         foreach my $row ( @{$table->{'alignment'}} ) {
 376                 my $wit = $row->{'witness'};
 377                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 378                 my $rdg = $row->{'tokens'}->[$tableidx];
 379                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 380                 _add_to_witlist( $wit, $has_reading, $aclabel )
 381                         if $rdg->{'t'}->id eq $trdg->id;
 382         }
 383         return $has_reading->members;
 384 }
 385
 386 # Helper function to see if a witness is lacunose even if we are asking about
 387 # the a.c. version
 388 sub _is_lacunose {
 389         my ( $wit, $lac, $acstr ) = @_;
 390         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 391                 $wit = $1;
 392         }
 393         return $lac->has( $wit );
 394 }
 395
 396 # Helper function to ensure that X and X a.c. never appear in the same list.
 397 sub _add_to_witlist {
 398         my( $wit, $list, $acstr ) = @_;
 399         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 400                 # Don't add X a.c. if we already have X
 401                 return if $list->has( $1 );
 402         } else {
 403                 # Delete X a.c. if we are about to add X
 404                 $list->delete( $wit.$acstr );
 405         }
 406         $list->insert( $wit );
 407 }
 408
 409 sub _check_transposed_consistency {
 410         my( $c, $rank, $transposed, $groupings ) = @_;
 411         my %seen_wits;
 412         my %thisrank;
 413         # Note which readings are actually at this rank, and which witnesses
 414         # belong to which reading.
 415         foreach my $rdg ( keys %$groupings ) {
 416                 my $rdgobj = $c->reading( $rdg );
 417                 # Count '(omitted)' as a reading at this rank
 418                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 419                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 420         }
 421         # Our work is done if we have no witness belonging to more than one
 422         # reading.
 423         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 424         return unless @doubled;
 425         # If we have a symmetric related transposition, drop the non-rank readings.
 426         if( @doubled == scalar keys %seen_wits ) {
 427                 foreach my $rdg ( keys %$groupings ) {
 428                         if( !$thisrank{$rdg} ) {
 429                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 430                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 431                                         keys %thisrank;
 432                                 delete $groupings->{$rdg};
 433                                 # If we found a group match, assume there is a symmetry happening.
 434                                 # TODO think more about this
 435                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 436                                 unless( $matched ) {
 437                                         delete $transposed->{$rdg};
 438                                         warn "Found problem in evident symmetry with reading $rdg";
 439                                 }
 440                         }
 441                 }
 442         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 443         } else {
 444                 foreach my $dup ( @doubled ) {
 445                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 446                                 next if $thisrank{$rdg};
 447                                 next unless exists $groupings->{$rdg};
 448                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 449                                 delete $groupings->{$rdg};
 450                                 delete $transposed->{$rdg};
 451                         }
 452                 }
 453                 # and put any now-orphaned readings into an 'omitted' reading.
 454                 foreach my $wit ( keys %seen_wits ) {
 455                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 456                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 457                                          unless exists $groupings->{'(omitted)'};
 458                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 459                         }
 460                 }
 461         }
 462 }
 463
 464 # For the given grouping, return its situation graph based on the stemma.
 465 sub _graph_for_grouping {
 466         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 467         my $acwits = [];
 468         my $extant = {};
 469         foreach my $gs ( values %$grouping ) {
 470                 map {
 471                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 472                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 473                         } else {
 474                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 475                         }
 476                 } $gs->members;
 477         }
 478         my $graph;
 479         try {
 480                 # contig contains all extant wits and all hypothetical wits
 481                 # needed to make up the groups.
 482                 $graph = $stemma->situation_graph( $extant, $acwits );
 483         } catch ( Text::Tradition::Error $e ) {
 484                 throw( "Could not extend graph with given extant and a.c. witnesses: "
 485                         . $e->message );
 486         } catch {
 487                 throw( "Could not extend graph with a.c. witnesses @$acwits" );
 488         }
 489         return $graph;
 490 }
 491
 492 =head2 solve_variants( $calcdir, @groups )
 493
 494 Looks up the set of groups in the answers provided by the external graph solver
 495 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 496
 497 The answer has the form
 498   { "variants" => [ array of variant location structures ],
 499     "variant_count" => total,
 500     "conflict_count" => number of conflicts detected,
 501     "genealogical_count" => number of solutions found }
 502
 503 =cut
 504
 505 sub solve_variants {
 506         my( @groups ) = @_;
 507
 508         # Are we using a local result directory, or did we pass an empty value
 509         # for the directory?
 510         my $dir;
 511         unless( ref( $groups[0] ) eq 'HASH' ) {
 512                 $dir = shift @groups;
 513         }
 514
 515         ## For each graph/group combo, make a Text::Tradition::Analysis::Result
 516         ## object so that we can send it off for IDP lookup.
 517         my $variants = [];
 518         my $genealogical = 0; # counter
 519         # TODO Optimize for unique graph problems
 520         my %problems;
 521         foreach my $graphproblem ( @groups ) {
 522                 # Construct the calc result key and look up its answer
 523                 my $problem = Text::Tradition::Analysis::Result->new(
 524                         graph => $graphproblem->{'graph'},
 525                         setlist => [ values %{$graphproblem->{'grouping'}} ] );
 526                 if( exists $problems{$problem->object_key} ) {
 527                         $problem = $problems{$problem->object_key};
 528                 } else {
 529                         $problems{$problem->object_key} = $problem;
 530                 }
 531                 $graphproblem->{'object'} = $problem;
 532         }
 533
 534         my %results;
 535         if( $dir ) {
 536                 my $scope = $dir->new_scope;
 537                 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
 538         } else {
 539                 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
 540                         [ values %problems ] );
 541                 # Send it off and get the result
 542                 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
 543                 my $ua = LWP::UserAgent->new();
 544                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 545                                                           'Content' => $json );
 546                 my $answer;
 547                 if( $resp->is_success ) {
 548                         $answer = decode_json( $resp->content );
 549                         throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
 550                 } else {
 551                         throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
 552                                 . "; cannot run graph analysis" );
 553                 }
 554                 # One more sanity check
 555                 throw( "Something went wrong with answer symmetricity" )
 556                         unless keys( %problems ) == @$answer;
 557                 # Convert the results
 558                 foreach my $a ( @$answer ) {
 559                         my $r = Text::Tradition::Analysis::Result->new( $a );
 560                         $results{$r->object_key} = $r;
 561                 }
 562         }
 563
 564         # We now have a single JSON-encoded Result object per problem sent. Fold its
 565         # answers into our variant info structure.
 566         foreach my $graphproblem ( @groups ) {
 567                 my $result = $results{$graphproblem->{'object'}->object_key}
 568                         || $graphproblem->{'object'};
 569
 570                 # Initialize the result structure for this graph problem
 571                 my $vstruct = { readings => [] };
 572                 push( @$variants, $vstruct );
 573
 574                 # 0. Do we have a calculated result at all?
 575                 unless( $result->status eq 'OK' ) {
 576                         $vstruct->{'unsolved'} = $result->status;
 577                         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 578                                 push( @{$vstruct->{readings}}, { readingid => $rid,
 579                                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 580                         }
 581                         next;
 582                 }
 583
 584                 # 1. Did the group evaluate as genealogical?
 585                 $vstruct->{genealogical} = $result->is_genealogical;
 586                 $genealogical++ if $result->is_genealogical;
 587
 588                 # 2. What are the calculated minimum groupings for each variant loc?
 589                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 590                         my $inputset = $graphproblem->{grouping}->{$rid};
 591                         my $minset = $result->minimum_grouping_for( $inputset );
 592                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 593                 }
 594
 595                 # 3. What are the sources and classes calculated for each witness?
 596                 $vstruct->{witcopy_types} = { $result->classes };
 597                 $vstruct->{reading_roots} = {};
 598                 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
 599
 600         }
 601
 602         return { 'variants' => $variants,
 603                          'variant_count' => scalar @$variants,
 604                          'genealogical_count' => $genealogical };
 605 }
 606
 607 =head2 analyze_location ( $tradition, $graph, $location_hash )
 608
 609 Given the tradition, its stemma graph, and the solution from the graph solver,
 610 work out the rest of the information we want.  For each reading we need missing,
 611 conflict, reading_parents, independent_occurrence, followed, not_followed,
 612 and follow_unknown.  Alters the location_hash in place.
 613
 614 =cut
 615
 616 sub analyze_location {
 617         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 618         my $c = $tradition->collation;
 619
 620         if( exists $variant_row->{'unsolved'} ) {
 621                 return;
 622         }
 623         my $reading_roots = delete $variant_row->{'reading_roots'};
 624         my $classinfo = delete $variant_row->{'witcopy_types'};
 625
 626         # Make a hash of all known node memberships, and make the subgraphs.
 627         my $contig = {};
 628         my $subgraph = {};
 629         my $acstr = $c->ac_label;
 630         my @acwits;
 631
 632         # Note which witnesses positively belong to which group. This information
 633         # comes ultimately from the IDP solver.
 634         # Also make a note of the reading's roots.
 635     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 636         my $rid = $rdghash->{'readingid'};
 637         my @roots;
 638         foreach my $wit ( @{$rdghash->{'group'}} ) {
 639                 $contig->{$wit} = $rid;
 640             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 641                 push( @acwits, $1 );
 642             }
 643             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 644                 push( @roots, $wit );
 645             }
 646         }
 647                 $rdghash->{'independent_occurrence'} = \@roots;
 648         }
 649
 650         # Now that we have all the node group memberships, calculate followed/
 651     # non-followed/unknown values for each reading.  Also figure out the
 652     # reading's evident parent(s).
 653     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 654         my $rid = $rdghash->{'readingid'};
 655         my $rdg = $c->reading( $rid );
 656         my @roots = @{$rdghash->{'independent_occurrence'}};
 657         my @reversions;
 658         if( $classinfo ) {
 659                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 660                         $rdghash->{'group'}->members;
 661                 $rdghash->{'reversions'} = \@reversions;
 662         }
 663         my @group = @{$rdghash->{'group'}};
 664
 665         # Start figuring things out.
 666         $rdghash->{'followed'} = scalar( @group )
 667                 - ( scalar( @roots ) + scalar( @reversions ) );
 668         # Find the parent readings, if any, of this reading.
 669         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 670                 # Work out relationships between readings and their non-followed parent.
 671                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 672                 $rdghash->{'source_parents'} = $sourceparents;
 673
 674                 if( @reversions ) {
 675                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 676                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 677                         $rdghash->{'reversion_parents'} = $revparents;
 678                 }
 679
 680                 # Find the number of times this reading was altered, and the number of
 681                 # times we're not sure.
 682                 my( %nofollow, %unknownfollow );
 683                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 684                         foreach my $wchild ( $graph->successors( $wit ) ) {
 685                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 686                                         && $contig->{$wchild} ne $rid ) {
 687                                         # It definitely changed here.
 688                                         $nofollow{$wchild} = 1;
 689                                 } elsif( !($contig->{$wchild}) ) {
 690                                         # The child is a hypothetical node not definitely in
 691                                         # any group. Answer is unknown.
 692                                         $unknownfollow{$wchild} = 1;
 693                                 } # else it is either in our group, or it is a non-root node in a
 694                                   # known group and therefore is presumed to have its reading from
 695                                   # its group, not this link.
 696                         }
 697                 }
 698                 $rdghash->{'not_followed'} = keys %nofollow;
 699                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 700
 701                 # Now say whether this reading represents a conflict.
 702                 unless( $variant_row->{'genealogical'} ) {
 703                         $rdghash->{'is_conflict'} = @roots != 1;
 704                         $rdghash->{'is_reverted'} = scalar @reversions;
 705                 }
 706     }
 707 }
 708
 709 sub _find_reading_parents {
 710         my( $rid, $graph, $contig, @list ) = @_;
 711         my $parenthash = {};
 712         foreach my $wit ( @list ) {
 713                 # Look in the stemma graph to find this witness's extant or known-reading
 714                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 715                 my @check = $graph->predecessors( $wit );
 716                 while( @check ) {
 717                         my @next;
 718                         foreach my $wparent( @check ) {
 719                                 my $preading = $contig->{$wparent};
 720                                 if( $preading && $preading ne $rid ) {
 721                                         $parenthash->{$preading} = 1;
 722                                 } else {
 723                                         push( @next, $graph->predecessors( $wparent ) );
 724                                 }
 725                         }
 726                         @check = @next;
 727                 }
 728         }
 729         return $parenthash;
 730 }
 731
 732 sub _resolve_parent_relationships {
 733         my( $c, $rid, $rdg, $rdgparents ) = @_;
 734         foreach my $p ( keys %$rdgparents ) {
 735                 # Resolve the relationship of the parent to the reading, and
 736                 # save it in our hash.
 737                 my $pobj = $c->reading( $p );
 738                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 739                 my $phash = { 'label' => $prep };
 740                 if( $pobj ) {
 741                         my $rel = $c->get_relationship( $p, $rid );
 742                         if( $rel ) {
 743                                 _add_to_hash( $rel, $phash );
 744                         } elsif( $rdg ) {
 745                                 # First check for a transposed relationship
 746                                 if( $rdg->rank != $pobj->rank ) {
 747                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 748                                                 next unless $ti->text eq $rdg->text;
 749                                                 $rel = $c->get_relationship( $ti, $pobj );
 750                                                 if( $rel ) {
 751                                                         _add_to_hash( $rel, $phash, 1 );
 752                                                         last;
 753                                                 }
 754                                         }
 755                                         unless( $rel ) {
 756                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 757                                                         next unless $ti->text eq $pobj->text;
 758                                                         $rel = $c->get_relationship( $ti, $rdg );
 759                                                         if( $rel ) {
 760                                                                 _add_to_hash( $rel, $phash, 1 );
 761                                                                 last;
 762                                                         }
 763                                                 }
 764                                         }
 765                                 }
 766                                 unless( $rel ) {
 767                                         # and then check for sheer word similarity.
 768                                         my $rtext = $rdg->text;
 769                                         my $ptext = $pobj->text;
 770                                         if( similar( $rtext, $ptext ) ) {
 771                                                 # say STDERR "Words $rtext and $ptext judged similar";
 772                                                 $phash->{relation} = { type => 'wordsimilar' };
 773                                         }
 774                                 }
 775                         } else {
 776                                 $phash->{relation} = { type => 'deletion' };
 777                         }
 778                         # Get the attributes of the parent object while we are here
 779                         $phash->{'text'} = $pobj->text if $pobj;
 780                         $phash->{'is_nonsense'} = $pobj->is_nonsense;
 781                         $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 782                 } elsif( $p eq '(omitted)' ) {
 783                         $phash->{relation} = { type => 'addition' };
 784                 }
 785                 # Save it
 786                 $rdgparents->{$p} = $phash;
 787         }
 788 }
 789
 790 sub _add_to_hash {
 791         my( $rel, $phash, $is_transposed ) = @_;
 792         $phash->{relation} = { type => $rel->type };
 793         $phash->{relation}->{transposed} = 1 if $is_transposed;
 794         $phash->{relation}->{annotation} = $rel->annotation
 795                 if $rel->has_annotation;
 796 }
 797
 798 =head2 similar( $word1, $word2 )
 799
 800 Use Algorithm::Diff to get a sense of how close the words are to each other.
 801 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 802
 803 =cut
 804
 805 #!/usr/bin/env perl
 806
 807 sub similar {
 808         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 809         my @let1 = split( '', lc( $word1 ) );
 810         my @let2 = split( '', lc( $word2 ) );
 811         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 812         my $mag = 0;
 813         while( $diff->Next ) {
 814                 if( $diff->Same ) {
 815                         # Take off points for longer strings
 816                         my $cs = $diff->Range(1) - 2;
 817                         $cs = 0 if $cs < 0;
 818                         $mag -= $cs;
 819                 } elsif( !$diff->Items(1) ) {
 820                         $mag += $diff->Range(2);
 821                 } elsif( !$diff->Items(2) ) {
 822                         $mag += $diff->Range(1);
 823                 } else {
 824                         # Split the difference for substitutions
 825                         my $c1 = $diff->Range(1) || 1;
 826                         my $c2 = $diff->Range(2) || 1;
 827                         my $cd = ( $c1 + $c2 ) / 2;
 828                         $mag += $cd;
 829                 }
 830         }
 831         return ( $mag <= length( $word1 ) / 2 );
 832 }
 833
 834 sub _useful_variant {
 835         my( $rankgroup, $rankgraph, $acstr ) = @_;
 836
 837         # Sort by group size and return
 838         my $is_useful = 0;
 839         foreach my $rdg ( keys %$rankgroup ) {
 840                 my @wits = $rankgroup->{$rdg}->members;
 841                 if( @wits > 1 ) {
 842                         $is_useful++;
 843                 } else {
 844                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 845                                 || $wits[0] =~ /\Q$acstr\E$/ );
 846                 }
 847         }
 848         return $is_useful > 1;
 849 }
 850
 851 =head2 wit_stringify( $groups )
 852
 853 Takes an array of witness groupings and produces a string like
 854 ['A','B'] / ['C','D','E'] / ['F']
 855
 856 =cut
 857
 858 sub wit_stringify {
 859     my $groups = shift;
 860     my @gst;
 861     # If we were passed an array of witnesses instead of an array of
 862     # groupings, then "group" the witnesses first.
 863     unless( ref( $groups->[0] ) ) {
 864         my $mkgrp = [ $groups ];
 865         $groups = $mkgrp;
 866     }
 867     foreach my $g ( @$groups ) {
 868         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 869     }
 870     return join( ' / ', @gst );
 871 }
 872
 873 1;
 874
 875 sub throw {
 876         Text::Tradition::Error->throw(
 877                 'ident' => 'Analysis error',
 878                 'message' => $_[0],
 879         );
 880 }
 881
 882 =head1 LICENSE
 883
 884 This package is free software and is provided "as is" without express
 885 or implied warranty.  You can redistribute it and/or modify it under
 886 the same terms as Perl itself.
 887
 888 =head1 AUTHOR
 889
 890 Tara L Andrews E<lt>aurum@cpan.orgE<gt>