lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Encode qw/ decode_utf8 encode_utf8 /;
   7 use Exporter 'import';
   8 use Graph;
   9 use JSON qw/ to_json decode_json /;
  10 use LWP::UserAgent;
  11 use Set::Scalar;
  12 use Text::Tradition::Analysis::Result;
  13 use Text::Tradition::Directory;
  14 use Text::Tradition::Stemma;
  15 use TryCatch;
  16
  17 use vars qw/ @EXPORT_OK /;
  18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  19
  20 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  21 my $unsolved_problems = {};
  22
  23 =head1 NAME
  24
  25 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  26
  27 =head1 SYNOPSIS
  28
  29   use Text::Tradition;
  30   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  31   my $t = Text::Tradition->new(
  32     'name' => 'this is a text',
  33     'input' => 'TEI',
  34     'file' => '/path/to/tei_parallel_seg_file.xml' );
  35   $t->add_stemma( 'dotfile' => $stemmafile );
  36
  37   my $variant_data = run_analysis( $tradition );
  38   # Recalculate rank $n treating all orthographic variants as equivalent
  39   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  40
  41 =head1 DESCRIPTION
  42
  43 Text::Tradition is a library for representation and analysis of collated
  44 texts, particularly medieval ones.  The Collation is the central feature of
  45 a Tradition, where the text, its sequence of readings, and its relationships
  46 between readings are actually kept.
  47
  48 =head1 SUBROUTINES
  49
  50 =head2 run_analysis( $tradition, %opts )
  51
  52 Runs the analysis described in analyze_variant_location on every location in the
  53 collation of the given tradition, with the given options. These include:
  54
  55 =over 4
  56
  57 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  58 is 0 (i.e. the first).
  59
  60 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  61
  62 =item * merge_types - Specify a list of relationship types, where related readings
  63 should be treated as identical for the purposes of analysis.
  64
  65 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  66
  67 =back
  68
  69 =begin testing
  70
  71 use Text::Tradition;
  72 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  73
  74 my $datafile = 't/data/florilegium_tei_ps.xml';
  75 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  76                                       'name' => 'test0',
  77                                       'file' => $datafile );
  78 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  79 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  80
  81 my %expected_genealogical = (
  82         1 => 0,
  83         2 => 1,
  84         3 =>  0,
  85         5 =>  0,
  86         7 =>  0,
  87         8 =>  0,
  88         10 => 0,
  89         13 => 1,
  90         33 => 0,
  91         34 => 0,
  92         37 => 0,
  93         60 => 0,
  94         81 => 1,
  95         84 => 0,
  96         87 => 0,
  97         101 => 0,
  98         102 => 0,
  99         122 => 1,
 100         157 => 0,
 101         166 => 1,
 102         169 => 1,
 103         200 => 0,
 104         216 => 1,
 105         217 => 1,
 106         219 => 1,
 107         241 => 1,
 108         242 => 1,
 109         243 => 1,
 110 );
 111
 112 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
 113 my $c = $tradition->collation;
 114 foreach my $row ( @{$data->{'variants'}} ) {
 115         # Account for rows that used to be "not useful"
 116         unless( exists $expected_genealogical{$row->{'id'}} ) {
 117                 $expected_genealogical{$row->{'id'}} = 1;
 118         }
 119         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 120         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 121                 "Got correct genealogical flag for row " . $row->{'id'} );
 122         # Check that we have the right row with the right groups
 123         my $rank = $row->{'id'};
 124         foreach my $rdghash ( @{$row->{'readings'}} ) {
 125                 # Skip 'readings' that aren't really
 126                 next unless $c->reading( $rdghash->{'readingid'} );
 127                 # Check the rank
 128                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 129                         "Got correct reading rank" );
 130                 # Check the witnesses
 131                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 132                 my @sgrp = sort @{$rdghash->{'group'}};
 133                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 134         }
 135 }
 136 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 137 # TODO Make something meaningful of conflict count, maybe test other bits
 138
 139 =end testing
 140
 141 =cut
 142
 143 sub run_analysis {
 144         my( $tradition, %opts ) = @_;
 145         my $c = $tradition->collation;
 146         my $aclabel = $c->ac_label;
 147
 148         my $stemma_id = $opts{'stemma_id'} || 0;
 149         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 150         my $collapse = Set::Scalar->new();
 151         if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
 152                 $collapse->insert( @{$opts{'merge_types'}} );
 153         } elsif( $opts{'merge_types'} ) {
 154                 $collapse->insert( $opts{'merge_types'} );
 155         }
 156
 157         # Make sure we have a lookup DB for graph calculation results; this will die
 158         # if calcdir or calcdsn isn't passed.
 159         my $dir;
 160         if( exists $opts{'calcdir'} ) {
 161                 $dir = delete $opts{'calcdir'}
 162         } elsif ( exists $opts{'calcdsn'} ) {
 163                 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
 164         }
 165
 166         # Get the stemma
 167         my $stemma = $tradition->stemma( $stemma_id );
 168
 169         # Figure out which witnesses we are working with - that is, the ones that
 170         # appear both in the stemma and in the tradition. All others are 'lacunose'
 171         # for our purposes.
 172         my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
 173         my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
 174         my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
 175         $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
 176
 177         # Find and mark 'common' ranks for exclusion, unless they were
 178         # explicitly specified.
 179         unless( @ranks ) {
 180                 my %common_rank;
 181                 foreach my $rdg ( $c->common_readings ) {
 182                         $common_rank{$rdg->rank} = 1;
 183                 }
 184                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 185         }
 186
 187         # Group the variants to send to the solver
 188         my @groups;
 189         my @use_ranks;
 190         my %lacunae;
 191         my $moved = {};
 192         foreach my $rank ( @ranks ) {
 193                 my $missing = $lacunose->clone();
 194                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
 195                 # Filter out any empty rankgroups
 196                 # (e.g. from the later rank for a transposition)
 197                 next unless keys %$rankgroup;
 198                 # Get the graph for this rankgroup
 199                 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
 200                 if( $opts{'exclude_type1'} ) {
 201                         # Check to see whether this is a "useful" group.
 202                         next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
 203                 }
 204                 push( @use_ranks, $rank );
 205                 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
 206                 $lacunae{$rank} = $missing;
 207         }
 208         # Run the solver
 209         my $answer = solve_variants( $dir, @groups );
 210
 211         # Do further analysis on the answer
 212         my $conflict_count = 0;
 213         my $reversion_count = 0;
 214         foreach my $idx ( 0 .. $#use_ranks ) {
 215                 my $location = $answer->{'variants'}->[$idx];
 216                 # Add the rank back in
 217                 my $rank = $use_ranks[$idx];
 218                 $location->{'id'} = $rank;
 219                 # Note what our lacunae are
 220                 my %lmiss;
 221                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 222                 $location->{'missing'} = [ keys %lmiss ];
 223
 224                 # Run the extra analysis we need.
 225                 ## TODO We run through all the variants in this call, so
 226                 ## why not add the reading data there instead of here below?
 227                 my $graph = $groups[$idx]->{graph};
 228                 analyze_location( $tradition, $graph, $location, \%lmiss );
 229
 230                 my @layerwits;
 231                 # Do the final post-analysis tidying up of the data.
 232                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 233                         $conflict_count++ if $rdghash->{'is_conflict'};
 234                         $reversion_count++ if $rdghash->{'is_reverted'};
 235                         # Add the reading text back in, setting display value as needed
 236                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 237                         if( $rdg ) {
 238                                 $rdghash->{'text'} = $rdg->text .
 239                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 240                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 241                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 242                         }
 243                         # Remove lacunose witnesses from this reading's list now that the
 244                         # analysis is done
 245                         my @realgroup;
 246                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 247                         $rdghash->{'group'} = \@realgroup;
 248                         # Note any layered witnesses that appear in this group
 249                         foreach( @realgroup ) {
 250                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 251                                         push( @layerwits, $1 );
 252                                 }
 253                         }
 254                 }
 255                 $location->{'layerwits'} = \@layerwits if @layerwits;
 256         }
 257         $answer->{'conflict_count'} = $conflict_count;
 258         $answer->{'reversion_count'} = $reversion_count;
 259
 260         return $answer;
 261 }
 262
 263 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
 264
 265 Groups the variants at the given $rank of the collation, treating any
 266 relationships in the set $merge_relationship_types as equivalent.
 267 $lacunose should be a reference to an array, to which the sigla of lacunose
 268 witnesses at this rank will be appended; $transposed should be a reference
 269 to a hash, wherein the identities of transposed readings and their
 270 relatives will be stored.
 271
 272 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 273 in $group_readings->{$rdg}.
 274
 275 =cut
 276
 277 # Return group_readings, groups, lacunose
 278 sub group_variants {
 279         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 280         my $c = $tradition->collation;
 281         my $aclabel = $c->ac_label;
 282         my $table = $c->alignment_table;
 283         # Get the alignment table readings
 284         my %readings_at_rank;
 285         my $check_for_gaps = Set::Scalar->new();
 286         my %moved_wits;
 287         my $has_transposition;
 288         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 289                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 290                 my $wit = $tablewit->{'witness'};
 291                 # Exclude the witness if it is "lacunose" which if we got here
 292                 # means "not in the stemma".
 293                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 294                 # Note if the witness is actually in a lacuna
 295                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 296                         _add_to_witlist( $wit, $lacunose, $aclabel );
 297                 # Otherwise the witness either has a positive reading...
 298                 } elsif( $rdg ) {
 299                         # If the reading has been counted elsewhere as a transposition, ignore it.
 300                         if( $transposed->{$rdg->{'t'}->id} ) {
 301                                 # TODO Does this cope with three-way transpositions?
 302                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 303                                 next;
 304                         }
 305                         # Otherwise, record it...
 306                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 307                         # ...and grab any transpositions, and their relations.
 308                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 309                         foreach my $trdg ( @transp ) {
 310                                 next if exists $readings_at_rank{$trdg->id};
 311                                 $has_transposition = 1;
 312                                 my @affected_wits = _table_witnesses(
 313                                         $table, $trdg, $lacunose, $aclabel );
 314                                 next unless @affected_wits;
 315                                 map { $moved_wits{$_} = 1 } @affected_wits;
 316                                 $transposed->{$trdg->id} =
 317                                         [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
 318                                 $readings_at_rank{$trdg->id} = $trdg;
 319                         }
 320                 # ...or it is empty, ergo a gap.
 321                 } else {
 322                         _add_to_witlist( $wit, $check_for_gaps, $aclabel );
 323                 }
 324         }
 325         my $gap_wits = Set::Scalar->new();
 326         map { _add_to_witlist( $_, $gap_wits, $aclabel )
 327                 unless $moved_wits{$_} } $check_for_gaps->members;
 328
 329         # Group the readings, collapsing groups by relationship if needed.
 330         my $grouped_readings = {};
 331         foreach my $rdg ( values %readings_at_rank ) {
 332                 # Skip readings that have been collapsed into others.
 333                 next if exists $grouped_readings->{$rdg->id}
 334                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 335                 # Get the witness list, including from readings collapsed into this one.
 336                 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
 337                 if( $collapse && $collapse->size ) {
 338                         my $filter = sub { $collapse->has( $_[0]->type ) };
 339                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 340                                 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
 341                                 push( @wits, @otherwits );
 342                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 343                         }
 344                 }
 345                 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
 346         }
 347         if( $gap_wits->members ) {
 348                 $grouped_readings->{'(omitted)'} = $gap_wits;
 349         }
 350
 351         # Get rid of our collapsed readings
 352         map { delete $grouped_readings->{$_} if(
 353                          $grouped_readings->{$_} eq 'COLLAPSE'
 354                          || $grouped_readings->{$_}->is_empty ) }
 355                 keys %$grouped_readings;
 356
 357         # If something was transposed, check the groups for doubled-up readings
 358         if( $has_transposition ) {
 359                 # print STDERR "Group for rank $rank:\n";
 360                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 361                 #       keys %$grouped_readings;
 362                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 363         }
 364
 365         # Return the result
 366         return $grouped_readings;
 367 }
 368
 369 # Helper function to query the alignment table for all witnesses (a.c. included)
 370 # that have a given reading at its rank.
 371 sub _table_witnesses {
 372         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 373         my $tableidx = $trdg->rank - 1;
 374         my $has_reading = Set::Scalar->new();
 375         foreach my $row ( @{$table->{'alignment'}} ) {
 376                 my $wit = $row->{'witness'};
 377                 next if _is_lacunose( $wit, $lacunose, $aclabel );
 378                 my $rdg = $row->{'tokens'}->[$tableidx];
 379                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 380                 _add_to_witlist( $wit, $has_reading, $aclabel )
 381                         if $rdg->{'t'}->id eq $trdg->id;
 382         }
 383         return $has_reading->members;
 384 }
 385
 386 # Helper function to see if a witness is lacunose even if we are asking about
 387 # the a.c. version
 388 sub _is_lacunose {
 389         my ( $wit, $lac, $acstr ) = @_;
 390         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 391                 $wit = $1;
 392         }
 393         return $lac->has( $wit );
 394 }
 395
 396 # Helper function to ensure that X and X a.c. never appear in the same list.
 397 sub _add_to_witlist {
 398         my( $wit, $list, $acstr ) = @_;
 399         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 400                 # Don't add X a.c. if we already have X
 401                 return if $list->has( $1 );
 402         } else {
 403                 # Delete X a.c. if we are about to add X
 404                 $list->delete( $wit.$acstr );
 405         }
 406         $list->insert( $wit );
 407 }
 408
 409 sub _check_transposed_consistency {
 410         my( $c, $rank, $transposed, $groupings ) = @_;
 411         my %seen_wits;
 412         my %thisrank;
 413         # Note which readings are actually at this rank, and which witnesses
 414         # belong to which reading.
 415         foreach my $rdg ( keys %$groupings ) {
 416                 my $rdgobj = $c->reading( $rdg );
 417                 # Count '(omitted)' as a reading at this rank
 418                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 419                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 420         }
 421         # Our work is done if we have no witness belonging to more than one
 422         # reading.
 423         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 424         return unless @doubled;
 425         # If we have a symmetric related transposition, drop the non-rank readings.
 426         if( @doubled == scalar keys %seen_wits ) {
 427                 foreach my $rdg ( keys %$groupings ) {
 428                         if( !$thisrank{$rdg} ) {
 429                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 430                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 431                                         keys %thisrank;
 432                                 delete $groupings->{$rdg};
 433                                 # If we found a group match, assume there is a symmetry happening.
 434                                 # TODO think more about this
 435                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 436                                 unless( $matched ) {
 437                                         delete $transposed->{$rdg};
 438                                         warn "Found problem in evident symmetry with reading $rdg";
 439                                 }
 440                         }
 441                 }
 442         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 443         } else {
 444                 foreach my $dup ( @doubled ) {
 445                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 446                                 next if $thisrank{$rdg};
 447                                 next unless exists $groupings->{$rdg};
 448                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 449                                 delete $groupings->{$rdg};
 450                                 delete $transposed->{$rdg};
 451                         }
 452                 }
 453                 # and put any now-orphaned readings into an 'omitted' reading.
 454                 foreach my $wit ( keys %seen_wits ) {
 455                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 456                                 $groupings->{'(omitted)'} = Set::Scalar->new()
 457                                          unless exists $groupings->{'(omitted)'};
 458                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 459                         }
 460                 }
 461         }
 462 }
 463
 464 # For the given grouping, return its situation graph based on the stemma.
 465 sub _graph_for_grouping {
 466         my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
 467         my $acwits = [];
 468         my $extant = {};
 469         foreach my $gs ( values %$grouping ) {
 470                 map {
 471                         if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 472                                 push( @$acwits, $1 ) unless $lacunose->has( $1 );
 473                         } else {
 474                                 $extant->{$_} = 1 unless $lacunose->has( $_ );
 475                         }
 476                 } $gs->members;
 477         }
 478         my $graph;
 479         try {
 480                 # contig contains all extant wits and all hypothetical wits
 481                 # needed to make up the groups.
 482                 $graph = $stemma->situation_graph( $extant, $acwits );
 483         } catch ( Text::Tradition::Error $e ) {
 484                 throw( "Could not extend graph with given extant and a.c. witnesses: "
 485                         . $e->message );
 486         } catch {
 487                 throw( "Could not extend graph with a.c. witnesses @$acwits" );
 488         }
 489         return $graph;
 490 }
 491
 492 =head2 solve_variants( $calcdir, @groups )
 493
 494 Looks up the set of groups in the answers provided by the external graph solver
 495 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 496
 497 The answer has the form
 498   { "variants" => [ array of variant location structures ],
 499     "variant_count" => total,
 500     "conflict_count" => number of conflicts detected,
 501     "genealogical_count" => number of solutions found }
 502
 503 =cut
 504
 505 sub solve_variants {
 506         my( @groups ) = @_;
 507
 508         # Are we using a local result directory, or did we pass an empty value
 509         # for the directory?
 510         my $dir;
 511         unless( ref( $groups[0] ) eq 'HASH' ) {
 512                 $dir = shift @groups;
 513         }
 514
 515         ## For each graph/group combo, make a Text::Tradition::Analysis::Result
 516         ## object so that we can send it off for IDP lookup.
 517         my $variants = [];
 518         my $genealogical = 0; # counter
 519         # TODO Optimize for unique graph problems
 520         my @problems;
 521         foreach my $graphproblem ( @groups ) {
 522                 # Construct the calc result key and look up its answer
 523                 my $problem = Text::Tradition::Analysis::Result->new(
 524                         graph => $graphproblem->{'graph'},
 525                         setlist => [ values %{$graphproblem->{'grouping'}} ] );
 526                 push( @problems, $problem );
 527         }
 528
 529         my @results;
 530         if( $dir ) {
 531                 my $scope = $dir->new_scope;
 532                 @results = map { $dir->lookup( $_->object_key ) || $_ } @problems;
 533         } else {
 534                 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode( \@problems );
 535                 # Send it off and get the result
 536                 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
 537                 my $ua = LWP::UserAgent->new();
 538                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 539                                                           'Content' => $json );
 540                 my $answer;
 541                 if( $resp->is_success ) {
 542                         $answer = decode_json( $resp->content );
 543                         throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
 544                 } else {
 545                         throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
 546                                 . "; cannot run graph analysis" );
 547                 }
 548                 # One more sanity check
 549                 throw( "Something went wrong with answer symmetricity" )
 550                         unless @groups == @$answer;
 551                 # Convert the results
 552                 @results = map { Text::Tradition::Analysis::Result->new( $_ ) } @$answer;
 553         }
 554
 555         # We now have a single JSON-encoded Result object per problem sent. Fold its
 556         # answers into our variant info structure.
 557         foreach my $idx ( 0 .. $#groups ) {
 558                 my $graphproblem = $groups[$idx];
 559                 my $result = $results[$idx];
 560
 561                 # Initialize the result structure for this graph problem
 562                 my $vstruct = { readings => [] };
 563                 push( @$variants, $vstruct );
 564
 565                 # 0. Do we have a calculated result at all?
 566                 unless( $result->status eq 'OK' ) {
 567                         $vstruct->{'unsolved'} = $result->status;
 568                         foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 569                                 push( @{$vstruct->{readings}}, { readingid => $rid,
 570                                         group => [ $graphproblem->{grouping}->{$rid}->members ] } );
 571                         }
 572                         next;
 573                 }
 574
 575                 # 1. Did the group evaluate as genealogical?
 576                 $vstruct->{genealogical} = $result->is_genealogical;
 577                 $genealogical++ if $result->is_genealogical;
 578
 579                 # 2. What are the calculated minimum groupings for each variant loc?
 580                 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
 581                         my $inputset = $graphproblem->{grouping}->{$rid};
 582                         my $minset = $result->minimum_grouping_for( $inputset );
 583                         push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
 584                 }
 585
 586                 # 3. What are the sources and classes calculated for each witness?
 587                 $vstruct->{witcopy_types} = { $result->classes };
 588                 $vstruct->{reading_roots} = {};
 589                 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
 590
 591         }
 592
 593         return { 'variants' => $variants,
 594                          'variant_count' => scalar @$variants,
 595                          'genealogical_count' => $genealogical };
 596 }
 597
 598 =head2 analyze_location ( $tradition, $graph, $location_hash )
 599
 600 Given the tradition, its stemma graph, and the solution from the graph solver,
 601 work out the rest of the information we want.  For each reading we need missing,
 602 conflict, reading_parents, independent_occurrence, followed, not_followed,
 603 and follow_unknown.  Alters the location_hash in place.
 604
 605 =cut
 606
 607 sub analyze_location {
 608         my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
 609         my $c = $tradition->collation;
 610
 611         if( exists $variant_row->{'unsolved'} ) {
 612                 return;
 613         }
 614         my $reading_roots = delete $variant_row->{'reading_roots'};
 615         my $classinfo = delete $variant_row->{'witcopy_types'};
 616
 617         # Make a hash of all known node memberships, and make the subgraphs.
 618         my $contig = {};
 619         my $subgraph = {};
 620         my $acstr = $c->ac_label;
 621         my @acwits;
 622
 623         # Note which witnesses positively belong to which group. This information
 624         # comes ultimately from the IDP solver.
 625         # Also make a note of the reading's roots.
 626     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 627         my $rid = $rdghash->{'readingid'};
 628         my @roots;
 629         foreach my $wit ( @{$rdghash->{'group'}} ) {
 630                 $contig->{$wit} = $rid;
 631             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 632                 push( @acwits, $1 );
 633             }
 634             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 635                 push( @roots, $wit );
 636             }
 637         }
 638                 $rdghash->{'independent_occurrence'} = \@roots;
 639         }
 640
 641         # Now that we have all the node group memberships, calculate followed/
 642     # non-followed/unknown values for each reading.  Also figure out the
 643     # reading's evident parent(s).
 644     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 645         my $rid = $rdghash->{'readingid'};
 646         my $rdg = $c->reading( $rid );
 647         my @roots = @{$rdghash->{'independent_occurrence'}};
 648         my @reversions;
 649         if( $classinfo ) {
 650                 @reversions = grep { $classinfo->{$_} eq 'revert' }
 651                         $rdghash->{'group'}->members;
 652                 $rdghash->{'reversions'} = \@reversions;
 653         }
 654         my @group = @{$rdghash->{'group'}};
 655
 656         # Start figuring things out.
 657         $rdghash->{'followed'} = scalar( @group )
 658                 - ( scalar( @roots ) + scalar( @reversions ) );
 659         # Find the parent readings, if any, of this reading.
 660         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
 661                 # Work out relationships between readings and their non-followed parent.
 662                 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
 663                 $rdghash->{'source_parents'} = $sourceparents;
 664
 665                 if( @reversions ) {
 666                         my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
 667                         _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
 668                         $rdghash->{'reversion_parents'} = $revparents;
 669                 }
 670
 671                 # Find the number of times this reading was altered, and the number of
 672                 # times we're not sure.
 673                 my( %nofollow, %unknownfollow );
 674                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 675                         foreach my $wchild ( $graph->successors( $wit ) ) {
 676                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 677                                         && $contig->{$wchild} ne $rid ) {
 678                                         # It definitely changed here.
 679                                         $nofollow{$wchild} = 1;
 680                                 } elsif( !($contig->{$wchild}) ) {
 681                                         # The child is a hypothetical node not definitely in
 682                                         # any group. Answer is unknown.
 683                                         $unknownfollow{$wchild} = 1;
 684                                 } # else it is either in our group, or it is a non-root node in a
 685                                   # known group and therefore is presumed to have its reading from
 686                                   # its group, not this link.
 687                         }
 688                 }
 689                 $rdghash->{'not_followed'} = keys %nofollow;
 690                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 691
 692                 # Now say whether this reading represents a conflict.
 693                 unless( $variant_row->{'genealogical'} ) {
 694                         $rdghash->{'is_conflict'} = @roots != 1;
 695                         $rdghash->{'is_reverted'} = scalar @reversions;
 696                 }
 697     }
 698 }
 699
 700 sub _find_reading_parents {
 701         my( $rid, $graph, $contig, @list ) = @_;
 702         my $parenthash = {};
 703         foreach my $wit ( @list ) {
 704                 # Look in the stemma graph to find this witness's extant or known-reading
 705                 # immediate ancestor(s), and look up the reading that each ancestor holds.
 706                 my @check = $graph->predecessors( $wit );
 707                 while( @check ) {
 708                         my @next;
 709                         foreach my $wparent( @check ) {
 710                                 my $preading = $contig->{$wparent};
 711                                 if( $preading && $preading ne $rid ) {
 712                                         $parenthash->{$preading} = 1;
 713                                 } else {
 714                                         push( @next, $graph->predecessors( $wparent ) );
 715                                 }
 716                         }
 717                         @check = @next;
 718                 }
 719         }
 720         return $parenthash;
 721 }
 722
 723 sub _resolve_parent_relationships {
 724         my( $c, $rid, $rdg, $rdgparents ) = @_;
 725         foreach my $p ( keys %$rdgparents ) {
 726                 # Resolve the relationship of the parent to the reading, and
 727                 # save it in our hash.
 728                 my $pobj = $c->reading( $p );
 729                 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 730                 my $phash = { 'label' => $prep };
 731                 if( $pobj ) {
 732                         my $rel = $c->get_relationship( $p, $rid );
 733                         if( $rel ) {
 734                                 _add_to_hash( $rel, $phash );
 735                         } elsif( $rdg ) {
 736                                 # First check for a transposed relationship
 737                                 if( $rdg->rank != $pobj->rank ) {
 738                                         foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 739                                                 next unless $ti->text eq $rdg->text;
 740                                                 $rel = $c->get_relationship( $ti, $pobj );
 741                                                 if( $rel ) {
 742                                                         _add_to_hash( $rel, $phash, 1 );
 743                                                         last;
 744                                                 }
 745                                         }
 746                                         unless( $rel ) {
 747                                                 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 748                                                         next unless $ti->text eq $pobj->text;
 749                                                         $rel = $c->get_relationship( $ti, $rdg );
 750                                                         if( $rel ) {
 751                                                                 _add_to_hash( $rel, $phash, 1 );
 752                                                                 last;
 753                                                         }
 754                                                 }
 755                                         }
 756                                 }
 757                                 unless( $rel ) {
 758                                         # and then check for sheer word similarity.
 759                                         my $rtext = $rdg->text;
 760                                         my $ptext = $pobj->text;
 761                                         if( similar( $rtext, $ptext ) ) {
 762                                                 # say STDERR "Words $rtext and $ptext judged similar";
 763                                                 $phash->{relation} = { type => 'wordsimilar' };
 764                                         }
 765                                 }
 766                         } else {
 767                                 $phash->{relation} = { type => 'deletion' };
 768                         }
 769                         # Get the attributes of the parent object while we are here
 770                         $phash->{'text'} = $pobj->text if $pobj;
 771                         $phash->{'is_nonsense'} = $pobj->is_nonsense;
 772                         $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 773                 } elsif( $p eq '(omitted)' ) {
 774                         $phash->{relation} = { type => 'addition' };
 775                 }
 776                 # Save it
 777                 $rdgparents->{$p} = $phash;
 778         }
 779 }
 780
 781 sub _add_to_hash {
 782         my( $rel, $phash, $is_transposed ) = @_;
 783         $phash->{relation} = { type => $rel->type };
 784         $phash->{relation}->{transposed} = 1 if $is_transposed;
 785         $phash->{relation}->{annotation} = $rel->annotation
 786                 if $rel->has_annotation;
 787 }
 788
 789 =head2 similar( $word1, $word2 )
 790
 791 Use Algorithm::Diff to get a sense of how close the words are to each other.
 792 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 793
 794 =cut
 795
 796 #!/usr/bin/env perl
 797
 798 sub similar {
 799         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 800         my @let1 = split( '', lc( $word1 ) );
 801         my @let2 = split( '', lc( $word2 ) );
 802         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 803         my $mag = 0;
 804         while( $diff->Next ) {
 805                 if( $diff->Same ) {
 806                         # Take off points for longer strings
 807                         my $cs = $diff->Range(1) - 2;
 808                         $cs = 0 if $cs < 0;
 809                         $mag -= $cs;
 810                 } elsif( !$diff->Items(1) ) {
 811                         $mag += $diff->Range(2);
 812                 } elsif( !$diff->Items(2) ) {
 813                         $mag += $diff->Range(1);
 814                 } else {
 815                         # Split the difference for substitutions
 816                         my $c1 = $diff->Range(1) || 1;
 817                         my $c2 = $diff->Range(2) || 1;
 818                         my $cd = ( $c1 + $c2 ) / 2;
 819                         $mag += $cd;
 820                 }
 821         }
 822         return ( $mag <= length( $word1 ) / 2 );
 823 }
 824
 825 sub _prune_group {
 826         my( $group, $graph ) = @_;
 827         my $relevant = {};
 828         # Record the existence of the vertices in the group
 829         map { $relevant->{$_} = 1 } @$group;
 830         # Make our subgraph
 831         my $subgraph = $graph->deep_copy;
 832         map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
 833                 $subgraph->vertices;
 834         # Now prune and return the remaining vertices.
 835         _prune_subtree( $subgraph );
 836         # Return the list of vertices and the list of roots.
 837         my $pruned_group = [ sort $subgraph->vertices ];
 838         my $pruned_roots = [ $subgraph->predecessorless_vertices ];
 839         return( $pruned_group, $pruned_roots );
 840 }
 841
 842 sub _prune_subtree {
 843         my( $tree ) = @_;
 844
 845         # Delete lacunose witnesses that have no successors
 846         my @orphan_hypotheticals;
 847         my $ctr = 0;
 848         do {
 849                 throw( "Infinite loop on leaves" ) if $ctr > 100;
 850                 @orphan_hypotheticals =
 851                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
 852                                 $tree->successorless_vertices;
 853                 $tree->delete_vertices( @orphan_hypotheticals );
 854                 $ctr++;
 855         } while( @orphan_hypotheticals );
 856
 857         # Delete lacunose roots that have a single successor
 858         my @redundant_root;
 859         $ctr = 0;
 860         do {
 861                 throw( "Infinite loop on roots" ) if $ctr > 100;
 862                 @redundant_root =
 863                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
 864                                    && $tree->successors( $_ ) == 1 }
 865                                 $tree->predecessorless_vertices;
 866                 $tree->delete_vertices( @redundant_root );
 867                 $ctr++;
 868         } while( @redundant_root );
 869 }
 870
 871 sub _useful_variant {
 872         my( $rankgroup, $rankgraph, $acstr ) = @_;
 873
 874         # Sort by group size and return
 875         my $is_useful = 0;
 876         foreach my $rdg ( keys %$rankgroup ) {
 877                 my @wits = $rankgroup->{$rdg}->members;
 878                 if( @wits > 1 ) {
 879                         $is_useful++;
 880                 } else {
 881                         $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
 882                                 || $wits[0] =~ /\Q$acstr\E$/ );
 883                 }
 884         }
 885         return $is_useful > 1;
 886 }
 887
 888 =head2 wit_stringify( $groups )
 889
 890 Takes an array of witness groupings and produces a string like
 891 ['A','B'] / ['C','D','E'] / ['F']
 892
 893 =cut
 894
 895 sub wit_stringify {
 896     my $groups = shift;
 897     my @gst;
 898     # If we were passed an array of witnesses instead of an array of
 899     # groupings, then "group" the witnesses first.
 900     unless( ref( $groups->[0] ) ) {
 901         my $mkgrp = [ $groups ];
 902         $groups = $mkgrp;
 903     }
 904     foreach my $g ( @$groups ) {
 905         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
 906     }
 907     return join( ' / ', @gst );
 908 }
 909
 910 1;
 911
 912 sub throw {
 913         Text::Tradition::Error->throw(
 914                 'ident' => 'Analysis error',
 915                 'message' => $_[0],
 916         );
 917 }
 918
 919 =head1 LICENSE
 920
 921 This package is free software and is provided "as is" without express
 922 or implied warranty.  You can redistribute it and/or modify it under
 923 the same terms as Perl itself.
 924
 925 =head1 AUTHOR
 926
 927 Tara L Andrews E<lt>aurum@cpan.orgE<gt>