lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Benchmark;
   7 use Encode qw/ encode_utf8 /;
   8 use Exporter 'import';
   9 use Graph;
  10 use JSON qw/ encode_json decode_json /;
  11 use LWP::UserAgent;
  12 use Text::Tradition;
  13 use Text::Tradition::Stemma;
  14 use TryCatch;
  15
  16 use vars qw/ @EXPORT_OK /;
  17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  18
  19 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  20
  21
  22 =head1 NAME
  23
  24 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  25
  26 =head1 SYNOPSIS
  27
  28   use Text::Tradition;
  29   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  30   my $t = Text::Tradition->new(
  31     'name' => 'this is a text',
  32     'input' => 'TEI',
  33     'file' => '/path/to/tei_parallel_seg_file.xml' );
  34   $t->add_stemma( 'dotfile' => $stemmafile );
  35
  36   my $variant_data = run_analysis( $tradition );
  37   # Recalculate rank $n treating all orthographic variants as equivalent
  38   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  39
  40 =head1 DESCRIPTION
  41
  42 Text::Tradition is a library for representation and analysis of collated
  43 texts, particularly medieval ones.  The Collation is the central feature of
  44 a Tradition, where the text, its sequence of readings, and its relationships
  45 between readings are actually kept.
  46
  47 =head1 SUBROUTINES
  48
  49 =head2 run_analysis( $tradition, %opts )
  50
  51 Runs the analysis described in analyze_variant_location on every location in the
  52 collation of the given tradition, with the given options. These include:
  53
  54 =over 4
  55
  56 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  57 is 0 (i.e. the first).
  58
  59 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  60
  61 =item * merge_types - Specify a list of relationship types, where related readings
  62 should be treated as identical for the purposes of analysis.
  63
  64 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  65
  66 =back
  67
  68 =begin testing
  69
  70 use Text::Tradition;
  71 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  72
  73 my $datafile = 't/data/florilegium_tei_ps.xml';
  74 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  75                                       'name' => 'test0',
  76                                       'file' => $datafile );
  77 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  78 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  79
  80 my %expected_genealogical = (
  81         1 => 0,
  82         2 => 1,
  83         3 =>  0,
  84         5 =>  0,
  85         7 =>  0,
  86         8 =>  0,
  87         10 => 0,
  88         13 => 1,
  89         33 => 0,
  90         34 => 0,
  91         37 => 0,
  92         60 => 0,
  93         81 => 1,
  94         84 => 0,
  95         87 => 0,
  96         101 => 0,
  97         102 => 0,
  98         122 => 1,
  99         157 => 0,
 100         166 => 1,
 101         169 => 1,
 102         200 => 0,
 103         216 => 1,
 104         217 => 1,
 105         219 => 1,
 106         241 => 1,
 107         242 => 1,
 108         243 => 1,
 109 );
 110
 111 my $data = run_analysis( $tradition );
 112 my $c = $tradition->collation;
 113 foreach my $row ( @{$data->{'variants'}} ) {
 114         # Account for rows that used to be "not useful"
 115         unless( exists $expected_genealogical{$row->{'id'}} ) {
 116                 $expected_genealogical{$row->{'id'}} = 1;
 117         }
 118         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 119         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 120                 "Got correct genealogical flag for row " . $row->{'id'} );
 121         # Check that we have the right row with the right groups
 122         my $rank = $row->{'id'};
 123         foreach my $rdghash ( @{$row->{'readings'}} ) {
 124                 # Skip 'readings' that aren't really
 125                 next unless $c->reading( $rdghash->{'readingid'} );
 126                 # Check the rank
 127                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 128                         "Got correct reading rank" );
 129                 # Check the witnesses
 130                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 131                 my @sgrp = sort @{$rdghash->{'group'}};
 132                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 133         }
 134 }
 135 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 136 # TODO Make something meaningful of conflict count, maybe test other bits
 137
 138 =end testing
 139
 140 =cut
 141
 142 sub run_analysis {
 143         my( $tradition, %opts ) = @_;
 144         my $c = $tradition->collation;
 145
 146         my $stemma_id = $opts{'stemma_id'} || 0;
 147         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 148         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 149
 150         # Get the stemma
 151         my $stemma = $tradition->stemma( $stemma_id );
 152
 153         # Figure out which witnesses we are working with - that is, the ones that
 154         # appear both in the stemma and in the tradition. All others are 'lacunose'
 155         # for our purposes.
 156         my @lacunose = $stemma->hypotheticals;
 157         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 158         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 159
 160         # Find and mark 'common' ranks for exclusion, unless they were
 161         # explicitly specified.
 162         unless( @ranks ) {
 163                 my %common_rank;
 164                 foreach my $rdg ( $c->common_readings ) {
 165                         $common_rank{$rdg->rank} = 1;
 166                 }
 167                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 168         }
 169
 170         # Group the variants to send to the solver
 171         my @groups;
 172         my @use_ranks;
 173         my %lacunae;
 174         my $moved = {};
 175         foreach my $rank ( @ranks ) {
 176                 my $missing = [ @lacunose ];
 177                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
 178                 # Filter out any empty rankgroups
 179                 # (e.g. from the later rank for a transposition)
 180                 next unless keys %$rankgroup;
 181                 if( $opts{'exclude_type1'} ) {
 182                         # Check to see whether this is a "useful" group.
 183                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 184                                 $stemma->graph, $c->ac_label );
 185                         next unless @$rdgs;
 186                 }
 187                 push( @use_ranks, $rank );
 188                 push( @groups, $rankgroup );
 189                 $lacunae{$rank} = $missing;
 190         }
 191         # Run the solver
 192         my $answer = solve_variants( $stemma, @groups );
 193
 194         # Do further analysis on the answer
 195         my $conflict_count = 0;
 196         my $aclabel = $c->ac_label;
 197         foreach my $idx ( 0 .. $#use_ranks ) {
 198                 my $location = $answer->{'variants'}->[$idx];
 199                 # Add the rank back in
 200                 my $rank = $use_ranks[$idx];
 201                 $location->{'id'} = $rank;
 202                 # Note what our lacunae are
 203                 my %lmiss;
 204                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 205                 $location->{'missing'} = [ keys %lmiss ];
 206
 207                 # Run the extra analysis we need.
 208                 ## TODO We run through all the variants in this call, so
 209                 ## why not add the reading data there instead of here below?
 210                 analyze_location( $tradition, $stemma, $location, \%lmiss );
 211
 212                 my @layerwits;
 213                 # Do the final post-analysis tidying up of the data.
 214                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 215                         $conflict_count++
 216                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 217                         # Add the reading text back in, setting display value as needed
 218                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 219                         if( $rdg ) {
 220                                 $rdghash->{'text'} = $rdg->text .
 221                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 222                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 223                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 224                         }
 225                         # Remove lacunose witnesses from this reading's list now that the
 226                         # analysis is done
 227                         my @realgroup;
 228                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 229                         $rdghash->{'group'} = \@realgroup;
 230                         # Note any layered witnesses that appear in this group
 231                         foreach( @realgroup ) {
 232                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 233                                         push( @layerwits, $1 );
 234                                 }
 235                         }
 236                 }
 237                 $location->{'layerwits'} = \@layerwits if @layerwits;
 238         }
 239         $answer->{'conflict_count'} = $conflict_count;
 240
 241         return $answer;
 242 }
 243
 244 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 245
 246 Groups the variants at the given $rank of the collation, treating any
 247 relationships in @merge_relationship_types as equivalent.  $lacunose should
 248 be a reference to an array, to which the sigla of lacunose witnesses at this
 249 rank will be appended; $transposed should be a reference to a hash, wherein
 250 the identities of transposed readings and their relatives will be stored.
 251
 252 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 253 in $group_readings->{$rdg}.
 254
 255 =cut
 256
 257 # Return group_readings, groups, lacunose
 258 sub group_variants {
 259         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 260         my $c = $tradition->collation;
 261         my $aclabel = $c->ac_label;
 262         my $table = $c->alignment_table;
 263         # Get the alignment table readings
 264         my %readings_at_rank;
 265         my %is_lacunose; # lookup table for witnesses not in stemma
 266         map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
 267         my @check_for_gaps;
 268         my %moved_wits;
 269         my $has_transposition;
 270         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 271                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 272                 my $wit = $tablewit->{'witness'};
 273                 # Exclude the witness if it is "lacunose" which if we got here
 274                 # means "not in the stemma".
 275                 next if $is_lacunose{$wit};
 276                 # Note if the witness is actually in a lacuna
 277                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 278                         _add_to_witlist( $wit, $lacunose, $aclabel );
 279                 # Otherwise the witness either has a positive reading...
 280                 } elsif( $rdg ) {
 281                         # If the reading has been counted elsewhere as a transposition, ignore it.
 282                         if( $transposed->{$rdg->{'t'}->id} ) {
 283                                 # TODO Does this cope with three-way transpositions?
 284                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 285                                 next;
 286                         }
 287                         # Otherwise, record it...
 288                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 289                         # ...and grab any transpositions, and their relations.
 290                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 291                         foreach my $trdg ( @transp ) {
 292                                 next if exists $readings_at_rank{$trdg->id};
 293                                 $has_transposition = 1;
 294                                 my @affected_wits = _table_witnesses(
 295                                         $table, $trdg, \%is_lacunose, $aclabel );
 296                                 next unless @affected_wits;
 297                                 map { $moved_wits{$_} = 1 } @affected_wits;
 298                                 $transposed->{$trdg->id} =
 299                                         [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
 300                                 $readings_at_rank{$trdg->id} = $trdg;
 301                         }
 302                 # ...or it is empty, ergo a gap.
 303                 } else {
 304                         _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
 305                 }
 306         }
 307         my @gap_wits;
 308         map { _add_to_witlist( $_, \@gap_wits, $aclabel )
 309                 unless $moved_wits{$_} } @check_for_gaps;
 310         # Group the readings, collapsing groups by relationship if needed
 311         my $grouped_readings = {};
 312         foreach my $rdg ( values %readings_at_rank ) {
 313                 # Skip readings that have been collapsed into others.
 314                 next if exists $grouped_readings->{$rdg->id}
 315                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 316                 # Get the witness list, including from readings collapsed into this one.
 317                 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
 318                 if( $collapse && @$collapse ) {
 319                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 320                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 321                                 my @otherwits = _table_witnesses(
 322                                         $table, $other, \%is_lacunose, $aclabel );
 323                                 push( @wits, @otherwits );
 324                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 325                         }
 326                 }
 327                 $grouped_readings->{$rdg->id} = \@wits;
 328         }
 329         $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
 330         # Get rid of our collapsed readings
 331         map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
 332                 keys %$grouped_readings
 333                 if $collapse;
 334
 335         # If something was transposed, check the groups for doubled-up readings
 336         if( $has_transposition ) {
 337                 # print STDERR "Group for rank $rank:\n";
 338                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 339                 #       keys %$grouped_readings;
 340                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 341         }
 342
 343         # Return the result
 344         return $grouped_readings;
 345 }
 346
 347 # Helper function to query the alignment table for all witnesses (a.c. included)
 348 # that have a given reading at its rank.
 349 sub _table_witnesses {
 350         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 351         my $tableidx = $trdg->rank - 1;
 352         my @has_reading;
 353         foreach my $row ( @{$table->{'alignment'}} ) {
 354                 my $wit = $row->{'witness'};
 355                 next if $lacunose->{$wit};
 356                 my $rdg = $row->{'tokens'}->[$tableidx];
 357                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 358                 _add_to_witlist( $wit, \@has_reading, $aclabel )
 359                         if $rdg->{'t'}->id eq $trdg->id;
 360         }
 361         return @has_reading;
 362 }
 363
 364 # Helper function to ensure that X and X a.c. never appear in the same list.
 365 sub _add_to_witlist {
 366         my( $wit, $list, $acstr ) = @_;
 367         my %inlist;
 368         my $idx = 0;
 369         map { $inlist{$_} = $idx++ } @$list;
 370         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 371                 my $acwit = $1;
 372                 unless( exists $inlist{$acwit} ) {
 373                         push( @$list, $acwit.$acstr );
 374                 }
 375         } else {
 376                 if( exists( $inlist{$wit.$acstr} ) ) {
 377                         # Replace the a.c. version with the main witness
 378                         my $i = $inlist{$wit.$acstr};
 379                         $list->[$i] = $wit;
 380                 } else {
 381                         push( @$list, $wit );
 382                 }
 383         }
 384 }
 385
 386 sub _check_transposed_consistency {
 387         my( $c, $rank, $transposed, $groupings ) = @_;
 388         my %seen_wits;
 389         my %thisrank;
 390         # Note which readings are actually at this rank, and which witnesses
 391         # belong to which reading.
 392         foreach my $rdg ( keys %$groupings ) {
 393                 my $rdgobj = $c->reading( $rdg );
 394                 # Count '(omitted)' as a reading at this rank
 395                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 396                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 397         }
 398         # Our work is done if we have no witness belonging to more than one
 399         # reading.
 400         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 401         return unless @doubled;
 402         # If we have a symmetric related transposition, drop the non-rank readings.
 403         if( @doubled == scalar keys %seen_wits ) {
 404                 foreach my $rdg ( keys %$groupings ) {
 405                         if( !$thisrank{$rdg} ) {
 406                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 407                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 408                                         keys %thisrank;
 409                                 delete $groupings->{$rdg};
 410                                 # If we found a group match, assume there is a symmetry happening.
 411                                 # TODO think more about this
 412                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 413                                 unless( $matched ) {
 414                                         delete $transposed->{$rdg};
 415                                         warn "Found problem in evident symmetry with reading $rdg";
 416                                 }
 417                         }
 418                 }
 419         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 420         } else {
 421                 foreach my $dup ( @doubled ) {
 422                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 423                                 next if $thisrank{$rdg};
 424                                 next unless exists $groupings->{$rdg};
 425                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 426                                 delete $groupings->{$rdg};
 427                                 delete $transposed->{$rdg};
 428                         }
 429                 }
 430                 # and put any now-orphaned readings into an 'omitted' reading.
 431                 foreach my $wit ( keys %seen_wits ) {
 432                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 433                                 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
 434                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 435                         }
 436                 }
 437         }
 438 }
 439
 440 =head2 solve_variants( $graph, @groups )
 441
 442 Sends the set of groups to the external graph solver service and returns
 443 a cleaned-up answer, adding the rank IDs back where they belong.
 444
 445 The JSON has the form
 446   { "graph": [ stemmagraph DOT string without newlines ],
 447     "groupings": [ array of arrays of groups, one per rank ] }
 448
 449 The answer has the form
 450   { "variants" => [ array of variant location structures ],
 451     "variant_count" => total,
 452     "conflict_count" => number of conflicts detected,
 453     "genealogical_count" => number of solutions found }
 454
 455 =cut
 456
 457 sub solve_variants {
 458         my( $stemma, @groups ) = @_;
 459
 460         # Filter the groups down to distinct groups, and work out what graph
 461         # should be used in the calculation of each group. We want to send each
 462         # distinct problem to the solver only once.
 463         # We need a whole bunch of lookup tables for this.
 464         my( $index_groupkeys, $group_indices, $graph_problems ) = _prepare_groups( @_ );
 465
 466         ## For each distinct graph, send its groups to the solver.
 467         my $ua = LWP::UserAgent->new();
 468         ## Witness map is a HACK to get around limitations in node names from IDP
 469         my $witness_map = {};
 470         ## Variables to store answers as they come back
 471         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 472         my $genealogical = 0;
 473         foreach my $graphkey ( keys %$graph_problems ) {
 474                 my $graph = $graph_problems->{$graphkey}->{'object'};
 475                 my $groupings = [ values %{$graph_problems->{$graphkey}->{'groups'}} ];
 476                 my $req = _safe_wit_strings( $graph, $stemma->collation,
 477                         $groupings, $witness_map );
 478                 $req->{'command'} = 'findGroupings';
 479                 my $json = encode_json( $req );
 480                 # Send it off and get the result
 481                 # print STDERR "Sending request: " . to_json( $req ) . "\n";
 482                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 483                                                           'Content' => $json );
 484                 my $answer;
 485                 if( $resp->is_success ) {
 486                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 487                 } else {
 488                         # Fall back to the old method.
 489                         die "IDP solver returned " . $resp->status_line . " / " . $resp->content
 490                                 . "; cannot run graph analysis";
 491                 }
 492
 493                 ## If IDP worked, asked it the other two questions for this dataset.
 494                 my $more_eval = {};
 495                 foreach my $test ( qw/ findSources findClasses / ) {
 496                         $req->{'command'} = $test;
 497                         $json = encode_json( $req );
 498                         $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 499                                                            'Content' => $json );
 500                         if( $resp->is_success ) {
 501                                 $more_eval->{$test} = _desanitize_names(
 502                                         decode_json( $resp->content ), $witness_map );
 503                         } else {
 504                                 warn "IDP solver for $test returned " . $resp->status_line .
 505                                         " / " . $resp->content;
 506                                 # TODO arrange fallback
 507                         }
 508                 }
 509
 510                 ## The answer is the evaluated groupings, plus a boolean for whether
 511                 ## they were genealogical.  Reconstruct our original groups.
 512                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 513                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 514                         # Keep track of the total # of genealogical readings
 515                         $genealogical++ if $result;
 516
 517                         my( $sources, $classes );
 518                         # Use the expanded groups from findSources if that got calculated.
 519                         if( exists( $more_eval->{'findSources'} ) ) {
 520                                 ( $calc_groups, $sources ) = @{$more_eval->{'findSources'}->[$gidx]};
 521                         }
 522                         # Use the (same) expanded groups from findClasses if that got calculated
 523                         # and is relevant.
 524                         if( exists( $more_eval->{'findClasses'} ) && !$result ) {
 525                                 my $throwaway_groups;
 526                                 ( $throwaway_groups, $classes ) = @{$more_eval->{'findClasses'}->[$gidx]};
 527                         }
 528
 529                         # Convert the source list into a lookup hash
 530                         my $roots = {};
 531                         map { $roots->{$_} = 1 } @$sources;
 532                         # Convert the class list into a lookup hash
 533                         if( $classes ) {
 534                                 $classes = _invert_hash( $classes );
 535                         }
 536
 537                         # Retrieve the key for the original group that went to the solver
 538                         my $input_group = wit_stringify( $groupings->[$gidx] );
 539
 540                         # Make the variant hash for each location that had this particular
 541                         # grouping on this particular stemma situation
 542                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 543                                 my @readings = @{$index_groupkeys->{$oidx}};
 544                                 my $vstruct = {
 545                                         'genealogical' => $result,
 546                                         'readings' => [],
 547                                 };
 548                                 foreach my $ridx ( 0 .. $#readings ) {
 549                                         push( @{$vstruct->{'readings'}},
 550                                                 { 'readingid' => $readings[$ridx],
 551                                                   'group' => $calc_groups->[$ridx] } );
 552                                 }
 553                                 $vstruct->{'reading_roots'} = $roots if $roots;
 554                                 $vstruct->{'reading_types'} = $classes if $classes;
 555                                 $variants->[$oidx] = $vstruct;
 556                         }
 557                 }
 558         }
 559
 560         return { 'variants' => $variants,
 561                          'variant_count' => scalar @$variants,
 562                          'genealogical_count' => $genealogical };
 563 }
 564
 565 sub _prepare_groups {
 566         my( $stemma, @groups ) = @_;
 567         my $aclabel = $stemma->collation->ac_label;
 568
 569         my $index_groupkeys = {};       # Save the order of readings
 570         my $group_indices = {};         # Save the indices that have a given grouping
 571         my $graph_problems = {};        # Save the groupings for the given graph
 572
 573         foreach my $idx ( 0..$#groups ) {
 574                 my $ghash = $groups[$idx];
 575                 my @grouping;
 576                 # Sort the groupings from big to little, and scan for a.c. witnesses
 577                 # that would need an extended graph.
 578                 my @acwits;   # note which AC witnesses crop up at this rank
 579                 my $extant;   # note which witnesses crop up at this rank full stop
 580                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 581                         keys %$ghash;
 582                 foreach my $rdg ( @idxkeys ) {
 583                         my @sg = sort @{$ghash->{$rdg}};
 584                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 585                         map { $extant->{$_} = 1 } @sg;
 586                         push( @grouping, \@sg );
 587                 }
 588                 # Save the reading order
 589                 $index_groupkeys->{$idx} = \@idxkeys;
 590
 591                 # Now associate the distinct group with this index
 592                 my $gstr = wit_stringify( \@grouping );
 593                 push( @{$group_indices->{$gstr}}, $idx );
 594
 595                 # Finally, add the group to the list to be calculated for this graph.
 596                 map { s/\Q$aclabel\E$// } @acwits;
 597                 my $graph;
 598                 ## TODO When we get rid of the safe_wit_strings HACK we should also
 599                 ## be able to save the graph here as a dotstring rather than as an
 600                 ## object, thus simplifying life enormously.
 601                 try {
 602                         $graph = $stemma->situation_graph( $extant, \@acwits );
 603                 } catch {
 604                         die "Unable to extend graph with @acwits";
 605                 }
 606                 my $graphkey = "$graph || " . wit_stringify( [ sort keys %$extant ] );
 607                 unless( exists $graph_problems->{$graphkey} ) {
 608                         $graph_problems->{$graphkey} = { 'object' => $graph, 'groups' => {} };
 609                 }
 610                 $graph_problems->{$graphkey}->{'groups'}->{wit_stringify( \@grouping )} = \@grouping;
 611         }
 612         say STDERR "Created " . scalar( keys %$graph_problems ). " distinct graph(s)";
 613         return( $index_groupkeys, $group_indices, $graph_problems );
 614 }
 615
 616 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 617
 618 sub _safe_wit_strings {
 619         my( $graph, $c, $groupings, $witness_map ) = @_;
 620         # Convert the graph to a safe representation and store the conversion.
 621         my $safegraph = Graph->new();
 622         foreach my $n ( $graph->vertices ) {
 623                 my $sn = _safe_witstr( $n );
 624                 if( exists $witness_map->{$sn} ) {
 625                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 626                                 if $witness_map->{$sn} ne $n;
 627                 } else {
 628                         $witness_map->{$sn} = $n;
 629                 }
 630                 $safegraph->add_vertex( $sn );
 631                 $safegraph->set_vertex_attributes( $sn,
 632                         $graph->get_vertex_attributes( $n ) );
 633         }
 634         foreach my $e ( $graph->edges ) {
 635                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 636                 $safegraph->add_edge( @safe_e );
 637         }
 638
 639         # Now convert the witness groupings to a safe representation.
 640         my $safe_groupings = [];
 641         foreach my $grouping ( @$groupings ) {
 642                 my $safe_grouping = [];
 643                 foreach my $group ( @$grouping ) {
 644                         my $safe_group = [];
 645                         foreach my $n ( @$group ) {
 646                                 my $sn = _safe_witstr( $n );
 647                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 648                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 649                                 $witness_map->{$sn} = $n;
 650                                 push( @$safe_group, $sn );
 651                         }
 652                         push( @$safe_grouping, $safe_group );
 653                 }
 654                 push( @$safe_groupings, $safe_grouping );
 655         }
 656
 657         # Return it all in the struct we expect.  We have stored the reductions
 658         # in the $witness_map that we were passed.
 659         return { 'graph' => Text::Tradition::Stemma::editable_graph(
 660                                 $safegraph, { 'linesep' => ' ' } ),
 661                          'groupings' => $safe_groupings };
 662 }
 663
 664 sub _safe_witstr {
 665         my $witstr = shift;
 666         $witstr =~ s/\s+/_/g;
 667         $witstr =~ s/[^\w\d-]//g;
 668         return $witstr;
 669 }
 670
 671 sub _desanitize_names {
 672         my( $element, $witness_map ) = @_;
 673         my $result = [];
 674         if( ref( $element ) eq 'ARRAY' ) {
 675                 foreach my $n ( @$element ) {
 676                         push( @$result, _desanitize_names( $n, $witness_map ) );
 677                 }
 678         } elsif( ref( $element ) eq 'HASH' ) {
 679                 my $real_hash = {};
 680                 map { $real_hash->{$_} = _desanitize_names( $element->{$_}, $witness_map ) }
 681                         keys %$element;
 682                 $result = $real_hash;
 683         } elsif( exists $witness_map->{$element} ) {
 684                 $result = $witness_map->{$element}
 685         } else {
 686                 $result = $element;
 687         }
 688         return $result;
 689 }
 690
 691 sub _invert_hash {
 692         my( $hash ) = @_;
 693         my $newhash;
 694         foreach my $k ( keys %$hash ) {
 695                 if( ref( $hash->{$k} ) eq 'ARRAY' ) {
 696                         foreach my $v ( @{$hash->{$k}} ) {
 697                                 $newhash->{$v} = $k;
 698                         }
 699                 } else {
 700                         $newhash->{$hash->{$k}} = $k;
 701                 }
 702         }
 703         return $newhash;
 704 }
 705
 706 ### END HACKERY ###
 707
 708 =head2 analyze_location ( $tradition, $graph, $location_hash )
 709
 710 Given the tradition, its stemma graph, and the solution from the graph solver,
 711 work out the rest of the information we want.  For each reading we need missing,
 712 conflict, reading_parents, independent_occurrence, followed, not_followed,
 713 and follow_unknown.  Alters the location_hash in place.
 714
 715 =cut
 716
 717 sub analyze_location {
 718         my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
 719         my $c = $tradition->collation;
 720
 721         # Make a hash of all known node memberships, and make the subgraphs.
 722         my $contig = {};
 723         my $reading_roots = {};
 724         my $subgraph = {};
 725         my $acstr = $c->ac_label;
 726         my @acwits;
 727
 728         my $NO_IDP;
 729         if( exists $variant_row->{'reading_roots'} ) {
 730                 $reading_roots = delete $variant_row->{'reading_roots'};
 731         } else {
 732                 warn "No reading source information from IDP - proceed at your own risk";
 733                 $NO_IDP = 1;
 734         }
 735
 736         # Note which witnesses positively belong to which group. This information
 737         # comes ultimately from the IDP solver.
 738         # Also make a note of the reading's roots.
 739     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 740         my $rid = $rdghash->{'readingid'};
 741         my @roots;
 742         foreach my $wit ( @{$rdghash->{'group'}} ) {
 743                 $contig->{$wit} = $rid;
 744             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 745                 push( @acwits, $1 );
 746             }
 747             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 748                 push( @roots, $wit );
 749             }
 750         }
 751                 $rdghash->{'independent_occurrence'} = \@roots;
 752         }
 753
 754         # Get the actual graph we should work with
 755         my $graph;
 756         try {
 757                 # contig contains all extant wits and all hypothetical wits
 758                 # needed to make up the groups.
 759                 $graph = $stemma->situation_graph( $contig, \@acwits );
 760         } catch ( Text::Tradition::Error $e ) {
 761                 die "Could not extend graph with given extant and a.c. witnesses: "
 762                         . $e->message;
 763         } catch {
 764                 die "Could not extend graph with a.c. witnesses @acwits";
 765         }
 766
 767
 768         # Now that we have all the node group memberships, calculate followed/
 769     # non-followed/unknown values for each reading.  Also figure out the
 770     # reading's evident parent(s).
 771     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 772         my $rid = $rdghash->{'readingid'};
 773         my $rdg = $c->reading( $rid );
 774         my @roots = @{$rdghash->{'independent_occurrence'}};
 775         my @group = @{$rdghash->{'group'}};
 776
 777         # Start figuring things out.
 778         $rdghash->{'followed'} = scalar( @group ) - scalar( @roots );
 779         # Find the parent readings, if any, of this reading.
 780         my $rdgparents = {};
 781         foreach my $wit ( @roots ) {
 782                 # Look in the stemma graph to find this witness's extant or known-reading
 783                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 784                         my @check = $graph->predecessors( $wit );
 785                         while( @check ) {
 786                                 my @next;
 787                                 foreach my $wparent( @check ) {
 788                                         my $preading = $contig->{$wparent};
 789                                         if( $preading && $preading ne $rid ) {
 790                                                 $rdgparents->{$preading} = 1;
 791                                         } else {
 792                                                 push( @next, $graph->predecessors( $wparent ) );
 793                                         }
 794                                 }
 795                                 @check = @next;
 796                         }
 797                 }
 798                 foreach my $p ( keys %$rdgparents ) {
 799                         # Resolve the relationship of the parent to the reading, and
 800                         # save it in our hash.
 801                         my $pobj = $c->reading( $p );
 802                         my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 803                         my $phash = { 'label' => $prep };
 804                         if( $pobj ) {
 805                                 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 806                                 if( $rel ) {
 807                                         _add_to_hash( $rel, $phash );
 808                                 } elsif( $rdg ) {
 809                                         # First check for a transposed relationship
 810                                         if( $rdg->rank != $pobj->rank ) {
 811                                                 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 812                                                         next unless $ti->text eq $rdg->text;
 813                                                         $rel = $c->get_relationship( $ti, $pobj );
 814                                                         if( $rel ) {
 815                                                                 _add_to_hash( $rel, $phash, 1 );
 816                                                                 last;
 817                                                         }
 818                                                 }
 819                                                 unless( $rel ) {
 820                                                         foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 821                                                                 next unless $ti->text eq $pobj->text;
 822                                                                 $rel = $c->get_relationship( $ti, $rdg );
 823                                                                 if( $rel ) {
 824                                                                         _add_to_hash( $rel, $phash, 1 );
 825                                                                         last;
 826                                                                 }
 827                                                         }
 828                                                 }
 829                                         }
 830                                         unless( $rel ) {
 831                                                 # and then check for sheer word similarity.
 832                                                 my $rtext = $rdg->text;
 833                                                 my $ptext = $pobj->text;
 834                                                 if( similar( $rtext, $ptext ) ) {
 835                                                         # say STDERR "Words $rtext and $ptext judged similar";
 836                                                         $phash->{relation} = { type => 'wordsimilar' };
 837                                                 }
 838                                         }
 839                                 } else {
 840                                         $phash->{relation} = { type => 'deletion' };
 841                                 }
 842                                 # Get the attributes of the parent object while we are here
 843                                 $phash->{'text'} = $pobj->text if $pobj;
 844                                 $phash->{'is_nonsense'} = $pobj->is_nonsense;
 845                                 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 846                         } elsif( $p eq '(omitted)' ) {
 847                                 $phash->{relation} = { type => 'addition' };
 848                         }
 849                         # Save it
 850                         $rdgparents->{$p} = $phash;
 851                 }
 852
 853                 $rdghash->{'reading_parents'} = $rdgparents;
 854
 855                 # Find the number of times this reading was altered, and the number of
 856                 # times we're not sure.
 857                 my( %nofollow, %unknownfollow );
 858                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 859                         foreach my $wchild ( $graph->successors( $wit ) ) {
 860                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 861                                         && $contig->{$wchild} ne $rid ) {
 862                                         # It definitely changed here.
 863                                         $nofollow{$wchild} = 1;
 864                                 } elsif( !($contig->{$wchild}) ) {
 865                                         # The child is a hypothetical node not definitely in
 866                                         # any group. Answer is unknown.
 867                                         $unknownfollow{$wchild} = 1;
 868                                 } # else it is either in our group, or it is a non-root node in a
 869                                   # known group and therefore is presumed to have its reading from
 870                                   # its group, not this link.
 871                         }
 872                 }
 873                 $rdghash->{'not_followed'} = keys %nofollow;
 874                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 875
 876                 # Now say whether this reading represents a conflict.
 877                 unless( $variant_row->{'genealogical'} ) {
 878                         my @trueroots;
 879                         if( exists $variant_row->{'reading_types'} ) {
 880                                 my $classinfo = delete $variant_row->{'reading_types'};
 881                                 # We have tested for reversions. Use the information.
 882                                 my @reversions;
 883                                 foreach my $rdgroot ( @roots ) {
 884                                         if( $classinfo->{$rdgroot} eq 'revert' ) {
 885                                                 push( @reversions, $rdgroot );
 886                                         } else {
 887                                                 push( @trueroots, $rdgroot );
 888                                         }
 889                                 }
 890                                 $rdghash->{'independent_occurrence'} = \@trueroots;
 891                                 $rdghash->{'reversion'} = \@reversions if @reversions;
 892                         } else {
 893                                 @trueroots = @roots;
 894                         }
 895                         $rdghash->{'conflict'} = @trueroots != 1;
 896                 }
 897     }
 898 }
 899
 900 sub _add_to_hash {
 901         my( $rel, $phash, $is_transposed ) = @_;
 902         $phash->{relation} = { type => $rel->type };
 903         $phash->{relation}->{transposed} = 1 if $is_transposed;
 904         $phash->{relation}->{annotation} = $rel->annotation
 905                 if $rel->has_annotation;
 906 }
 907
 908 =head2 similar( $word1, $word2 )
 909
 910 Use Algorithm::Diff to get a sense of how close the words are to each other.
 911 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 912
 913 =cut
 914
 915 #!/usr/bin/env perl
 916
 917 sub similar {
 918         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 919         my @let1 = split( '', lc( $word1 ) );
 920         my @let2 = split( '', lc( $word2 ) );
 921         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 922         my $mag = 0;
 923         while( $diff->Next ) {
 924                 if( $diff->Same ) {
 925                         # Take off points for longer strings
 926                         my $cs = $diff->Range(1) - 2;
 927                         $cs = 0 if $cs < 0;
 928                         $mag -= $cs;
 929                 } elsif( !$diff->Items(1) ) {
 930                         $mag += $diff->Range(2);
 931                 } elsif( !$diff->Items(2) ) {
 932                         $mag += $diff->Range(1);
 933                 } else {
 934                         # Split the difference for substitutions
 935                         my $c1 = $diff->Range(1) || 1;
 936                         my $c2 = $diff->Range(2) || 1;
 937                         my $cd = ( $c1 + $c2 ) / 2;
 938                         $mag += $cd;
 939                 }
 940         }
 941         return ( $mag <= length( $word1 ) / 2 );
 942 }
 943
 944 sub _prune_group {
 945         my( $group, $graph ) = @_;
 946         my $relevant = {};
 947         # Record the existence of the vertices in the group
 948         map { $relevant->{$_} = 1 } @$group;
 949         # Make our subgraph
 950         my $subgraph = $graph->deep_copy;
 951         map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
 952                 $subgraph->vertices;
 953         # Now prune and return the remaining vertices.
 954         _prune_subtree( $subgraph );
 955         # Return the list of vertices and the list of roots.
 956         my $pruned_group = [ sort $subgraph->vertices ];
 957         my $pruned_roots = [ $subgraph->predecessorless_vertices ];
 958         return( $pruned_group, $pruned_roots );
 959 }
 960
 961 sub _prune_subtree {
 962         my( $tree ) = @_;
 963
 964         # Delete lacunose witnesses that have no successors
 965         my @orphan_hypotheticals;
 966         my $ctr = 0;
 967         do {
 968                 die "Infinite loop on leaves" if $ctr > 100;
 969                 @orphan_hypotheticals =
 970                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
 971                                 $tree->successorless_vertices;
 972                 $tree->delete_vertices( @orphan_hypotheticals );
 973                 $ctr++;
 974         } while( @orphan_hypotheticals );
 975
 976         # Delete lacunose roots that have a single successor
 977         my @redundant_root;
 978         $ctr = 0;
 979         do {
 980                 die "Infinite loop on roots" if $ctr > 100;
 981                 @redundant_root =
 982                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
 983                                    && $tree->successors( $_ ) == 1 }
 984                                 $tree->predecessorless_vertices;
 985                 $tree->delete_vertices( @redundant_root );
 986                 $ctr++;
 987         } while( @redundant_root );
 988 }
 989
 990 sub _useful_variant {
 991         my( $group_readings, $graph, $acstr ) = @_;
 992
 993         # TODO Decide what to do with AC witnesses
 994
 995         # Sort by group size and return
 996         my $is_useful = 0;
 997         my( @readings, @groups );   # The sorted groups for our answer.
 998         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
 999                 keys %$group_readings ) {
1000                 push( @readings, $rdg );
1001                 push( @groups, $group_readings->{$rdg} );
1002                 if( @{$group_readings->{$rdg}} > 1 ) {
1003                         $is_useful++;
1004                 } else {
1005                         my( $wit ) = @{$group_readings->{$rdg}};
1006                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1007                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1008                 }
1009         }
1010         if( $is_useful > 1 ) {
1011                 return( \@readings, \@groups );
1012         } else {
1013                 return( [], [] );
1014         }
1015 }
1016
1017 =head2 wit_stringify( $groups )
1018
1019 Takes an array of witness groupings and produces a string like
1020 ['A','B'] / ['C','D','E'] / ['F']
1021
1022 =cut
1023
1024 sub wit_stringify {
1025     my $groups = shift;
1026     my @gst;
1027     # If we were passed an array of witnesses instead of an array of
1028     # groupings, then "group" the witnesses first.
1029     unless( ref( $groups->[0] ) ) {
1030         my $mkgrp = [ $groups ];
1031         $groups = $mkgrp;
1032     }
1033     foreach my $g ( @$groups ) {
1034         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1035     }
1036     return join( ' / ', @gst );
1037 }
1038
1039 sub _symmdiff {
1040         my( $lista, $listb ) = @_;
1041         my %union;
1042         my %scalars;
1043         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1044         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1045         my @set = grep { $union{$_} == 1 } keys %union;
1046         return map { $scalars{$_} } @set;
1047 }
1048
1049 1;
1050
1051 =head1 LICENSE
1052
1053 This package is free software and is provided "as is" without express
1054 or implied warranty.  You can redistribute it and/or modify it under
1055 the same terms as Perl itself.
1056
1057 =head1 AUTHOR
1058
1059 Tara L Andrews E<lt>aurum@cpan.orgE<gt>