lib/Text/Tradition/Analysis.pm

   1 package Text::Tradition::Analysis;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;  # for word similarity measure
   6 use Benchmark;
   7 use Encode qw/ encode_utf8 /;
   8 use Exporter 'import';
   9 use Graph;
  10 use JSON qw/ encode_json decode_json /;
  11 use LWP::UserAgent;
  12 use Text::Tradition;
  13 use Text::Tradition::Stemma;
  14 use TryCatch;
  15
  16 use vars qw/ @EXPORT_OK /;
  17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
  18
  19 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
  20
  21
  22 =head1 NAME
  23
  24 Text::Tradition::Analysis - functions for stemma analysis of a tradition
  25
  26 =head1 SYNOPSIS
  27
  28   use Text::Tradition;
  29   use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  30   my $t = Text::Tradition->new(
  31     'name' => 'this is a text',
  32     'input' => 'TEI',
  33     'file' => '/path/to/tei_parallel_seg_file.xml' );
  34   $t->add_stemma( 'dotfile' => $stemmafile );
  35
  36   my $variant_data = run_analysis( $tradition );
  37   # Recalculate rank $n treating all orthographic variants as equivalent
  38   my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
  39
  40 =head1 DESCRIPTION
  41
  42 Text::Tradition is a library for representation and analysis of collated
  43 texts, particularly medieval ones.  The Collation is the central feature of
  44 a Tradition, where the text, its sequence of readings, and its relationships
  45 between readings are actually kept.
  46
  47 =head1 SUBROUTINES
  48
  49 =head2 run_analysis( $tradition, %opts )
  50
  51 Runs the analysis described in analyze_variant_location on every location in the
  52 collation of the given tradition, with the given options. These include:
  53
  54 =over 4
  55
  56 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
  57 is 0 (i.e. the first).
  58
  59 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
  60
  61 =item * merge_types - Specify a list of relationship types, where related readings
  62 should be treated as identical for the purposes of analysis.
  63
  64 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
  65
  66 =back
  67
  68 =begin testing
  69
  70 use Text::Tradition;
  71 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
  72
  73 my $datafile = 't/data/florilegium_tei_ps.xml';
  74 my $tradition = Text::Tradition->new( 'input' => 'TEI',
  75                                       'name' => 'test0',
  76                                       'file' => $datafile );
  77 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
  78 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
  79
  80 my %expected_genealogical = (
  81         1 => 0,
  82         2 => 1,
  83         3 =>  0,
  84         5 =>  0,
  85         7 =>  0,
  86         8 =>  0,
  87         10 => 0,
  88         13 => 1,
  89         33 => 0,
  90         34 => 0,
  91         37 => 0,
  92         60 => 0,
  93         81 => 1,
  94         84 => 0,
  95         87 => 0,
  96         101 => 0,
  97         102 => 0,
  98         122 => 1,
  99         157 => 0,
 100         166 => 1,
 101         169 => 1,
 102         200 => 0,
 103         216 => 1,
 104         217 => 1,
 105         219 => 1,
 106         241 => 1,
 107         242 => 1,
 108         243 => 1,
 109 );
 110
 111 my $data = run_analysis( $tradition );
 112 my $c = $tradition->collation;
 113 foreach my $row ( @{$data->{'variants'}} ) {
 114         # Account for rows that used to be "not useful"
 115         unless( exists $expected_genealogical{$row->{'id'}} ) {
 116                 $expected_genealogical{$row->{'id'}} = 1;
 117         }
 118         my $gen_bool = $row->{'genealogical'} ? 1 : 0;
 119         is( $gen_bool, $expected_genealogical{$row->{'id'}},
 120                 "Got correct genealogical flag for row " . $row->{'id'} );
 121         # Check that we have the right row with the right groups
 122         my $rank = $row->{'id'};
 123         foreach my $rdghash ( @{$row->{'readings'}} ) {
 124                 # Skip 'readings' that aren't really
 125                 next unless $c->reading( $rdghash->{'readingid'} );
 126                 # Check the rank
 127                 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
 128                         "Got correct reading rank" );
 129                 # Check the witnesses
 130                 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
 131                 my @sgrp = sort @{$rdghash->{'group'}};
 132                 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
 133         }
 134 }
 135 is( $data->{'variant_count'}, 58, "Got right total variant number" );
 136 # TODO Make something meaningful of conflict count, maybe test other bits
 137
 138 =end testing
 139
 140 =cut
 141
 142 sub run_analysis {
 143         my( $tradition, %opts ) = @_;
 144         my $c = $tradition->collation;
 145
 146         my $stemma_id = $opts{'stemma_id'} || 0;
 147         my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
 148         my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 149
 150         # Get the stemma
 151         my $stemma = $tradition->stemma( $stemma_id );
 152
 153         # Figure out which witnesses we are working with - that is, the ones that
 154         # appear both in the stemma and in the tradition. All others are 'lacunose'
 155         # for our purposes.
 156         my @lacunose = $stemma->hypotheticals;
 157         my @tradition_wits = map { $_->sigil } $tradition->witnesses;
 158         push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
 159
 160         # Find and mark 'common' ranks for exclusion, unless they were
 161         # explicitly specified.
 162         unless( @ranks ) {
 163                 my %common_rank;
 164                 foreach my $rdg ( $c->common_readings ) {
 165                         $common_rank{$rdg->rank} = 1;
 166                 }
 167                 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
 168         }
 169
 170         # Group the variants to send to the solver
 171         my @groups;
 172         my @use_ranks;
 173         my %lacunae;
 174         my $moved = {};
 175         foreach my $rank ( @ranks ) {
 176                 my $missing = [ @lacunose ];
 177                 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
 178                 # Filter out any empty rankgroups
 179                 # (e.g. from the later rank for a transposition)
 180                 next unless keys %$rankgroup;
 181                 if( $opts{'exclude_type1'} ) {
 182                         # Check to see whether this is a "useful" group.
 183                         my( $rdgs, $grps ) = _useful_variant( $rankgroup,
 184                                 $stemma->graph, $c->ac_label );
 185                         next unless @$rdgs;
 186                 }
 187                 push( @use_ranks, $rank );
 188                 push( @groups, $rankgroup );
 189                 $lacunae{$rank} = $missing;
 190         }
 191         # Run the solver
 192         my $answer = solve_variants( $stemma, @groups );
 193
 194         # Do further analysis on the answer
 195         my $conflict_count = 0;
 196         my $aclabel = $c->ac_label;
 197         foreach my $idx ( 0 .. $#use_ranks ) {
 198                 my $location = $answer->{'variants'}->[$idx];
 199                 # Add the rank back in
 200                 my $rank = $use_ranks[$idx];
 201                 $location->{'id'} = $rank;
 202                 # Note what our lacunae are
 203                 my %lmiss;
 204                 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
 205                 $location->{'missing'} = [ keys %lmiss ];
 206
 207                 # Run the extra analysis we need.
 208                 ## TODO We run through all the variants in this call, so
 209                 ## why not add the reading data there instead of here below?
 210                 analyze_location( $tradition, $stemma, $location, \%lmiss );
 211
 212                 my @layerwits;
 213                 # Do the final post-analysis tidying up of the data.
 214                 foreach my $rdghash ( @{$location->{'readings'}} ) {
 215                         $conflict_count++
 216                                 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
 217                         # Add the reading text back in, setting display value as needed
 218                         my $rdg = $c->reading( $rdghash->{'readingid'} );
 219                         if( $rdg ) {
 220                                 $rdghash->{'text'} = $rdg->text .
 221                                         ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
 222                                 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
 223                                 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 224                         }
 225                         # Remove lacunose witnesses from this reading's list now that the
 226                         # analysis is done
 227                         my @realgroup;
 228                         map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
 229                         $rdghash->{'group'} = \@realgroup;
 230                         # Note any layered witnesses that appear in this group
 231                         foreach( @realgroup ) {
 232                                 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
 233                                         push( @layerwits, $1 );
 234                                 }
 235                         }
 236                 }
 237                 $location->{'layerwits'} = \@layerwits if @layerwits;
 238         }
 239         $answer->{'conflict_count'} = $conflict_count;
 240
 241         return $answer;
 242 }
 243
 244 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
 245
 246 Groups the variants at the given $rank of the collation, treating any
 247 relationships in @merge_relationship_types as equivalent.  $lacunose should
 248 be a reference to an array, to which the sigla of lacunose witnesses at this
 249 rank will be appended; $transposed should be a reference to a hash, wherein
 250 the identities of transposed readings and their relatives will be stored.
 251
 252 Returns a hash $group_readings where $rdg is attested by the witnesses listed
 253 in $group_readings->{$rdg}.
 254
 255 =cut
 256
 257 # Return group_readings, groups, lacunose
 258 sub group_variants {
 259         my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
 260         my $c = $tradition->collation;
 261         my $aclabel = $c->ac_label;
 262         my $table = $c->alignment_table;
 263         # Get the alignment table readings
 264         my %readings_at_rank;
 265         my %is_lacunose; # lookup table for witnesses not in stemma
 266         map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
 267         my @check_for_gaps;
 268         my %moved_wits;
 269         my $has_transposition;
 270         foreach my $tablewit ( @{$table->{'alignment'}} ) {
 271                 my $rdg = $tablewit->{'tokens'}->[$rank-1];
 272                 my $wit = $tablewit->{'witness'};
 273                 # Exclude the witness if it is "lacunose" which if we got here
 274                 # means "not in the stemma".
 275                 next if $is_lacunose{$wit};
 276                 # Note if the witness is actually in a lacuna
 277                 if( $rdg && $rdg->{'t'}->is_lacuna ) {
 278                         _add_to_witlist( $wit, $lacunose, $aclabel );
 279                 # Otherwise the witness either has a positive reading...
 280                 } elsif( $rdg ) {
 281                         # If the reading has been counted elsewhere as a transposition, ignore it.
 282                         if( $transposed->{$rdg->{'t'}->id} ) {
 283                                 # TODO Does this cope with three-way transpositions?
 284                                 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
 285                                 next;
 286                         }
 287                         # Otherwise, record it...
 288                         $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
 289                         # ...and grab any transpositions, and their relations.
 290                         my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
 291                         foreach my $trdg ( @transp ) {
 292                                 next if exists $readings_at_rank{$trdg->id};
 293                                 $has_transposition = 1;
 294                                 my @affected_wits = _table_witnesses(
 295                                         $table, $trdg, \%is_lacunose, $aclabel );
 296                                 next unless @affected_wits;
 297                                 map { $moved_wits{$_} = 1 } @affected_wits;
 298                                 $transposed->{$trdg->id} =
 299                                         [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
 300                                 $readings_at_rank{$trdg->id} = $trdg;
 301                         }
 302                 # ...or it is empty, ergo a gap.
 303                 } else {
 304                         _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
 305                 }
 306         }
 307         my @gap_wits;
 308         map { _add_to_witlist( $_, \@gap_wits, $aclabel )
 309                 unless $moved_wits{$_} } @check_for_gaps;
 310         # Group the readings, collapsing groups by relationship if needed
 311         my $grouped_readings = {};
 312         foreach my $rdg ( values %readings_at_rank ) {
 313                 # Skip readings that have been collapsed into others.
 314                 next if exists $grouped_readings->{$rdg->id}
 315                         && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
 316                 # Get the witness list, including from readings collapsed into this one.
 317                 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
 318                 if( $collapse && @$collapse ) {
 319                         my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
 320                         foreach my $other ( $rdg->related_readings( $filter ) ) {
 321                                 my @otherwits = _table_witnesses(
 322                                         $table, $other, \%is_lacunose, $aclabel );
 323                                 push( @wits, @otherwits );
 324                                 $grouped_readings->{$other->id} = 'COLLAPSE';
 325                         }
 326                 }
 327                 $grouped_readings->{$rdg->id} = \@wits;
 328         }
 329         $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
 330         # Get rid of our collapsed readings
 331         map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
 332                 keys %$grouped_readings
 333                 if $collapse;
 334
 335         # If something was transposed, check the groups for doubled-up readings
 336         if( $has_transposition ) {
 337                 # print STDERR "Group for rank $rank:\n";
 338                 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
 339                 #       keys %$grouped_readings;
 340                 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 341         }
 342
 343         # Return the result
 344         return $grouped_readings;
 345 }
 346
 347 # Helper function to query the alignment table for all witnesses (a.c. included)
 348 # that have a given reading at its rank.
 349 sub _table_witnesses {
 350         my( $table, $trdg, $lacunose, $aclabel ) = @_;
 351         my $tableidx = $trdg->rank - 1;
 352         my @has_reading;
 353         foreach my $row ( @{$table->{'alignment'}} ) {
 354                 my $wit = $row->{'witness'};
 355                 next if $lacunose->{$wit};
 356                 my $rdg = $row->{'tokens'}->[$tableidx];
 357                 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
 358                 _add_to_witlist( $wit, \@has_reading, $aclabel )
 359                         if $rdg->{'t'}->id eq $trdg->id;
 360         }
 361         return @has_reading;
 362 }
 363
 364 # Helper function to ensure that X and X a.c. never appear in the same list.
 365 sub _add_to_witlist {
 366         my( $wit, $list, $acstr ) = @_;
 367         my %inlist;
 368         my $idx = 0;
 369         map { $inlist{$_} = $idx++ } @$list;
 370         if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 371                 my $acwit = $1;
 372                 unless( exists $inlist{$acwit} ) {
 373                         push( @$list, $acwit.$acstr );
 374                 }
 375         } else {
 376                 if( exists( $inlist{$wit.$acstr} ) ) {
 377                         # Replace the a.c. version with the main witness
 378                         my $i = $inlist{$wit.$acstr};
 379                         $list->[$i] = $wit;
 380                 } else {
 381                         push( @$list, $wit );
 382                 }
 383         }
 384 }
 385
 386 sub _check_transposed_consistency {
 387         my( $c, $rank, $transposed, $groupings ) = @_;
 388         my %seen_wits;
 389         my %thisrank;
 390         # Note which readings are actually at this rank, and which witnesses
 391         # belong to which reading.
 392         foreach my $rdg ( keys %$groupings ) {
 393                 my $rdgobj = $c->reading( $rdg );
 394                 # Count '(omitted)' as a reading at this rank
 395                 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
 396                 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
 397         }
 398         # Our work is done if we have no witness belonging to more than one
 399         # reading.
 400         my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
 401         return unless @doubled;
 402         # If we have a symmetric related transposition, drop the non-rank readings.
 403         if( @doubled == scalar keys %seen_wits ) {
 404                 foreach my $rdg ( keys %$groupings ) {
 405                         if( !$thisrank{$rdg} ) {
 406                                 my $groupstr = wit_stringify( $groupings->{$rdg} );
 407                                 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
 408                                         keys %thisrank;
 409                                 delete $groupings->{$rdg};
 410                                 # If we found a group match, assume there is a symmetry happening.
 411                                 # TODO think more about this
 412                                 # print STDERR "*** Deleting symmetric reading $rdg\n";
 413                                 unless( $matched ) {
 414                                         delete $transposed->{$rdg};
 415                                         warn "Found problem in evident symmetry with reading $rdg";
 416                                 }
 417                         }
 418                 }
 419         # Otherwise 'unhook' the transposed reading(s) that have duplicates.
 420         } else {
 421                 foreach my $dup ( @doubled ) {
 422                         foreach my $rdg ( @{$seen_wits{$dup}} ) {
 423                                 next if $thisrank{$rdg};
 424                                 next unless exists $groupings->{$rdg};
 425                                 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 426                                 delete $groupings->{$rdg};
 427                                 delete $transposed->{$rdg};
 428                         }
 429                 }
 430                 # and put any now-orphaned readings into an 'omitted' reading.
 431                 foreach my $wit ( keys %seen_wits ) {
 432                         unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
 433                                 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
 434                                 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
 435                         }
 436                 }
 437         }
 438 }
 439
 440 =head2 solve_variants( $graph, @groups )
 441
 442 Sends the set of groups to the external graph solver service and returns
 443 a cleaned-up answer, adding the rank IDs back where they belong.
 444
 445 The JSON has the form
 446   { "graph": [ stemmagraph DOT string without newlines ],
 447     "groupings": [ array of arrays of groups, one per rank ] }
 448
 449 The answer has the form
 450   { "variants" => [ array of variant location structures ],
 451     "variant_count" => total,
 452     "conflict_count" => number of conflicts detected,
 453     "genealogical_count" => number of solutions found }
 454
 455 =cut
 456
 457 sub solve_variants {
 458         my( $stemma, @groups ) = @_;
 459
 460         # Filter the groups down to distinct groups, and work out what graph
 461         # should be used in the calculation of each group. We want to send each
 462         # distinct problem to the solver only once.
 463         # We need a whole bunch of lookup tables for this.
 464         my( $index_groupkeys, $group_indices, $graph_problems ) = _prepare_groups( @_ );
 465
 466         ## For each distinct graph, send its groups to the solver.
 467         my $ua = LWP::UserAgent->new();
 468         ## Witness map is a HACK to get around limitations in node names from IDP
 469         my $witness_map = {};
 470         ## Variables to store answers as they come back
 471         my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
 472         my $genealogical = 0;
 473         foreach my $graphkey ( keys %$graph_problems ) {
 474                 my $graph = $graph_problems->{$graphkey}->{'object'};
 475                 my $groupings = [ values %{$graph_problems->{$graphkey}->{'groups'}} ];
 476                 my $req = _safe_wit_strings( $graph, $stemma->collation,
 477                         $groupings, $witness_map );
 478                 $req->{'command'} = 'findGroupings';
 479                 my $json = encode_json( $req );
 480                 # Send it off and get the result
 481                 # print STDERR "Sending request: " . to_json( $req ) . "\n";
 482                 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 483                                                           'Content' => $json );
 484                 my $answer;
 485                 if( $resp->is_success ) {
 486                         $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
 487                 } else {
 488                         # Fall back to the old method.
 489                         die "IDP solver returned " . $resp->status_line . " / " . $resp->content
 490                                 . "; cannot run graph analysis";
 491                 }
 492
 493                 ## If IDP worked, asked it the other two questions for this dataset.
 494                 my $more_eval = {};
 495                 foreach my $test ( qw/ findSources findClasses / ) {
 496                         $req->{'command'} = $test;
 497                         $json = encode_json( $req );
 498                         $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
 499                                                            'Content' => $json );
 500                         if( $resp->is_success ) {
 501                                 $more_eval->{$test} = _desanitize_names(
 502                                         decode_json( $resp->content ), $witness_map );
 503                         } else {
 504                                 warn "IDP solver for $test returned " . $resp->status_line .
 505                                         " / " . $resp->content;
 506                                 # TODO arrange fallback
 507                         }
 508                 }
 509
 510                 ## The answer is the evaluated groupings, plus a boolean for whether
 511                 ## they were genealogical.  Reconstruct our original groups.
 512                 foreach my $gidx ( 0 .. $#{$groupings} ) {
 513                         my( $calc_groups, $result ) = @{$answer->[$gidx]};
 514                         # Keep track of the total # of genealogical readings
 515                         $genealogical++ if $result;
 516
 517                         my( $sources, $classes );
 518                         # Use the expanded groups from findSources if that got calculated.
 519                         if( exists( $more_eval->{'findSources'} ) ) {
 520                                 ( $calc_groups, $sources ) = @{$more_eval->{'findSources'}->[$gidx]};
 521                         }
 522                         # Use the (same) expanded groups from findClasses if that got calculated
 523                         # and is relevant.
 524                         if( exists( $more_eval->{'findClasses'} ) && !$result ) {
 525                                 ( $calc_groups, $classes ) = @{$more_eval->{'findClasses'}->[$gidx]};
 526                         }
 527
 528                         # Prune the calculated groups, in case the IDP solver failed to.
 529                         if( $sources || $result ) {
 530                                 my @pruned_groups;
 531                                 my @pruned_roots;
 532                                 foreach my $cg ( @$calc_groups ) {
 533                                         my( $pg, $pr ) = _prune_group( $cg, $graph );
 534                                         push( @pruned_groups, $pg );
 535                                         push( @pruned_roots, @$pr );
 536                                 }
 537                                 $calc_groups = \@pruned_groups;
 538                                 say STDERR "Pruned roots from @$sources to @pruned_roots"
 539                                         unless wit_stringify( [ sort @$sources ] )
 540                                                 eq wit_stringify( [ sort @pruned_roots ] );
 541                                 $sources = \@pruned_roots;
 542                         }
 543
 544                         # Convert the source list into a lookup hash
 545                         my $roots = {};
 546                         map { $roots->{$_} = 1 } @$sources;
 547                         # Convert the class list into a lookup hash
 548                         if( $classes ) {
 549                                 $classes = _invert_hash( $classes );
 550                         }
 551
 552                         # Retrieve the key for the original group that went to the solver
 553                         my $input_group = wit_stringify( $groupings->[$gidx] );
 554
 555                         # Make the variant hash for each location that had this particular
 556                         # grouping on this particular stemma situation
 557                         foreach my $oidx ( @{$group_indices->{$input_group}} ) {
 558                                 my @readings = @{$index_groupkeys->{$oidx}};
 559                                 my $vstruct = {
 560                                         'genealogical' => $result,
 561                                         'readings' => [],
 562                                 };
 563                                 foreach my $ridx ( 0 .. $#readings ) {
 564                                         push( @{$vstruct->{'readings'}},
 565                                                 { 'readingid' => $readings[$ridx],
 566                                                   'group' => $calc_groups->[$ridx] } );
 567                                 }
 568                                 $vstruct->{'reading_roots'} = $roots if $roots;
 569                                 $vstruct->{'reading_types'} = $classes if $classes;
 570                                 $variants->[$oidx] = $vstruct;
 571                         }
 572                 }
 573         }
 574
 575         return { 'variants' => $variants,
 576                          'variant_count' => scalar @$variants,
 577                          'genealogical_count' => $genealogical };
 578 }
 579
 580 sub _prepare_groups {
 581         my( $stemma, @groups ) = @_;
 582         my $aclabel = $stemma->collation->ac_label;
 583
 584         my $index_groupkeys = {};       # Save the order of readings
 585         my $group_indices = {};         # Save the indices that have a given grouping
 586         my $graph_problems = {};        # Save the groupings for the given graph
 587
 588         foreach my $idx ( 0..$#groups ) {
 589                 my $ghash = $groups[$idx];
 590                 my @grouping;
 591                 # Sort the groupings from big to little, and scan for a.c. witnesses
 592                 # that would need an extended graph.
 593                 my @acwits;   # note which AC witnesses crop up at this rank
 594                 my $extant;   # note which witnesses crop up at this rank full stop
 595                 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
 596                         keys %$ghash;
 597                 foreach my $rdg ( @idxkeys ) {
 598                         my @sg = sort @{$ghash->{$rdg}};
 599                         push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
 600                         map { $extant->{$_} = 1 } @sg;
 601                         push( @grouping, \@sg );
 602                 }
 603                 # Save the reading order
 604                 $index_groupkeys->{$idx} = \@idxkeys;
 605
 606                 # Now associate the distinct group with this index
 607                 my $gstr = wit_stringify( \@grouping );
 608                 push( @{$group_indices->{$gstr}}, $idx );
 609
 610                 # Finally, add the group to the list to be calculated for this graph.
 611                 map { s/\Q$aclabel\E$// } @acwits;
 612                 my $graph;
 613                 ## TODO When we get rid of the safe_wit_strings HACK we should also
 614                 ## be able to save the graph here as a dotstring rather than as an
 615                 ## object, thus simplifying life enormously.
 616                 try {
 617                         $graph = $stemma->situation_graph( $extant, \@acwits );
 618                 } catch {
 619                         $DB::single = 1;
 620                         die "Unable to extend graph with @acwits";
 621                 }
 622                 my $graphkey = "$graph || " . wit_stringify( [ sort keys %$extant ] );
 623                 unless( exists $graph_problems->{$graphkey} ) {
 624                         $graph_problems->{$graphkey} = { 'object' => $graph, 'groups' => {} };
 625                 }
 626                 $graph_problems->{$graphkey}->{'groups'}->{wit_stringify( \@grouping )} = \@grouping;
 627         }
 628         say STDERR "Created " . scalar( keys %$graph_problems ). " distinct graph(s)";
 629         return( $index_groupkeys, $group_indices, $graph_problems );
 630 }
 631
 632 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
 633
 634 sub _safe_wit_strings {
 635         my( $graph, $c, $groupings, $witness_map ) = @_;
 636         # Convert the graph to a safe representation and store the conversion.
 637         my $safegraph = Graph->new();
 638         foreach my $n ( $graph->vertices ) {
 639                 my $sn = _safe_witstr( $n );
 640                 if( exists $witness_map->{$sn} ) {
 641                         warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 642                                 if $witness_map->{$sn} ne $n;
 643                 } else {
 644                         $witness_map->{$sn} = $n;
 645                 }
 646                 $safegraph->add_vertex( $sn );
 647                 $safegraph->set_vertex_attributes( $sn,
 648                         $graph->get_vertex_attributes( $n ) );
 649         }
 650         foreach my $e ( $graph->edges ) {
 651                 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
 652                 $safegraph->add_edge( @safe_e );
 653         }
 654
 655         # Now convert the witness groupings to a safe representation.
 656         my $safe_groupings = [];
 657         foreach my $grouping ( @$groupings ) {
 658                 my $safe_grouping = [];
 659                 foreach my $group ( @$grouping ) {
 660                         my $safe_group = [];
 661                         foreach my $n ( @$group ) {
 662                                 my $sn = _safe_witstr( $n );
 663                                 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
 664                                         if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
 665                                 $witness_map->{$sn} = $n;
 666                                 push( @$safe_group, $sn );
 667                         }
 668                         push( @$safe_grouping, $safe_group );
 669                 }
 670                 push( @$safe_groupings, $safe_grouping );
 671         }
 672
 673         # Return it all in the struct we expect.  We have stored the reductions
 674         # in the $witness_map that we were passed.
 675         return { 'graph' => Text::Tradition::Stemma::editable_graph(
 676                                 $safegraph, { 'linesep' => ' ' } ),
 677                          'groupings' => $safe_groupings };
 678 }
 679
 680 sub _safe_witstr {
 681         my $witstr = shift;
 682         $witstr =~ s/\s+/_/g;
 683         $witstr =~ s/[^\w\d-]//g;
 684         return $witstr;
 685 }
 686
 687 sub _desanitize_names {
 688         my( $element, $witness_map ) = @_;
 689         my $result = [];
 690         if( ref( $element ) eq 'ARRAY' ) {
 691                 foreach my $n ( @$element ) {
 692                         push( @$result, _desanitize_names( $n, $witness_map ) );
 693                 }
 694         } elsif( ref( $element ) eq 'HASH' ) {
 695                 my $real_hash = {};
 696                 map { $real_hash->{$_} = _desanitize_names( $element->{$_}, $witness_map ) }
 697                         keys %$element;
 698                 $result = $real_hash;
 699         } elsif( exists $witness_map->{$element} ) {
 700                 $result = $witness_map->{$element}
 701         } else {
 702                 $result = $element;
 703         }
 704         return $result;
 705 }
 706
 707 sub _invert_hash {
 708         my( $hash ) = @_;
 709         my $newhash;
 710         foreach my $k ( keys %$hash ) {
 711                 if( ref( $hash->{$k} ) eq 'ARRAY' ) {
 712                         foreach my $v ( @{$hash->{$k}} ) {
 713                                 $newhash->{$v} = $k;
 714                         }
 715                 } else {
 716                         $newhash->{$hash->{$k}} = $k;
 717                 }
 718         }
 719         return $newhash;
 720 }
 721
 722 ### END HACKERY ###
 723
 724 =head2 analyze_location ( $tradition, $graph, $location_hash )
 725
 726 Given the tradition, its stemma graph, and the solution from the graph solver,
 727 work out the rest of the information we want.  For each reading we need missing,
 728 conflict, reading_parents, independent_occurrence, followed, not_followed,
 729 and follow_unknown.  Alters the location_hash in place.
 730
 731 =cut
 732
 733 sub analyze_location {
 734         my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
 735         my $c = $tradition->collation;
 736
 737         # Make a hash of all known node memberships, and make the subgraphs.
 738         my $contig = {};
 739         my $reading_roots = {};
 740         my $subgraph = {};
 741         my $acstr = $c->ac_label;
 742         my @acwits;
 743
 744         my $NO_IDP;
 745         if( exists $variant_row->{'reading_roots'} ) {
 746                 $reading_roots = delete $variant_row->{'reading_roots'};
 747         } else {
 748                 warn "No reading source information from IDP - proceed at your own risk";
 749                 $NO_IDP = 1;
 750         }
 751
 752         # Note which witnesses positively belong to which group. This information
 753         # comes ultimately from the IDP solver.
 754         # Also make a note of the reading's roots.
 755     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 756         my $rid = $rdghash->{'readingid'};
 757         my @roots;
 758         foreach my $wit ( @{$rdghash->{'group'}} ) {
 759                 $contig->{$wit} = $rid;
 760             if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
 761                 push( @acwits, $1 );
 762             }
 763             if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
 764                 push( @roots, $wit );
 765             }
 766         }
 767                 $rdghash->{'independent_occurrence'} = \@roots;
 768         }
 769
 770         # Get the actual graph we should work with
 771         my $graph;
 772         try {
 773                 # contig contains all extant wits and all hypothetical wits
 774                 # needed to make up the groups.
 775                 $graph = $stemma->situation_graph( $contig, \@acwits );
 776         } catch ( Text::Tradition::Error $e ) {
 777                 die "Could not extend graph with given extant and a.c. witnesses: "
 778                         . $e->message;
 779         } catch {
 780                 die "Could not extend graph with a.c. witnesses @acwits";
 781         }
 782
 783
 784         # Now that we have all the node group memberships, calculate followed/
 785     # non-followed/unknown values for each reading.  Also figure out the
 786     # reading's evident parent(s).
 787     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
 788         my $rid = $rdghash->{'readingid'};
 789         my $rdg = $c->reading( $rid );
 790         my @roots = @{$rdghash->{'independent_occurrence'}};
 791         my @group = @{$rdghash->{'group'}};
 792
 793         # Start figuring things out.
 794         $rdghash->{'followed'} = scalar( @group ) - scalar( @roots );
 795         # Find the parent readings, if any, of this reading.
 796         my $rdgparents = {};
 797         foreach my $wit ( @roots ) {
 798                 # Look in the stemma graph to find this witness's extant or known-reading
 799                 # immediate ancestor(s), and look up the reading that each ancestor olds.
 800                         my @check = $graph->predecessors( $wit );
 801                         while( @check ) {
 802                                 my @next;
 803                                 foreach my $wparent( @check ) {
 804                                         my $preading = $contig->{$wparent};
 805                                         if( $preading && $preading ne $rid ) {
 806                                                 $rdgparents->{$preading} = 1;
 807                                         } else {
 808                                                 push( @next, $graph->predecessors( $wparent ) );
 809                                         }
 810                                 }
 811                                 @check = @next;
 812                         }
 813                 }
 814                 foreach my $p ( keys %$rdgparents ) {
 815                         # Resolve the relationship of the parent to the reading, and
 816                         # save it in our hash.
 817                         my $pobj = $c->reading( $p );
 818                         my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
 819                         my $phash = { 'label' => $prep };
 820                         if( $pobj ) {
 821                                 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 822                                 if( $rel ) {
 823                                         _add_to_hash( $rel, $phash );
 824                                 } elsif( $rdg ) {
 825                                         # First check for a transposed relationship
 826                                         if( $rdg->rank != $pobj->rank ) {
 827                                                 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
 828                                                         next unless $ti->text eq $rdg->text;
 829                                                         $rel = $c->get_relationship( $ti, $pobj );
 830                                                         if( $rel ) {
 831                                                                 _add_to_hash( $rel, $phash, 1 );
 832                                                                 last;
 833                                                         }
 834                                                 }
 835                                                 unless( $rel ) {
 836                                                         foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
 837                                                                 next unless $ti->text eq $pobj->text;
 838                                                                 $rel = $c->get_relationship( $ti, $rdg );
 839                                                                 if( $rel ) {
 840                                                                         _add_to_hash( $rel, $phash, 1 );
 841                                                                         last;
 842                                                                 }
 843                                                         }
 844                                                 }
 845                                         }
 846                                         unless( $rel ) {
 847                                                 # and then check for sheer word similarity.
 848                                                 my $rtext = $rdg->text;
 849                                                 my $ptext = $pobj->text;
 850                                                 if( similar( $rtext, $ptext ) ) {
 851                                                         # say STDERR "Words $rtext and $ptext judged similar";
 852                                                         $phash->{relation} = { type => 'wordsimilar' };
 853                                                 }
 854                                         }
 855                                 } else {
 856                                         $phash->{relation} = { type => 'deletion' };
 857                                 }
 858                                 # Get the attributes of the parent object while we are here
 859                                 $phash->{'text'} = $pobj->text if $pobj;
 860                                 $phash->{'is_nonsense'} = $pobj->is_nonsense;
 861                                 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
 862                         } elsif( $p eq '(omitted)' ) {
 863                                 $phash->{relation} = { type => 'addition' };
 864                         }
 865                         # Save it
 866                         $rdgparents->{$p} = $phash;
 867                 }
 868
 869                 $rdghash->{'reading_parents'} = $rdgparents;
 870
 871                 # Find the number of times this reading was altered, and the number of
 872                 # times we're not sure.
 873                 my( %nofollow, %unknownfollow );
 874                 foreach my $wit ( @{$rdghash->{'group'}} ) {
 875                         foreach my $wchild ( $graph->successors( $wit ) ) {
 876                                 if( $reading_roots->{$wchild} && $contig->{$wchild}
 877                                         && $contig->{$wchild} ne $rid ) {
 878                                         # It definitely changed here.
 879                                         $nofollow{$wchild} = 1;
 880                                 } elsif( !($contig->{$wchild}) ) {
 881                                         # The child is a hypothetical node not definitely in
 882                                         # any group. Answer is unknown.
 883                                         $unknownfollow{$wchild} = 1;
 884                                 } # else it is either in our group, or it is a non-root node in a
 885                                   # known group and therefore is presumed to have its reading from
 886                                   # its group, not this link.
 887                         }
 888                 }
 889                 $rdghash->{'not_followed'} = keys %nofollow;
 890                 $rdghash->{'follow_unknown'} = keys %unknownfollow;
 891
 892                 # Now say whether this reading represents a conflict.
 893                 unless( $variant_row->{'genealogical'} ) {
 894                         my @trueroots;
 895                         if( exists $variant_row->{'classes'} ) {
 896                                 # We have tested for reversions. Use the information.
 897                                 my @reversions;
 898                                 foreach my $rdgroot ( @roots ) {
 899                                         ## TODO This needs IDP to prune itself in order to be
 900                                         ## correct.
 901                                         if( $variant_row->{'classes'}->{$rdgroot} eq 'revert' ) {
 902                                                 push( @reversions, $rdgroot );
 903                                         } else {
 904                                                 push( @trueroots, $rdgroot );
 905                                         }
 906                                 }
 907                                 $rdghash->{'independent_occurrence'} = \@trueroots;
 908                                 $rdghash->{'reversion'} = \@reversions if @reversions;
 909                         } else {
 910                                 @trueroots = @roots;
 911                         }
 912                         $rdghash->{'conflict'} = @trueroots != 1;
 913                 }
 914     }
 915 }
 916
 917 sub _add_to_hash {
 918         my( $rel, $phash, $is_transposed ) = @_;
 919         $phash->{relation} = { type => $rel->type };
 920         $phash->{relation}->{transposed} = 1 if $is_transposed;
 921         $phash->{relation}->{annotation} = $rel->annotation
 922                 if $rel->has_annotation;
 923 }
 924
 925 =head2 similar( $word1, $word2 )
 926
 927 Use Algorithm::Diff to get a sense of how close the words are to each other.
 928 This will hopefully handle substitutions a bit more nicely than Levenshtein.
 929
 930 =cut
 931
 932 #!/usr/bin/env perl
 933
 934 sub similar {
 935         my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
 936         my @let1 = split( '', lc( $word1 ) );
 937         my @let2 = split( '', lc( $word2 ) );
 938         my $diff = Algorithm::Diff->new( \@let1, \@let2 );
 939         my $mag = 0;
 940         while( $diff->Next ) {
 941                 if( $diff->Same ) {
 942                         # Take off points for longer strings
 943                         my $cs = $diff->Range(1) - 2;
 944                         $cs = 0 if $cs < 0;
 945                         $mag -= $cs;
 946                 } elsif( !$diff->Items(1) ) {
 947                         $mag += $diff->Range(2);
 948                 } elsif( !$diff->Items(2) ) {
 949                         $mag += $diff->Range(1);
 950                 } else {
 951                         # Split the difference for substitutions
 952                         my $c1 = $diff->Range(1) || 1;
 953                         my $c2 = $diff->Range(2) || 1;
 954                         my $cd = ( $c1 + $c2 ) / 2;
 955                         $mag += $cd;
 956                 }
 957         }
 958         return ( $mag <= length( $word1 ) / 2 );
 959 }
 960
 961 sub _prune_group {
 962         my( $group, $graph ) = @_;
 963         my $relevant = {};
 964         # Record the existence of the vertices in the group
 965         map { $relevant->{$_} = 1 } @$group;
 966         # Make our subgraph
 967         my $subgraph = $graph->deep_copy;
 968         map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
 969                 $subgraph->vertices;
 970         # Now prune and return the remaining vertices.
 971         _prune_subtree( $subgraph );
 972         # Return the list of vertices and the list of roots.
 973         my $pruned_group = [ sort $subgraph->vertices ];
 974         my $pruned_roots = [ $subgraph->predecessorless_vertices ];
 975         return( $pruned_group, $pruned_roots );
 976 }
 977
 978 sub _prune_subtree {
 979         my( $tree ) = @_;
 980
 981         # Delete lacunose witnesses that have no successors
 982         my @orphan_hypotheticals;
 983         my $ctr = 0;
 984         do {
 985                 die "Infinite loop on leaves" if $ctr > 100;
 986                 @orphan_hypotheticals =
 987                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
 988                                 $tree->successorless_vertices;
 989                 $tree->delete_vertices( @orphan_hypotheticals );
 990                 $ctr++;
 991         } while( @orphan_hypotheticals );
 992
 993         # Delete lacunose roots that have a single successor
 994         my @redundant_root;
 995         $ctr = 0;
 996         do {
 997                 die "Infinite loop on roots" if $ctr > 100;
 998                 @redundant_root =
 999                         grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
1000                                    && $tree->successors( $_ ) == 1 }
1001                                 $tree->predecessorless_vertices;
1002                 $tree->delete_vertices( @redundant_root );
1003                 $ctr++;
1004         } while( @redundant_root );
1005 }
1006
1007 sub _useful_variant {
1008         my( $group_readings, $graph, $acstr ) = @_;
1009
1010         # TODO Decide what to do with AC witnesses
1011
1012         # Sort by group size and return
1013         my $is_useful = 0;
1014         my( @readings, @groups );   # The sorted groups for our answer.
1015         foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1016                 keys %$group_readings ) {
1017                 push( @readings, $rdg );
1018                 push( @groups, $group_readings->{$rdg} );
1019                 if( @{$group_readings->{$rdg}} > 1 ) {
1020                         $is_useful++;
1021                 } else {
1022                         my( $wit ) = @{$group_readings->{$rdg}};
1023                         $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1024                         $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1025                 }
1026         }
1027         if( $is_useful > 1 ) {
1028                 return( \@readings, \@groups );
1029         } else {
1030                 return( [], [] );
1031         }
1032 }
1033
1034 =head2 wit_stringify( $groups )
1035
1036 Takes an array of witness groupings and produces a string like
1037 ['A','B'] / ['C','D','E'] / ['F']
1038
1039 =cut
1040
1041 sub wit_stringify {
1042     my $groups = shift;
1043     my @gst;
1044     # If we were passed an array of witnesses instead of an array of
1045     # groupings, then "group" the witnesses first.
1046     unless( ref( $groups->[0] ) ) {
1047         my $mkgrp = [ $groups ];
1048         $groups = $mkgrp;
1049     }
1050     foreach my $g ( @$groups ) {
1051         push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1052     }
1053     return join( ' / ', @gst );
1054 }
1055
1056 sub _symmdiff {
1057         my( $lista, $listb ) = @_;
1058         my %union;
1059         my %scalars;
1060         map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1061         map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1062         my @set = grep { $union{$_} == 1 } keys %union;
1063         return map { $scalars{$_} } @set;
1064 }
1065
1066 1;
1067
1068 =head1 LICENSE
1069
1070 This package is free software and is provided "as is" without express
1071 or implied warranty.  You can redistribute it and/or modify it under
1072 the same terms as Perl itself.
1073
1074 =head1 AUTHOR
1075
1076 Tara L Andrews E<lt>aurum@cpan.orgE<gt>