use Set::Scalar comparison operator to compare sets. Addresses #24
[scpubgit/stemmatology.git] / analysis / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
03c0a7d7 6use Encode qw/ decode_utf8 encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
a745c3d9 9use JSON qw/ to_json decode_json /;
03c0a7d7 10use LWP::UserAgent;
7e17346f 11use Set::Scalar;
7e17346f 12use Text::Tradition::Analysis::Result;
d71100ed 13use Text::Tradition::Stemma;
d120c995 14use TryCatch;
d71100ed 15
951ddfe8 16use vars qw/ @EXPORT_OK $VERSION /;
a2cf85dd 17@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
98c5430f 18$VERSION = "1.3";
951ddfe8 19
d1348d38 20
98c5430f 21my $DEFAULT_SOLVER_URL = 'http://perf.byzantini.st/cgi-bin/graphcalc.cgi';
7e17346f 22my $unsolved_problems = {};
5c44c598 23
7f52eac8 24=head1 NAME
25
26Text::Tradition::Analysis - functions for stemma analysis of a tradition
27
8943ff68 28=head1 DESCRIPTION
29
30Text::Tradition is a library for representation and analysis of collated
31texts, particularly medieval ones. Where the Collation is the central
32feature of a Tradition, it may also have one or more stemmata associated
33with it, and these stemmata may be analyzed. This package provides the
34following modules:
35
36=over 4
37
38=item * L<Text::Tradition::HasStemma> - a role that will be composed into
39Text::Tradition objects, providing the ability for Text::Tradition::Stemma
40objects to be associated with them.
41
42=item * L<Text::Tradition::Stemma> - an object class that represents stemma
43hypotheses, both rooted (with a single archetype) and unrooted (e.g.
44phylogenetic trees).
45
46=item * Text::Tradition::Analysis (this package). Provides functions for
47the analysis of a given stemma against the collation within a given
48Tradition.
49
50=back
51
7f52eac8 52=head1 SYNOPSIS
53
54 use Text::Tradition;
55 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
56 my $t = Text::Tradition->new(
57 'name' => 'this is a text',
58 'input' => 'TEI',
59 'file' => '/path/to/tei_parallel_seg_file.xml' );
60 $t->add_stemma( 'dotfile' => $stemmafile );
61
62 my $variant_data = run_analysis( $tradition );
7f52eac8 63
7f52eac8 64=head1 SUBROUTINES
65
88a6bac5 66=head2 run_analysis( $tradition, %opts )
7f52eac8 67
88a6bac5 68Runs the analysis described in analyze_variant_location on every location in the
69collation of the given tradition, with the given options. These include:
7f52eac8 70
88a6bac5 71=over 4
72
73=item * stemma_id - Specify which of the tradition's stemmata to use. Default
74is 0 (i.e. the first).
75
76=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
77
78=item * merge_types - Specify a list of relationship types, where related readings
79should be treated as identical for the purposes of analysis.
80
ffa22d6f 81=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
82
88a6bac5 83=back
7f52eac8 84
85=begin testing
86
87use Text::Tradition;
88use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
89
90my $datafile = 't/data/florilegium_tei_ps.xml';
91my $tradition = Text::Tradition->new( 'input' => 'TEI',
92 'name' => 'test0',
93 'file' => $datafile );
94my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
95is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
96
f00cefe8 97my %expected_genealogical = (
a44aaf2a 98 1 => 0,
f00cefe8 99 2 => 1,
a44aaf2a 100 3 => 0,
101 5 => 0,
102 7 => 0,
103 8 => 0,
104 10 => 0,
f00cefe8 105 13 => 1,
a44aaf2a 106 33 => 0,
107 34 => 0,
108 37 => 0,
109 60 => 0,
f00cefe8 110 81 => 1,
a44aaf2a 111 84 => 0,
112 87 => 0,
113 101 => 0,
114 102 => 0,
f00cefe8 115 122 => 1,
a44aaf2a 116 157 => 0,
f00cefe8 117 166 => 1,
118 169 => 1,
a44aaf2a 119 200 => 0,
f00cefe8 120 216 => 1,
121 217 => 1,
122 219 => 1,
123 241 => 1,
124 242 => 1,
125 243 => 1,
126);
127
7e17346f 128my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 129my $c = $tradition->collation;
f00cefe8 130foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 131 # Account for rows that used to be "not useful"
132 unless( exists $expected_genealogical{$row->{'id'}} ) {
133 $expected_genealogical{$row->{'id'}} = 1;
134 }
18f48b82 135 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
136 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 137 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 138 # Check that we have the right row with the right groups
139 my $rank = $row->{'id'};
140 foreach my $rdghash ( @{$row->{'readings'}} ) {
141 # Skip 'readings' that aren't really
142 next unless $c->reading( $rdghash->{'readingid'} );
143 # Check the rank
144 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
145 "Got correct reading rank" );
146 # Check the witnesses
147 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
148 my @sgrp = sort @{$rdghash->{'group'}};
149 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
150 }
f00cefe8 151}
a44aaf2a 152is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 153# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 154
155=end testing
156
157=cut
158
d71100ed 159sub run_analysis {
88a6bac5 160 my( $tradition, %opts ) = @_;
f00cefe8 161 my $c = $tradition->collation;
7e17346f 162 my $aclabel = $c->ac_label;
88a6bac5 163
164 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 165 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 166 my $collapse = Set::Scalar->new();
167 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
168 $collapse->insert( @{$opts{'merge_types'}} );
169 } elsif( $opts{'merge_types'} ) {
170 $collapse->insert( $opts{'merge_types'} );
171 }
7e17346f 172
98c5430f 173 # If we have specified a local lookup DB for graph calculation results,
174 # make sure it exists and connect to it.
03c0a7d7 175 my $dir;
98c5430f 176 if ( exists $opts{'calcdsn'} ) {
8943ff68 177 eval { require Text::Tradition::Directory };
178 if( $@ ) {
179 throw( "Could not instantiate a directory for " . $opts{'calcdsn'}
180 . ": $@" );
181 }
98c5430f 182 $opts{'dir'} = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
183 } elsif( !exists $opts{'solver_url'} ) {
184 $opts{'solver_url'} = $DEFAULT_SOLVER_URL;
03c0a7d7 185 }
88a6bac5 186
187 # Get the stemma
188 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 189
4ce27d42 190 # Figure out which witnesses we are working with - that is, the ones that
191 # appear both in the stemma and in the tradition. All others are 'lacunose'
192 # for our purposes.
7e17346f 193 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
194 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
195 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
196 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 197
198 # Find and mark 'common' ranks for exclusion, unless they were
199 # explicitly specified.
200 unless( @ranks ) {
201 my %common_rank;
a44aaf2a 202 foreach my $rdg ( $c->common_readings ) {
88a6bac5 203 $common_rank{$rdg->rank} = 1;
204 }
205 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 206 }
7f52eac8 207
88a6bac5 208 # Group the variants to send to the solver
209 my @groups;
f629cb3b 210 my @use_ranks;
a44aaf2a 211 my %lacunae;
94654e27 212 my $moved = {};
88a6bac5 213 foreach my $rank ( @ranks ) {
7e17346f 214 my $missing = $lacunose->clone();
e23225e7 215 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 216 # Filter out any empty rankgroups
217 # (e.g. from the later rank for a transposition)
218 next unless keys %$rankgroup;
7e17346f 219 # Get the graph for this rankgroup
220 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 221 if( $opts{'exclude_type1'} ) {
222 # Check to see whether this is a "useful" group.
7e17346f 223 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 224 }
f629cb3b 225 push( @use_ranks, $rank );
7e17346f 226 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 227 $lacunae{$rank} = $missing;
d71100ed 228 }
4ce27d42 229 # Run the solver
b7bd7aa5 230 my $answer;
231 try {
98c5430f 232 $answer = solve_variants( \%opts, @groups );
b7bd7aa5 233 } catch ( Text::Tradition::Error $e ) {
234 if( $e->message =~ /IDP/ ) {
235 # Something is wrong with the solver; make the variants table anyway
236 $answer->{'variants'} = [];
237 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
238 @groups;
239 } else {
240 # Something else is wrong; error out.
241 $e->throw;
242 }
243 }
fae07016 244
88a6bac5 245 # Do further analysis on the answer
a44aaf2a 246 my $conflict_count = 0;
7e17346f 247 my $reversion_count = 0;
f629cb3b 248 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 249 my $location = $answer->{'variants'}->[$idx];
250 # Add the rank back in
94654e27 251 my $rank = $use_ranks[$idx];
252 $location->{'id'} = $rank;
7234b01d 253 # Note what our lacunae are
f629cb3b 254 my %lmiss;
7234b01d 255 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 256 $location->{'missing'} = [ keys %lmiss ];
257
88a6bac5 258 # Run the extra analysis we need.
428bcf0b 259 ## TODO We run through all the variants in this call, so
260 ## why not add the reading data there instead of here below?
7e17346f 261 my $graph = $groups[$idx]->{graph};
262 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 263
638e2a95 264 my @layerwits;
7234b01d 265 # Do the final post-analysis tidying up of the data.
a44aaf2a 266 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 267 $conflict_count++ if $rdghash->{'is_conflict'};
268 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 269 # Add the reading text back in, setting display value as needed
a44aaf2a 270 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 271 if( $rdg ) {
272 $rdghash->{'text'} = $rdg->text .
273 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
a445ce40 274 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
275 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
276 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
277 }
94654e27 278 }
f629cb3b 279 # Remove lacunose witnesses from this reading's list now that the
7234b01d 280 # analysis is done
f629cb3b 281 my @realgroup;
7234b01d 282 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 283 $rdghash->{'group'} = \@realgroup;
638e2a95 284 # Note any layered witnesses that appear in this group
285 foreach( @realgroup ) {
286 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
287 push( @layerwits, $1 );
288 }
289 }
a44aaf2a 290 }
638e2a95 291 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 292 }
a44aaf2a 293 $answer->{'conflict_count'} = $conflict_count;
7e17346f 294 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 295
88a6bac5 296 return $answer;
d71100ed 297}
298
e23225e7 299=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 300
301Groups the variants at the given $rank of the collation, treating any
e23225e7 302relationships in the set $merge_relationship_types as equivalent.
303$lacunose should be a reference to an array, to which the sigla of lacunose
304witnesses at this rank will be appended; $transposed should be a reference
305to a hash, wherein the identities of transposed readings and their
306relatives will be stored.
7f52eac8 307
ffa22d6f 308Returns a hash $group_readings where $rdg is attested by the witnesses listed
309in $group_readings->{$rdg}.
7f52eac8 310
311=cut
312
313# Return group_readings, groups, lacunose
d1348d38 314sub group_variants {
94654e27 315 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 316 my $c = $tradition->collation;
335a62ef 317 my $aclabel = $c->ac_label;
d120c995 318 my $table = $c->alignment_table;
7f52eac8 319 # Get the alignment table readings
320 my %readings_at_rank;
7e17346f 321 my $check_for_gaps = Set::Scalar->new();
94654e27 322 my %moved_wits;
d120c995 323 my $has_transposition;
608bbd95 324 my @transp_acgap;
d120c995 325 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 326 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 327 my $wit = $tablewit->{'witness'};
ffa22d6f 328 # Exclude the witness if it is "lacunose" which if we got here
329 # means "not in the stemma".
7e17346f 330 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 331 # Note if the witness is actually in a lacuna
7f52eac8 332 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 333 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 334 # Otherwise the witness either has a positive reading...
7f52eac8 335 } elsif( $rdg ) {
94654e27 336 # If the reading has been counted elsewhere as a transposition, ignore it.
337 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 338 # TODO Does this cope with three-way transpositions?
94654e27 339 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
340 next;
341 }
342 # Otherwise, record it...
343 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
344 # ...and grab any transpositions, and their relations.
ee26c4d9 345 my @transp = grep { $_->rank != $rank } _all_related( $rdg->{'t'} );
94654e27 346 foreach my $trdg ( @transp ) {
d120c995 347 next if exists $readings_at_rank{$trdg->id};
348 $has_transposition = 1;
349 my @affected_wits = _table_witnesses(
608bbd95 350 $table, $trdg->rank, $trdg, $lacunose, $aclabel );
d120c995 351 next unless @affected_wits;
352 map { $moved_wits{$_} = 1 } @affected_wits;
608bbd95 353 my @thisloc_wits = _table_witnesses( $table, $rank, $rdg->{'t'},
f9595796 354 $lacunose, $aclabel );
355 # Check to see if our affected wits have layers that do something
356 # wacky.
357 my %transploc_gaps;
358 map { $transploc_gaps{$_} = 1 }
608bbd95 359 _table_witnesses( $table, $trdg->rank, undef, $lacunose, $aclabel );
f9595796 360 foreach my $aw ( @affected_wits ) {
608bbd95 361 if( $transploc_gaps{$aw.$aclabel} ) {
362 push( @thisloc_wits, $aw.$aclabel );
363 push( @transp_acgap, $aw.$aclabel );
364 }
f9595796 365 }
366 # Record which witnesses we should count as already analyzed when we
367 # get to the transposed reading's own rank.
368 $transposed->{$trdg->id} = \@thisloc_wits;
94654e27 369 $readings_at_rank{$trdg->id} = $trdg;
370 }
371 # ...or it is empty, ergo a gap.
7f52eac8 372 } else {
7e17346f 373 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 374 }
375 }
608bbd95 376 # Push all the transposition layer gaps onto our list
377 $check_for_gaps->insert( @transp_acgap );
378 # Now remove from our 'gaps' any witnesses known to have been dealt with elsewhere.
7e17346f 379 my $gap_wits = Set::Scalar->new();
380 map { _add_to_witlist( $_, $gap_wits, $aclabel )
381 unless $moved_wits{$_} } $check_for_gaps->members;
382
383 # Group the readings, collapsing groups by relationship if needed.
d120c995 384 my $grouped_readings = {};
4ce27d42 385 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 386 # Skip readings that have been collapsed into others.
d120c995 387 next if exists $grouped_readings->{$rdg->id}
388 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 389 # Get the witness list, including from readings collapsed into this one.
608bbd95 390 my @wits = _table_witnesses( $table, $rdg->rank, $rdg, $lacunose, $aclabel );
e23225e7 391 if( $collapse && $collapse->size ) {
392 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 393 foreach my $other ( $rdg->related_readings( $filter ) ) {
608bbd95 394 my @otherwits = _table_witnesses( $table, $other->rank, $other, $lacunose, $aclabel );
fae07016 395 push( @wits, @otherwits );
d120c995 396 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 397 }
398 }
7e17346f 399 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 400 }
7e17346f 401 if( $gap_wits->members ) {
402 $grouped_readings->{'(omitted)'} = $gap_wits;
403 }
404
7f52eac8 405 # Get rid of our collapsed readings
7e17346f 406 map { delete $grouped_readings->{$_} if(
407 $grouped_readings->{$_} eq 'COLLAPSE'
408 || $grouped_readings->{$_}->is_empty ) }
409 keys %$grouped_readings;
d120c995 410
411 # If something was transposed, check the groups for doubled-up readings
412 if( $has_transposition ) {
62a39b8f 413 # print STDERR "Group for rank $rank:\n";
414 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
415 # keys %$grouped_readings;
d120c995 416 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
417 }
7f52eac8 418
4ce27d42 419 # Return the result
d120c995 420 return $grouped_readings;
421}
422
ee26c4d9 423sub _all_related {
f9b9b580 424 # Except by repetition
ee26c4d9 425 my $rdg = shift;
426 my $c = $rdg->collation;
f9b9b580 427 my @check = ( $rdg );
428 my %seen;
429 while( @check ) {
430 my @next;
431 foreach my $ck ( @check ) {
432 $seen{"$ck"} = 1;
433 push( @next, grep { !$seen{"$_"} }
434 $ck->related_readings( sub { $_[0]->type ne 'repetition' } ) );
435 }
436 @check = @next;
437 }
438
439
440 my @all = map { $c->reading( $_ ) } keys %seen;
ee26c4d9 441 return @all;
442}
443
444
d120c995 445# Helper function to query the alignment table for all witnesses (a.c. included)
446# that have a given reading at its rank.
447sub _table_witnesses {
608bbd95 448 my( $table, $rank, $trdg, $lacunose, $aclabel ) = @_;
449 my $tableidx = $rank - 1;
7e17346f 450 my $has_reading = Set::Scalar->new();
d120c995 451 foreach my $row ( @{$table->{'alignment'}} ) {
452 my $wit = $row->{'witness'};
7e17346f 453 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 454 my $rdg = $row->{'tokens'}->[$tableidx];
f9595796 455 if( $trdg ) {
456 # We have some positive reading we want.
457 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
458 if( $trdg->is_lacuna ) {
459 _add_to_witlist( $wit, $has_reading, $aclabel )
460 if $rdg->{'t'}->is_lacuna;
461 } else {
462 _add_to_witlist( $wit, $has_reading, $aclabel )
463 if $rdg->{'t'}->id eq $trdg->id;
464 }
465 } else {
466 # We want the omissions.
467 next if exists $rdg->{'t'} && defined $rdg->{'t'};
468 _add_to_witlist( $wit, $has_reading, $aclabel )
469 }
d120c995 470 }
7e17346f 471 return $has_reading->members;
472}
473
474# Helper function to see if a witness is lacunose even if we are asking about
475# the a.c. version
476sub _is_lacunose {
477 my ( $wit, $lac, $acstr ) = @_;
478 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
479 $wit = $1;
480 }
481 return $lac->has( $wit );
d1348d38 482}
483
335a62ef 484# Helper function to ensure that X and X a.c. never appear in the same list.
485sub _add_to_witlist {
486 my( $wit, $list, $acstr ) = @_;
335a62ef 487 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 488 # Don't add X a.c. if we already have X
489 return if $list->has( $1 );
335a62ef 490 } else {
7e17346f 491 # Delete X a.c. if we are about to add X
492 $list->delete( $wit.$acstr );
335a62ef 493 }
7e17346f 494 $list->insert( $wit );
335a62ef 495}
496
d120c995 497sub _check_transposed_consistency {
498 my( $c, $rank, $transposed, $groupings ) = @_;
499 my %seen_wits;
500 my %thisrank;
501 # Note which readings are actually at this rank, and which witnesses
502 # belong to which reading.
503 foreach my $rdg ( keys %$groupings ) {
504 my $rdgobj = $c->reading( $rdg );
505 # Count '(omitted)' as a reading at this rank
506 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
507 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
508 }
509 # Our work is done if we have no witness belonging to more than one
510 # reading.
511 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
512 return unless @doubled;
513 # If we have a symmetric related transposition, drop the non-rank readings.
514 if( @doubled == scalar keys %seen_wits ) {
515 foreach my $rdg ( keys %$groupings ) {
516 if( !$thisrank{$rdg} ) {
9289e257 517 # Groupings are Set::Scalar objects so we can compare them outright.
518 my ( $matched ) = grep { $groupings->{$rdg} == $groupings->{$_} }
d120c995 519 keys %thisrank;
520 delete $groupings->{$rdg};
521 # If we found a group match, assume there is a symmetry happening.
522 # TODO think more about this
62a39b8f 523 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 524 unless( $matched ) {
525 delete $transposed->{$rdg};
526 warn "Found problem in evident symmetry with reading $rdg";
527 }
528 }
529 }
530 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
531 } else {
532 foreach my $dup ( @doubled ) {
533 foreach my $rdg ( @{$seen_wits{$dup}} ) {
534 next if $thisrank{$rdg};
535 next unless exists $groupings->{$rdg};
62a39b8f 536 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 537 delete $groupings->{$rdg};
538 delete $transposed->{$rdg};
539 }
540 }
541 # and put any now-orphaned readings into an 'omitted' reading.
542 foreach my $wit ( keys %seen_wits ) {
543 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 544 $groupings->{'(omitted)'} = Set::Scalar->new()
545 unless exists $groupings->{'(omitted)'};
d120c995 546 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
547 }
548 }
549 }
550}
551
7e17346f 552# For the given grouping, return its situation graph based on the stemma.
553sub _graph_for_grouping {
554 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
555 my $acwits = [];
556 my $extant = {};
557 foreach my $gs ( values %$grouping ) {
558 map {
559 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
560 push( @$acwits, $1 ) unless $lacunose->has( $1 );
561 } else {
562 $extant->{$_} = 1 unless $lacunose->has( $_ );
563 }
564 } $gs->members;
565 }
566 my $graph;
567 try {
568 # contig contains all extant wits and all hypothetical wits
569 # needed to make up the groups.
ace5fce5 570 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
7e17346f 571 } catch ( Text::Tradition::Error $e ) {
03c0a7d7 572 throw( "Could not extend graph with given extant and a.c. witnesses: "
573 . $e->message );
7e17346f 574 } catch {
03c0a7d7 575 throw( "Could not extend graph with a.c. witnesses @$acwits" );
7e17346f 576 }
577 return $graph;
578}
88a6bac5 579
7e17346f 580=head2 solve_variants( $calcdir, @groups )
581
582Looks up the set of groups in the answers provided by the external graph solver
583service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 584
88a6bac5 585The answer has the form
586 { "variants" => [ array of variant location structures ],
587 "variant_count" => total,
588 "conflict_count" => number of conflicts detected,
589 "genealogical_count" => number of solutions found }
590
591=cut
592
593sub solve_variants {
98c5430f 594 my( $opts, @groups ) = @_;
03c0a7d7 595
98c5430f 596 # Are we using a local result directory?
597 my $dir = $opts->{dir};
335a62ef 598
03c0a7d7 599 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
600 ## object so that we can send it off for IDP lookup.
7e17346f 601 my $variants = [];
602 my $genealogical = 0; # counter
03c0a7d7 603 # TODO Optimize for unique graph problems
554e2e7d 604 my %problems;
7e17346f 605 foreach my $graphproblem ( @groups ) {
03c0a7d7 606 # Construct the calc result key and look up its answer
607 my $problem = Text::Tradition::Analysis::Result->new(
608 graph => $graphproblem->{'graph'},
609 setlist => [ values %{$graphproblem->{'grouping'}} ] );
554e2e7d 610 if( exists $problems{$problem->object_key} ) {
611 $problem = $problems{$problem->object_key};
612 } else {
613 $problems{$problem->object_key} = $problem;
614 }
615 $graphproblem->{'object'} = $problem;
03c0a7d7 616 }
617
554e2e7d 618 my %results;
03c0a7d7 619 if( $dir ) {
620 my $scope = $dir->new_scope;
554e2e7d 621 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
622 } else {
98c5430f 623 # print STDERR "Using solver at " . $opts->{solver_url} . "\n";
554e2e7d 624 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
625 [ values %problems ] );
03c0a7d7 626 # Send it off and get the result
627 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
628 my $ua = LWP::UserAgent->new();
98c5430f 629 my $resp = $ua->post( $opts->{solver_url}, 'Content-Type' => 'application/json',
03c0a7d7 630 'Content' => $json );
a745c3d9 631 my $answer;
03c0a7d7 632 if( $resp->is_success ) {
a745c3d9 633 $answer = decode_json( $resp->content );
03c0a7d7 634 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
635 } else {
636 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
637 . "; cannot run graph analysis" );
638 }
639 # One more sanity check
640 throw( "Something went wrong with answer symmetricity" )
554e2e7d 641 unless keys( %problems ) == @$answer;
03c0a7d7 642 # Convert the results
554e2e7d 643 foreach my $a ( @$answer ) {
644 my $r = Text::Tradition::Analysis::Result->new( $a );
645 $results{$r->object_key} = $r;
646 }
03c0a7d7 647 }
648
649 # We now have a single JSON-encoded Result object per problem sent. Fold its
650 # answers into our variant info structure.
554e2e7d 651 foreach my $graphproblem ( @groups ) {
652 my $result = $results{$graphproblem->{'object'}->object_key}
653 || $graphproblem->{'object'};
03c0a7d7 654
7e17346f 655 # Initialize the result structure for this graph problem
b7bd7aa5 656 my $vstruct;
657 if( $result->status eq 'OK' ) {
658 $vstruct = { readings => [] };
659 push( @$variants, $vstruct );
660 } else {
661 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
7e17346f 662 next;
5c44c598 663 }
b7bd7aa5 664
7e17346f 665 # 1. Did the group evaluate as genealogical?
03c0a7d7 666 $vstruct->{genealogical} = $result->is_genealogical;
667 $genealogical++ if $result->is_genealogical;
7e17346f 668
669 # 2. What are the calculated minimum groupings for each variant loc?
670 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
671 my $inputset = $graphproblem->{grouping}->{$rid};
03c0a7d7 672 my $minset = $result->minimum_grouping_for( $inputset );
7e17346f 673 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 674 }
7e17346f 675
676 # 3. What are the sources and classes calculated for each witness?
03c0a7d7 677 $vstruct->{witcopy_types} = { $result->classes };
7e17346f 678 $vstruct->{reading_roots} = {};
03c0a7d7 679 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
7e17346f 680
88a6bac5 681 }
682
683 return { 'variants' => $variants,
684 'variant_count' => scalar @$variants,
685 'genealogical_count' => $genealogical };
686}
687
b7bd7aa5 688sub _init_unsolved {
689 my( $graphproblem, $status ) = @_;
690 my $vstruct = { 'readings' => [] };
691 $vstruct->{'unsolved'} = $status;
692 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
693 push( @{$vstruct->{readings}}, { readingid => $rid,
694 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
695 }
696 return $vstruct;
697}
698
fae07016 699=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 700
fae07016 701Given the tradition, its stemma graph, and the solution from the graph solver,
702work out the rest of the information we want. For each reading we need missing,
5c44c598 703conflict, reading_parents, independent_occurrence, followed, not_followed,
704and follow_unknown. Alters the location_hash in place.
7f52eac8 705
706=cut
732152b1 707
fae07016 708sub analyze_location {
7e17346f 709 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 710 my $c = $tradition->collation;
fae07016 711
03c0a7d7 712 if( exists $variant_row->{'unsolved'} ) {
713 return;
714 }
715 my $reading_roots = delete $variant_row->{'reading_roots'};
716 my $classinfo = delete $variant_row->{'witcopy_types'};
717
fae07016 718 # Make a hash of all known node memberships, and make the subgraphs.
719 my $contig = {};
fae07016 720 my $subgraph = {};
638e2a95 721 my $acstr = $c->ac_label;
722 my @acwits;
5c44c598 723
5c44c598 724 # Note which witnesses positively belong to which group. This information
725 # comes ultimately from the IDP solver.
726 # Also make a note of the reading's roots.
fae07016 727 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
728 my $rid = $rdghash->{'readingid'};
5c44c598 729 my @roots;
638e2a95 730 foreach my $wit ( @{$rdghash->{'group'}} ) {
731 $contig->{$wit} = $rid;
732 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
733 push( @acwits, $1 );
734 }
5c44c598 735 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
736 push( @roots, $wit );
737 }
638e2a95 738 }
5c44c598 739 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 740 }
7e17346f 741
fae07016 742 # Now that we have all the node group memberships, calculate followed/
bebec0e9 743 # non-followed/unknown values for each reading. Also figure out the
744 # reading's evident parent(s).
745 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 746 my $rid = $rdghash->{'readingid'};
428bcf0b 747 my $rdg = $c->reading( $rid );
5c44c598 748 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 749 my @reversions;
750 if( $classinfo ) {
751 @reversions = grep { $classinfo->{$_} eq 'revert' }
752 $rdghash->{'group'}->members;
be590045 753 $rdghash->{'reversions'} = \@reversions;
7e17346f 754 }
5c44c598 755 my @group = @{$rdghash->{'group'}};
fae07016 756
757 # Start figuring things out.
7e17346f 758 $rdghash->{'followed'} = scalar( @group )
759 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 760 # Find the parent readings, if any, of this reading.
7e17346f 761 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 762 # Work out relationships between readings and their non-followed parent.
763 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
be590045 764 $rdghash->{'source_parents'} = $sourceparents;
e23225e7 765
766 if( @reversions ) {
767 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
768 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
769 $rdghash->{'reversion_parents'} = $revparents;
770 }
bebec0e9 771
772 # Find the number of times this reading was altered, and the number of
773 # times we're not sure.
774 my( %nofollow, %unknownfollow );
5c44c598 775 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 776 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 777 if( $reading_roots->{$wchild} && $contig->{$wchild}
778 && $contig->{$wchild} ne $rid ) {
bebec0e9 779 # It definitely changed here.
780 $nofollow{$wchild} = 1;
781 } elsif( !($contig->{$wchild}) ) {
782 # The child is a hypothetical node not definitely in
783 # any group. Answer is unknown.
784 $unknownfollow{$wchild} = 1;
5c44c598 785 } # else it is either in our group, or it is a non-root node in a
786 # known group and therefore is presumed to have its reading from
787 # its group, not this link.
bebec0e9 788 }
789 }
790 $rdghash->{'not_followed'} = keys %nofollow;
791 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 792
793 # Now say whether this reading represents a conflict.
794 unless( $variant_row->{'genealogical'} ) {
7e17346f 795 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 796 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 797 }
798 }
799}
800
801sub _find_reading_parents {
802 my( $rid, $graph, $contig, @list ) = @_;
803 my $parenthash = {};
804 foreach my $wit ( @list ) {
805 # Look in the stemma graph to find this witness's extant or known-reading
806 # immediate ancestor(s), and look up the reading that each ancestor holds.
807 my @check = $graph->predecessors( $wit );
808 while( @check ) {
809 my @next;
810 foreach my $wparent( @check ) {
811 my $preading = $contig->{$wparent};
812 if( $preading && $preading ne $rid ) {
813 $parenthash->{$preading} = 1;
814 } else {
815 push( @next, $graph->predecessors( $wparent ) );
816 }
817 }
818 @check = @next;
819 }
820 }
821 return $parenthash;
822}
823
824sub _resolve_parent_relationships {
825 my( $c, $rid, $rdg, $rdgparents ) = @_;
826 foreach my $p ( keys %$rdgparents ) {
827 # Resolve the relationship of the parent to the reading, and
828 # save it in our hash.
829 my $pobj = $c->reading( $p );
830 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
831 my $phash = { 'label' => $prep };
832 if( $pobj ) {
4e49efd7 833 # Get the attributes of the parent object while we are here
834 $phash->{'text'} = $pobj->text if $pobj;
835 if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
836 $phash->{'is_nonsense'} = $pobj->is_nonsense;
837 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
838 }
839 # Now look at the relationship
7e17346f 840 my $rel = $c->get_relationship( $p, $rid );
3bce8f64 841 if( $rel && $rel->type eq 'collated' ) {
842 $rel = undef;
843 }
7e17346f 844 if( $rel ) {
845 _add_to_hash( $rel, $phash );
846 } elsif( $rdg ) {
847 # First check for a transposed relationship
848 if( $rdg->rank != $pobj->rank ) {
849 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
850 next unless $ti->text eq $rdg->text;
851 $rel = $c->get_relationship( $ti, $pobj );
852 if( $rel ) {
853 _add_to_hash( $rel, $phash, 1 );
854 last;
855 }
856 }
857 unless( $rel ) {
858 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
859 next unless $ti->text eq $pobj->text;
860 $rel = $c->get_relationship( $ti, $rdg );
861 if( $rel ) {
862 _add_to_hash( $rel, $phash, 1 );
863 last;
864 }
865 }
5c44c598 866 }
867 }
7e17346f 868 unless( $rel ) {
869 # and then check for sheer word similarity.
870 my $rtext = $rdg->text;
871 my $ptext = $pobj->text;
872 if( similar( $rtext, $ptext ) ) {
873 # say STDERR "Words $rtext and $ptext judged similar";
874 $phash->{relation} = { type => 'wordsimilar' };
875 }
876 }
5c44c598 877 } else {
7e17346f 878 $phash->{relation} = { type => 'deletion' };
5c44c598 879 }
7e17346f 880 } elsif( $p eq '(omitted)' ) {
ee26c4d9 881 # Check to see if the reading in question is a repetition.
882 my @reps = $rdg->related_readings( 'repetition' );
883 if( @reps ) {
884 $phash->{relation} = { type => 'repetition',
885 annotation => "of reading @reps" };
886 } else {
887 $phash->{relation} = { type => 'addition' };
888 }
7e17346f 889 }
890 # Save it
891 $rdgparents->{$p} = $phash;
892 }
d71100ed 893}
894
428bcf0b 895sub _add_to_hash {
896 my( $rel, $phash, $is_transposed ) = @_;
897 $phash->{relation} = { type => $rel->type };
898 $phash->{relation}->{transposed} = 1 if $is_transposed;
899 $phash->{relation}->{annotation} = $rel->annotation
900 if $rel->has_annotation;
4e49efd7 901 # Get all the relevant relationship info.
902 foreach my $prop ( qw/ non_independent is_significant / ) {
903 $phash->{relation}->{$prop} = $rel->$prop;
904 }
905 # Figure out if the variant was judged revertible.
906 my $is_a = $rel->reading_a eq $phash->{text};
907 $phash->{revertible} = $is_a
908 ? $rel->a_derivable_from_b : $rel->b_derivable_from_a;
428bcf0b 909}
910
911=head2 similar( $word1, $word2 )
912
913Use Algorithm::Diff to get a sense of how close the words are to each other.
914This will hopefully handle substitutions a bit more nicely than Levenshtein.
915
916=cut
917
918#!/usr/bin/env perl
919
920sub similar {
921 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
922 my @let1 = split( '', lc( $word1 ) );
923 my @let2 = split( '', lc( $word2 ) );
924 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
925 my $mag = 0;
926 while( $diff->Next ) {
927 if( $diff->Same ) {
928 # Take off points for longer strings
929 my $cs = $diff->Range(1) - 2;
930 $cs = 0 if $cs < 0;
931 $mag -= $cs;
932 } elsif( !$diff->Items(1) ) {
933 $mag += $diff->Range(2);
934 } elsif( !$diff->Items(2) ) {
935 $mag += $diff->Range(1);
936 } else {
937 # Split the difference for substitutions
938 my $c1 = $diff->Range(1) || 1;
939 my $c2 = $diff->Range(2) || 1;
940 my $cd = ( $c1 + $c2 ) / 2;
941 $mag += $cd;
942 }
943 }
944 return ( $mag <= length( $word1 ) / 2 );
945}
946
5be0cdeb 947sub _useful_variant {
7e17346f 948 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 949
950 # Sort by group size and return
951 my $is_useful = 0;
7e17346f 952 foreach my $rdg ( keys %$rankgroup ) {
953 my @wits = $rankgroup->{$rdg}->members;
954 if( @wits > 1 ) {
5be0cdeb 955 $is_useful++;
956 } else {
7e17346f 957 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
958 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 959 }
960 }
7e17346f 961 return $is_useful > 1;
5be0cdeb 962}
963
7f52eac8 964=head2 wit_stringify( $groups )
965
966Takes an array of witness groupings and produces a string like
967['A','B'] / ['C','D','E'] / ['F']
d71100ed 968
7f52eac8 969=cut
d71100ed 970
971sub wit_stringify {
972 my $groups = shift;
973 my @gst;
974 # If we were passed an array of witnesses instead of an array of
975 # groupings, then "group" the witnesses first.
976 unless( ref( $groups->[0] ) ) {
977 my $mkgrp = [ $groups ];
978 $groups = $mkgrp;
979 }
980 foreach my $g ( @$groups ) {
981 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
982 }
983 return join( ' / ', @gst );
984}
7f52eac8 985
7f52eac8 9861;
987
a745c3d9 988sub throw {
989 Text::Tradition::Error->throw(
990 'ident' => 'Analysis error',
991 'message' => $_[0],
992 );
993}
994
7f52eac8 995=head1 LICENSE
996
997This package is free software and is provided "as is" without express
998or implied warranty. You can redistribute it and/or modify it under
999the same terms as Perl itself.
1000
1001=head1 AUTHOR
1002
1003Tara L Andrews E<lt>aurum@cpan.orgE<gt>