move analysis script; update Analysis.pm for new relationship regime
[scpubgit/stemmatology.git] / analysis / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
03c0a7d7 6use Encode qw/ decode_utf8 encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
a745c3d9 9use JSON qw/ to_json decode_json /;
03c0a7d7 10use LWP::UserAgent;
7e17346f 11use Set::Scalar;
7e17346f 12use Text::Tradition::Analysis::Result;
d71100ed 13use Text::Tradition::Stemma;
d120c995 14use TryCatch;
d71100ed 15
951ddfe8 16use vars qw/ @EXPORT_OK $VERSION /;
a2cf85dd 17@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
37bf09f4 18$VERSION = "1.1";
951ddfe8 19
d1348d38 20
03c0a7d7 21my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
7e17346f 22my $unsolved_problems = {};
5c44c598 23
7f52eac8 24=head1 NAME
25
26Text::Tradition::Analysis - functions for stemma analysis of a tradition
27
8943ff68 28=head1 DESCRIPTION
29
30Text::Tradition is a library for representation and analysis of collated
31texts, particularly medieval ones. Where the Collation is the central
32feature of a Tradition, it may also have one or more stemmata associated
33with it, and these stemmata may be analyzed. This package provides the
34following modules:
35
36=over 4
37
38=item * L<Text::Tradition::HasStemma> - a role that will be composed into
39Text::Tradition objects, providing the ability for Text::Tradition::Stemma
40objects to be associated with them.
41
42=item * L<Text::Tradition::Stemma> - an object class that represents stemma
43hypotheses, both rooted (with a single archetype) and unrooted (e.g.
44phylogenetic trees).
45
46=item * Text::Tradition::Analysis (this package). Provides functions for
47the analysis of a given stemma against the collation within a given
48Tradition.
49
50=back
51
7f52eac8 52=head1 SYNOPSIS
53
54 use Text::Tradition;
55 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
56 my $t = Text::Tradition->new(
57 'name' => 'this is a text',
58 'input' => 'TEI',
59 'file' => '/path/to/tei_parallel_seg_file.xml' );
60 $t->add_stemma( 'dotfile' => $stemmafile );
61
62 my $variant_data = run_analysis( $tradition );
7f52eac8 63
7f52eac8 64=head1 SUBROUTINES
65
88a6bac5 66=head2 run_analysis( $tradition, %opts )
7f52eac8 67
88a6bac5 68Runs the analysis described in analyze_variant_location on every location in the
69collation of the given tradition, with the given options. These include:
7f52eac8 70
88a6bac5 71=over 4
72
73=item * stemma_id - Specify which of the tradition's stemmata to use. Default
74is 0 (i.e. the first).
75
76=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
77
78=item * merge_types - Specify a list of relationship types, where related readings
79should be treated as identical for the purposes of analysis.
80
ffa22d6f 81=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
82
88a6bac5 83=back
7f52eac8 84
85=begin testing
86
87use Text::Tradition;
88use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
89
90my $datafile = 't/data/florilegium_tei_ps.xml';
91my $tradition = Text::Tradition->new( 'input' => 'TEI',
92 'name' => 'test0',
93 'file' => $datafile );
94my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
95is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
96
f00cefe8 97my %expected_genealogical = (
a44aaf2a 98 1 => 0,
f00cefe8 99 2 => 1,
a44aaf2a 100 3 => 0,
101 5 => 0,
102 7 => 0,
103 8 => 0,
104 10 => 0,
f00cefe8 105 13 => 1,
a44aaf2a 106 33 => 0,
107 34 => 0,
108 37 => 0,
109 60 => 0,
f00cefe8 110 81 => 1,
a44aaf2a 111 84 => 0,
112 87 => 0,
113 101 => 0,
114 102 => 0,
f00cefe8 115 122 => 1,
a44aaf2a 116 157 => 0,
f00cefe8 117 166 => 1,
118 169 => 1,
a44aaf2a 119 200 => 0,
f00cefe8 120 216 => 1,
121 217 => 1,
122 219 => 1,
123 241 => 1,
124 242 => 1,
125 243 => 1,
126);
127
7e17346f 128my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 129my $c = $tradition->collation;
f00cefe8 130foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 131 # Account for rows that used to be "not useful"
132 unless( exists $expected_genealogical{$row->{'id'}} ) {
133 $expected_genealogical{$row->{'id'}} = 1;
134 }
18f48b82 135 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
136 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 137 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 138 # Check that we have the right row with the right groups
139 my $rank = $row->{'id'};
140 foreach my $rdghash ( @{$row->{'readings'}} ) {
141 # Skip 'readings' that aren't really
142 next unless $c->reading( $rdghash->{'readingid'} );
143 # Check the rank
144 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
145 "Got correct reading rank" );
146 # Check the witnesses
147 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
148 my @sgrp = sort @{$rdghash->{'group'}};
149 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
150 }
f00cefe8 151}
a44aaf2a 152is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 153# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 154
155=end testing
156
157=cut
158
d71100ed 159sub run_analysis {
88a6bac5 160 my( $tradition, %opts ) = @_;
f00cefe8 161 my $c = $tradition->collation;
7e17346f 162 my $aclabel = $c->ac_label;
88a6bac5 163
164 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 165 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 166 my $collapse = Set::Scalar->new();
167 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
168 $collapse->insert( @{$opts{'merge_types'}} );
169 } elsif( $opts{'merge_types'} ) {
170 $collapse->insert( $opts{'merge_types'} );
171 }
7e17346f 172
173 # Make sure we have a lookup DB for graph calculation results; this will die
174 # if calcdir or calcdsn isn't passed.
03c0a7d7 175 my $dir;
176 if( exists $opts{'calcdir'} ) {
177 $dir = delete $opts{'calcdir'}
178 } elsif ( exists $opts{'calcdsn'} ) {
8943ff68 179 eval { require Text::Tradition::Directory };
180 if( $@ ) {
181 throw( "Could not instantiate a directory for " . $opts{'calcdsn'}
182 . ": $@" );
183 }
03c0a7d7 184 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
185 }
88a6bac5 186
187 # Get the stemma
188 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 189
4ce27d42 190 # Figure out which witnesses we are working with - that is, the ones that
191 # appear both in the stemma and in the tradition. All others are 'lacunose'
192 # for our purposes.
7e17346f 193 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
194 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
195 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
196 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 197
198 # Find and mark 'common' ranks for exclusion, unless they were
199 # explicitly specified.
200 unless( @ranks ) {
201 my %common_rank;
a44aaf2a 202 foreach my $rdg ( $c->common_readings ) {
88a6bac5 203 $common_rank{$rdg->rank} = 1;
204 }
205 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 206 }
7f52eac8 207
88a6bac5 208 # Group the variants to send to the solver
209 my @groups;
f629cb3b 210 my @use_ranks;
a44aaf2a 211 my %lacunae;
94654e27 212 my $moved = {};
88a6bac5 213 foreach my $rank ( @ranks ) {
7e17346f 214 my $missing = $lacunose->clone();
e23225e7 215 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 216 # Filter out any empty rankgroups
217 # (e.g. from the later rank for a transposition)
218 next unless keys %$rankgroup;
7e17346f 219 # Get the graph for this rankgroup
220 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 221 if( $opts{'exclude_type1'} ) {
222 # Check to see whether this is a "useful" group.
7e17346f 223 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 224 }
f629cb3b 225 push( @use_ranks, $rank );
7e17346f 226 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 227 $lacunae{$rank} = $missing;
d71100ed 228 }
4ce27d42 229 # Run the solver
b7bd7aa5 230 my $answer;
231 try {
232 $answer = solve_variants( $dir, @groups );
233 } catch ( Text::Tradition::Error $e ) {
234 if( $e->message =~ /IDP/ ) {
235 # Something is wrong with the solver; make the variants table anyway
236 $answer->{'variants'} = [];
237 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
238 @groups;
239 } else {
240 # Something else is wrong; error out.
241 $e->throw;
242 }
243 }
fae07016 244
88a6bac5 245 # Do further analysis on the answer
a44aaf2a 246 my $conflict_count = 0;
7e17346f 247 my $reversion_count = 0;
f629cb3b 248 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 249 my $location = $answer->{'variants'}->[$idx];
250 # Add the rank back in
94654e27 251 my $rank = $use_ranks[$idx];
252 $location->{'id'} = $rank;
7234b01d 253 # Note what our lacunae are
f629cb3b 254 my %lmiss;
7234b01d 255 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 256 $location->{'missing'} = [ keys %lmiss ];
257
88a6bac5 258 # Run the extra analysis we need.
428bcf0b 259 ## TODO We run through all the variants in this call, so
260 ## why not add the reading data there instead of here below?
7e17346f 261 my $graph = $groups[$idx]->{graph};
262 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 263
638e2a95 264 my @layerwits;
7234b01d 265 # Do the final post-analysis tidying up of the data.
a44aaf2a 266 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 267 $conflict_count++ if $rdghash->{'is_conflict'};
268 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 269 # Add the reading text back in, setting display value as needed
a44aaf2a 270 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 271 if( $rdg ) {
272 $rdghash->{'text'} = $rdg->text .
273 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
a445ce40 274 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
275 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
276 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
277 }
94654e27 278 }
f629cb3b 279 # Remove lacunose witnesses from this reading's list now that the
7234b01d 280 # analysis is done
f629cb3b 281 my @realgroup;
7234b01d 282 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 283 $rdghash->{'group'} = \@realgroup;
638e2a95 284 # Note any layered witnesses that appear in this group
285 foreach( @realgroup ) {
286 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
287 push( @layerwits, $1 );
288 }
289 }
a44aaf2a 290 }
638e2a95 291 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 292 }
a44aaf2a 293 $answer->{'conflict_count'} = $conflict_count;
7e17346f 294 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 295
88a6bac5 296 return $answer;
d71100ed 297}
298
e23225e7 299=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 300
301Groups the variants at the given $rank of the collation, treating any
e23225e7 302relationships in the set $merge_relationship_types as equivalent.
303$lacunose should be a reference to an array, to which the sigla of lacunose
304witnesses at this rank will be appended; $transposed should be a reference
305to a hash, wherein the identities of transposed readings and their
306relatives will be stored.
7f52eac8 307
ffa22d6f 308Returns a hash $group_readings where $rdg is attested by the witnesses listed
309in $group_readings->{$rdg}.
7f52eac8 310
311=cut
312
313# Return group_readings, groups, lacunose
d1348d38 314sub group_variants {
94654e27 315 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 316 my $c = $tradition->collation;
335a62ef 317 my $aclabel = $c->ac_label;
d120c995 318 my $table = $c->alignment_table;
7f52eac8 319 # Get the alignment table readings
320 my %readings_at_rank;
7e17346f 321 my $check_for_gaps = Set::Scalar->new();
94654e27 322 my %moved_wits;
d120c995 323 my $has_transposition;
324 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 325 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 326 my $wit = $tablewit->{'witness'};
ffa22d6f 327 # Exclude the witness if it is "lacunose" which if we got here
328 # means "not in the stemma".
7e17346f 329 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 330 # Note if the witness is actually in a lacuna
7f52eac8 331 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 332 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 333 # Otherwise the witness either has a positive reading...
7f52eac8 334 } elsif( $rdg ) {
94654e27 335 # If the reading has been counted elsewhere as a transposition, ignore it.
336 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 337 # TODO Does this cope with three-way transpositions?
94654e27 338 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
339 next;
340 }
341 # Otherwise, record it...
342 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
343 # ...and grab any transpositions, and their relations.
ee26c4d9 344 my @transp = grep { $_->rank != $rank } _all_related( $rdg->{'t'} );
94654e27 345 foreach my $trdg ( @transp ) {
d120c995 346 next if exists $readings_at_rank{$trdg->id};
347 $has_transposition = 1;
348 my @affected_wits = _table_witnesses(
7e17346f 349 $table, $trdg, $lacunose, $aclabel );
d120c995 350 next unless @affected_wits;
351 map { $moved_wits{$_} = 1 } @affected_wits;
352 $transposed->{$trdg->id} =
7e17346f 353 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 354 $readings_at_rank{$trdg->id} = $trdg;
355 }
356 # ...or it is empty, ergo a gap.
7f52eac8 357 } else {
7e17346f 358 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 359 }
360 }
7e17346f 361 my $gap_wits = Set::Scalar->new();
362 map { _add_to_witlist( $_, $gap_wits, $aclabel )
363 unless $moved_wits{$_} } $check_for_gaps->members;
364
365 # Group the readings, collapsing groups by relationship if needed.
d120c995 366 my $grouped_readings = {};
4ce27d42 367 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 368 # Skip readings that have been collapsed into others.
d120c995 369 next if exists $grouped_readings->{$rdg->id}
370 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 371 # Get the witness list, including from readings collapsed into this one.
7e17346f 372 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
e23225e7 373 if( $collapse && $collapse->size ) {
374 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 375 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 376 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 377 push( @wits, @otherwits );
d120c995 378 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 379 }
380 }
7e17346f 381 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 382 }
7e17346f 383 if( $gap_wits->members ) {
384 $grouped_readings->{'(omitted)'} = $gap_wits;
385 }
386
7f52eac8 387 # Get rid of our collapsed readings
7e17346f 388 map { delete $grouped_readings->{$_} if(
389 $grouped_readings->{$_} eq 'COLLAPSE'
390 || $grouped_readings->{$_}->is_empty ) }
391 keys %$grouped_readings;
d120c995 392
393 # If something was transposed, check the groups for doubled-up readings
394 if( $has_transposition ) {
62a39b8f 395 # print STDERR "Group for rank $rank:\n";
396 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
397 # keys %$grouped_readings;
d120c995 398 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
399 }
7f52eac8 400
4ce27d42 401 # Return the result
d120c995 402 return $grouped_readings;
403}
404
ee26c4d9 405sub _all_related {
406 my $rdg = shift;
407 my $c = $rdg->collation;
408 my @all = map { $c->reading( $_ ) } $c->relations->graph->all_neighbors( $rdg );
409 return @all;
410}
411
412
d120c995 413# Helper function to query the alignment table for all witnesses (a.c. included)
414# that have a given reading at its rank.
415sub _table_witnesses {
416 my( $table, $trdg, $lacunose, $aclabel ) = @_;
417 my $tableidx = $trdg->rank - 1;
7e17346f 418 my $has_reading = Set::Scalar->new();
d120c995 419 foreach my $row ( @{$table->{'alignment'}} ) {
420 my $wit = $row->{'witness'};
7e17346f 421 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 422 my $rdg = $row->{'tokens'}->[$tableidx];
423 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 424 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 425 if $rdg->{'t'}->id eq $trdg->id;
426 }
7e17346f 427 return $has_reading->members;
428}
429
430# Helper function to see if a witness is lacunose even if we are asking about
431# the a.c. version
432sub _is_lacunose {
433 my ( $wit, $lac, $acstr ) = @_;
434 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
435 $wit = $1;
436 }
437 return $lac->has( $wit );
d1348d38 438}
439
335a62ef 440# Helper function to ensure that X and X a.c. never appear in the same list.
441sub _add_to_witlist {
442 my( $wit, $list, $acstr ) = @_;
335a62ef 443 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 444 # Don't add X a.c. if we already have X
445 return if $list->has( $1 );
335a62ef 446 } else {
7e17346f 447 # Delete X a.c. if we are about to add X
448 $list->delete( $wit.$acstr );
335a62ef 449 }
7e17346f 450 $list->insert( $wit );
335a62ef 451}
452
d120c995 453sub _check_transposed_consistency {
454 my( $c, $rank, $transposed, $groupings ) = @_;
455 my %seen_wits;
456 my %thisrank;
457 # Note which readings are actually at this rank, and which witnesses
458 # belong to which reading.
459 foreach my $rdg ( keys %$groupings ) {
460 my $rdgobj = $c->reading( $rdg );
461 # Count '(omitted)' as a reading at this rank
462 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
463 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
464 }
465 # Our work is done if we have no witness belonging to more than one
466 # reading.
467 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
468 return unless @doubled;
469 # If we have a symmetric related transposition, drop the non-rank readings.
470 if( @doubled == scalar keys %seen_wits ) {
471 foreach my $rdg ( keys %$groupings ) {
472 if( !$thisrank{$rdg} ) {
473 my $groupstr = wit_stringify( $groupings->{$rdg} );
474 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
475 keys %thisrank;
476 delete $groupings->{$rdg};
477 # If we found a group match, assume there is a symmetry happening.
478 # TODO think more about this
62a39b8f 479 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 480 unless( $matched ) {
481 delete $transposed->{$rdg};
482 warn "Found problem in evident symmetry with reading $rdg";
483 }
484 }
485 }
486 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
487 } else {
488 foreach my $dup ( @doubled ) {
489 foreach my $rdg ( @{$seen_wits{$dup}} ) {
490 next if $thisrank{$rdg};
491 next unless exists $groupings->{$rdg};
62a39b8f 492 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 493 delete $groupings->{$rdg};
494 delete $transposed->{$rdg};
495 }
496 }
497 # and put any now-orphaned readings into an 'omitted' reading.
498 foreach my $wit ( keys %seen_wits ) {
499 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 500 $groupings->{'(omitted)'} = Set::Scalar->new()
501 unless exists $groupings->{'(omitted)'};
d120c995 502 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
503 }
504 }
505 }
506}
507
7e17346f 508# For the given grouping, return its situation graph based on the stemma.
509sub _graph_for_grouping {
510 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
511 my $acwits = [];
512 my $extant = {};
513 foreach my $gs ( values %$grouping ) {
514 map {
515 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
516 push( @$acwits, $1 ) unless $lacunose->has( $1 );
517 } else {
518 $extant->{$_} = 1 unless $lacunose->has( $_ );
519 }
520 } $gs->members;
521 }
522 my $graph;
523 try {
524 # contig contains all extant wits and all hypothetical wits
525 # needed to make up the groups.
ace5fce5 526 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
7e17346f 527 } catch ( Text::Tradition::Error $e ) {
ee26c4d9 528 $DB::single = 1;
03c0a7d7 529 throw( "Could not extend graph with given extant and a.c. witnesses: "
530 . $e->message );
7e17346f 531 } catch {
03c0a7d7 532 throw( "Could not extend graph with a.c. witnesses @$acwits" );
7e17346f 533 }
534 return $graph;
535}
88a6bac5 536
7e17346f 537=head2 solve_variants( $calcdir, @groups )
538
539Looks up the set of groups in the answers provided by the external graph solver
540service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 541
88a6bac5 542The answer has the form
543 { "variants" => [ array of variant location structures ],
544 "variant_count" => total,
545 "conflict_count" => number of conflicts detected,
546 "genealogical_count" => number of solutions found }
547
548=cut
549
550sub solve_variants {
03c0a7d7 551 my( @groups ) = @_;
552
a745c3d9 553 # Are we using a local result directory, or did we pass an empty value
554 # for the directory?
03c0a7d7 555 my $dir;
a745c3d9 556 unless( ref( $groups[0] ) eq 'HASH' ) {
03c0a7d7 557 $dir = shift @groups;
558 }
335a62ef 559
03c0a7d7 560 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
561 ## object so that we can send it off for IDP lookup.
7e17346f 562 my $variants = [];
563 my $genealogical = 0; # counter
03c0a7d7 564 # TODO Optimize for unique graph problems
554e2e7d 565 my %problems;
7e17346f 566 foreach my $graphproblem ( @groups ) {
03c0a7d7 567 # Construct the calc result key and look up its answer
568 my $problem = Text::Tradition::Analysis::Result->new(
569 graph => $graphproblem->{'graph'},
570 setlist => [ values %{$graphproblem->{'grouping'}} ] );
554e2e7d 571 if( exists $problems{$problem->object_key} ) {
572 $problem = $problems{$problem->object_key};
573 } else {
574 $problems{$problem->object_key} = $problem;
575 }
576 $graphproblem->{'object'} = $problem;
03c0a7d7 577 }
578
554e2e7d 579 my %results;
03c0a7d7 580 if( $dir ) {
581 my $scope = $dir->new_scope;
554e2e7d 582 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
583 } else {
584 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
585 [ values %problems ] );
03c0a7d7 586 # Send it off and get the result
587 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
588 my $ua = LWP::UserAgent->new();
589 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
590 'Content' => $json );
a745c3d9 591 my $answer;
03c0a7d7 592 if( $resp->is_success ) {
a745c3d9 593 $answer = decode_json( $resp->content );
03c0a7d7 594 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
595 } else {
596 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
597 . "; cannot run graph analysis" );
598 }
599 # One more sanity check
600 throw( "Something went wrong with answer symmetricity" )
554e2e7d 601 unless keys( %problems ) == @$answer;
03c0a7d7 602 # Convert the results
554e2e7d 603 foreach my $a ( @$answer ) {
604 my $r = Text::Tradition::Analysis::Result->new( $a );
605 $results{$r->object_key} = $r;
606 }
03c0a7d7 607 }
608
609 # We now have a single JSON-encoded Result object per problem sent. Fold its
610 # answers into our variant info structure.
554e2e7d 611 foreach my $graphproblem ( @groups ) {
612 my $result = $results{$graphproblem->{'object'}->object_key}
613 || $graphproblem->{'object'};
03c0a7d7 614
7e17346f 615 # Initialize the result structure for this graph problem
b7bd7aa5 616 my $vstruct;
617 if( $result->status eq 'OK' ) {
618 $vstruct = { readings => [] };
619 push( @$variants, $vstruct );
620 } else {
621 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
7e17346f 622 next;
5c44c598 623 }
b7bd7aa5 624
7e17346f 625 # 1. Did the group evaluate as genealogical?
03c0a7d7 626 $vstruct->{genealogical} = $result->is_genealogical;
627 $genealogical++ if $result->is_genealogical;
7e17346f 628
629 # 2. What are the calculated minimum groupings for each variant loc?
630 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
631 my $inputset = $graphproblem->{grouping}->{$rid};
03c0a7d7 632 my $minset = $result->minimum_grouping_for( $inputset );
7e17346f 633 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 634 }
7e17346f 635
636 # 3. What are the sources and classes calculated for each witness?
03c0a7d7 637 $vstruct->{witcopy_types} = { $result->classes };
7e17346f 638 $vstruct->{reading_roots} = {};
03c0a7d7 639 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
7e17346f 640
88a6bac5 641 }
642
643 return { 'variants' => $variants,
644 'variant_count' => scalar @$variants,
645 'genealogical_count' => $genealogical };
646}
647
b7bd7aa5 648sub _init_unsolved {
649 my( $graphproblem, $status ) = @_;
650 my $vstruct = { 'readings' => [] };
651 $vstruct->{'unsolved'} = $status;
652 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
653 push( @{$vstruct->{readings}}, { readingid => $rid,
654 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
655 }
656 return $vstruct;
657}
658
fae07016 659=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 660
fae07016 661Given the tradition, its stemma graph, and the solution from the graph solver,
662work out the rest of the information we want. For each reading we need missing,
5c44c598 663conflict, reading_parents, independent_occurrence, followed, not_followed,
664and follow_unknown. Alters the location_hash in place.
7f52eac8 665
666=cut
732152b1 667
fae07016 668sub analyze_location {
7e17346f 669 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 670 my $c = $tradition->collation;
fae07016 671
03c0a7d7 672 if( exists $variant_row->{'unsolved'} ) {
673 return;
674 }
675 my $reading_roots = delete $variant_row->{'reading_roots'};
676 my $classinfo = delete $variant_row->{'witcopy_types'};
677
fae07016 678 # Make a hash of all known node memberships, and make the subgraphs.
679 my $contig = {};
fae07016 680 my $subgraph = {};
638e2a95 681 my $acstr = $c->ac_label;
682 my @acwits;
5c44c598 683
5c44c598 684 # Note which witnesses positively belong to which group. This information
685 # comes ultimately from the IDP solver.
686 # Also make a note of the reading's roots.
fae07016 687 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
688 my $rid = $rdghash->{'readingid'};
5c44c598 689 my @roots;
638e2a95 690 foreach my $wit ( @{$rdghash->{'group'}} ) {
691 $contig->{$wit} = $rid;
692 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
693 push( @acwits, $1 );
694 }
5c44c598 695 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
696 push( @roots, $wit );
697 }
638e2a95 698 }
5c44c598 699 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 700 }
7e17346f 701
fae07016 702 # Now that we have all the node group memberships, calculate followed/
bebec0e9 703 # non-followed/unknown values for each reading. Also figure out the
704 # reading's evident parent(s).
705 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 706 my $rid = $rdghash->{'readingid'};
428bcf0b 707 my $rdg = $c->reading( $rid );
5c44c598 708 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 709 my @reversions;
710 if( $classinfo ) {
711 @reversions = grep { $classinfo->{$_} eq 'revert' }
712 $rdghash->{'group'}->members;
be590045 713 $rdghash->{'reversions'} = \@reversions;
7e17346f 714 }
5c44c598 715 my @group = @{$rdghash->{'group'}};
fae07016 716
717 # Start figuring things out.
7e17346f 718 $rdghash->{'followed'} = scalar( @group )
719 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 720 # Find the parent readings, if any, of this reading.
7e17346f 721 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 722 # Work out relationships between readings and their non-followed parent.
723 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
be590045 724 $rdghash->{'source_parents'} = $sourceparents;
e23225e7 725
726 if( @reversions ) {
727 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
728 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
729 $rdghash->{'reversion_parents'} = $revparents;
730 }
bebec0e9 731
732 # Find the number of times this reading was altered, and the number of
733 # times we're not sure.
734 my( %nofollow, %unknownfollow );
5c44c598 735 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 736 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 737 if( $reading_roots->{$wchild} && $contig->{$wchild}
738 && $contig->{$wchild} ne $rid ) {
bebec0e9 739 # It definitely changed here.
740 $nofollow{$wchild} = 1;
741 } elsif( !($contig->{$wchild}) ) {
742 # The child is a hypothetical node not definitely in
743 # any group. Answer is unknown.
744 $unknownfollow{$wchild} = 1;
5c44c598 745 } # else it is either in our group, or it is a non-root node in a
746 # known group and therefore is presumed to have its reading from
747 # its group, not this link.
bebec0e9 748 }
749 }
750 $rdghash->{'not_followed'} = keys %nofollow;
751 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 752
753 # Now say whether this reading represents a conflict.
754 unless( $variant_row->{'genealogical'} ) {
7e17346f 755 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 756 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 757 }
758 }
759}
760
761sub _find_reading_parents {
762 my( $rid, $graph, $contig, @list ) = @_;
763 my $parenthash = {};
764 foreach my $wit ( @list ) {
765 # Look in the stemma graph to find this witness's extant or known-reading
766 # immediate ancestor(s), and look up the reading that each ancestor holds.
767 my @check = $graph->predecessors( $wit );
768 while( @check ) {
769 my @next;
770 foreach my $wparent( @check ) {
771 my $preading = $contig->{$wparent};
772 if( $preading && $preading ne $rid ) {
773 $parenthash->{$preading} = 1;
774 } else {
775 push( @next, $graph->predecessors( $wparent ) );
776 }
777 }
778 @check = @next;
779 }
780 }
781 return $parenthash;
782}
783
784sub _resolve_parent_relationships {
785 my( $c, $rid, $rdg, $rdgparents ) = @_;
786 foreach my $p ( keys %$rdgparents ) {
787 # Resolve the relationship of the parent to the reading, and
788 # save it in our hash.
789 my $pobj = $c->reading( $p );
790 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
791 my $phash = { 'label' => $prep };
792 if( $pobj ) {
793 my $rel = $c->get_relationship( $p, $rid );
794 if( $rel ) {
795 _add_to_hash( $rel, $phash );
796 } elsif( $rdg ) {
797 # First check for a transposed relationship
798 if( $rdg->rank != $pobj->rank ) {
799 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
800 next unless $ti->text eq $rdg->text;
801 $rel = $c->get_relationship( $ti, $pobj );
802 if( $rel ) {
803 _add_to_hash( $rel, $phash, 1 );
804 last;
805 }
806 }
807 unless( $rel ) {
808 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
809 next unless $ti->text eq $pobj->text;
810 $rel = $c->get_relationship( $ti, $rdg );
811 if( $rel ) {
812 _add_to_hash( $rel, $phash, 1 );
813 last;
814 }
815 }
5c44c598 816 }
817 }
7e17346f 818 unless( $rel ) {
819 # and then check for sheer word similarity.
820 my $rtext = $rdg->text;
821 my $ptext = $pobj->text;
822 if( similar( $rtext, $ptext ) ) {
823 # say STDERR "Words $rtext and $ptext judged similar";
824 $phash->{relation} = { type => 'wordsimilar' };
825 }
826 }
5c44c598 827 } else {
7e17346f 828 $phash->{relation} = { type => 'deletion' };
5c44c598 829 }
7e17346f 830 # Get the attributes of the parent object while we are here
831 $phash->{'text'} = $pobj->text if $pobj;
a445ce40 832 if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
833 $phash->{'is_nonsense'} = $pobj->is_nonsense;
834 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
835 }
7e17346f 836 } elsif( $p eq '(omitted)' ) {
ee26c4d9 837 # Check to see if the reading in question is a repetition.
838 my @reps = $rdg->related_readings( 'repetition' );
839 if( @reps ) {
840 $phash->{relation} = { type => 'repetition',
841 annotation => "of reading @reps" };
842 } else {
843 $phash->{relation} = { type => 'addition' };
844 }
7e17346f 845 }
846 # Save it
847 $rdgparents->{$p} = $phash;
848 }
d71100ed 849}
850
428bcf0b 851sub _add_to_hash {
852 my( $rel, $phash, $is_transposed ) = @_;
853 $phash->{relation} = { type => $rel->type };
854 $phash->{relation}->{transposed} = 1 if $is_transposed;
855 $phash->{relation}->{annotation} = $rel->annotation
856 if $rel->has_annotation;
857}
858
859=head2 similar( $word1, $word2 )
860
861Use Algorithm::Diff to get a sense of how close the words are to each other.
862This will hopefully handle substitutions a bit more nicely than Levenshtein.
863
864=cut
865
866#!/usr/bin/env perl
867
868sub similar {
869 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
870 my @let1 = split( '', lc( $word1 ) );
871 my @let2 = split( '', lc( $word2 ) );
872 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
873 my $mag = 0;
874 while( $diff->Next ) {
875 if( $diff->Same ) {
876 # Take off points for longer strings
877 my $cs = $diff->Range(1) - 2;
878 $cs = 0 if $cs < 0;
879 $mag -= $cs;
880 } elsif( !$diff->Items(1) ) {
881 $mag += $diff->Range(2);
882 } elsif( !$diff->Items(2) ) {
883 $mag += $diff->Range(1);
884 } else {
885 # Split the difference for substitutions
886 my $c1 = $diff->Range(1) || 1;
887 my $c2 = $diff->Range(2) || 1;
888 my $cd = ( $c1 + $c2 ) / 2;
889 $mag += $cd;
890 }
891 }
892 return ( $mag <= length( $word1 ) / 2 );
893}
894
5be0cdeb 895sub _useful_variant {
7e17346f 896 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 897
898 # Sort by group size and return
899 my $is_useful = 0;
7e17346f 900 foreach my $rdg ( keys %$rankgroup ) {
901 my @wits = $rankgroup->{$rdg}->members;
902 if( @wits > 1 ) {
5be0cdeb 903 $is_useful++;
904 } else {
7e17346f 905 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
906 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 907 }
908 }
7e17346f 909 return $is_useful > 1;
5be0cdeb 910}
911
7f52eac8 912=head2 wit_stringify( $groups )
913
914Takes an array of witness groupings and produces a string like
915['A','B'] / ['C','D','E'] / ['F']
d71100ed 916
7f52eac8 917=cut
d71100ed 918
919sub wit_stringify {
920 my $groups = shift;
921 my @gst;
922 # If we were passed an array of witnesses instead of an array of
923 # groupings, then "group" the witnesses first.
924 unless( ref( $groups->[0] ) ) {
925 my $mkgrp = [ $groups ];
926 $groups = $mkgrp;
927 }
928 foreach my $g ( @$groups ) {
929 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
930 }
931 return join( ' / ', @gst );
932}
7f52eac8 933
7f52eac8 9341;
935
a745c3d9 936sub throw {
937 Text::Tradition::Error->throw(
938 'ident' => 'Analysis error',
939 'message' => $_[0],
940 );
941}
942
7f52eac8 943=head1 LICENSE
944
945This package is free software and is provided "as is" without express
946or implied warranty. You can redistribute it and/or modify it under
947the same terms as Perl itself.
948
949=head1 AUTHOR
950
951Tara L Andrews E<lt>aurum@cpan.orgE<gt>