split off stemma analysis modules from base Tradition layer
[scpubgit/stemmatology.git] / analysis / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
03c0a7d7 6use Encode qw/ decode_utf8 encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
a745c3d9 9use JSON qw/ to_json decode_json /;
03c0a7d7 10use LWP::UserAgent;
7e17346f 11use Set::Scalar;
7e17346f 12use Text::Tradition::Analysis::Result;
13use Text::Tradition::Directory;
d71100ed 14use Text::Tradition::Stemma;
d120c995 15use TryCatch;
d71100ed 16
951ddfe8 17use vars qw/ @EXPORT_OK $VERSION /;
a2cf85dd 18@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
951ddfe8 19$VERSION = "1.0";
20
d1348d38 21
03c0a7d7 22my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
7e17346f 23my $unsolved_problems = {};
5c44c598 24
7f52eac8 25=head1 NAME
26
27Text::Tradition::Analysis - functions for stemma analysis of a tradition
28
29=head1 SYNOPSIS
30
31 use Text::Tradition;
32 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
33 my $t = Text::Tradition->new(
34 'name' => 'this is a text',
35 'input' => 'TEI',
36 'file' => '/path/to/tei_parallel_seg_file.xml' );
951ddfe8 37 $t->enable_stemmata;
7f52eac8 38 $t->add_stemma( 'dotfile' => $stemmafile );
39
40 my $variant_data = run_analysis( $tradition );
41 # Recalculate rank $n treating all orthographic variants as equivalent
42 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
43
44=head1 DESCRIPTION
45
46Text::Tradition is a library for representation and analysis of collated
951ddfe8 47texts, particularly medieval ones. Where the Collation is the central feature of
48a Tradition, it may also have one or more temmata associated with it, and these stemmata may be analyzed. This package provides the following modules:
49
50=over 4
51
52=item * L<Text::Tradition::HasStemma> - a role that can be composed into Text::Tradition objects, providing the ability for Text::Tradition::Stemma objects to be associated with them.
53
54=item * L<Text::Tradition::Stemma> - an object class that represents stemma hypotheses, both rooted (with a single archetype) and unrooted (e.g. phylogenetic trees).
55
56=item * Text::Tradition::Analysis (this package). Provides functions for the analysis of a given stemma against the collation within a given Tradition.
57
58=back
7f52eac8 59
60=head1 SUBROUTINES
61
88a6bac5 62=head2 run_analysis( $tradition, %opts )
7f52eac8 63
88a6bac5 64Runs the analysis described in analyze_variant_location on every location in the
65collation of the given tradition, with the given options. These include:
7f52eac8 66
88a6bac5 67=over 4
68
69=item * stemma_id - Specify which of the tradition's stemmata to use. Default
70is 0 (i.e. the first).
71
72=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
73
74=item * merge_types - Specify a list of relationship types, where related readings
75should be treated as identical for the purposes of analysis.
76
ffa22d6f 77=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
78
88a6bac5 79=back
7f52eac8 80
81=begin testing
82
83use Text::Tradition;
84use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
85
86my $datafile = 't/data/florilegium_tei_ps.xml';
87my $tradition = Text::Tradition->new( 'input' => 'TEI',
88 'name' => 'test0',
89 'file' => $datafile );
951ddfe8 90$tradition->enable_stemmata;
7f52eac8 91my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
92is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
93
f00cefe8 94my %expected_genealogical = (
a44aaf2a 95 1 => 0,
f00cefe8 96 2 => 1,
a44aaf2a 97 3 => 0,
98 5 => 0,
99 7 => 0,
100 8 => 0,
101 10 => 0,
f00cefe8 102 13 => 1,
a44aaf2a 103 33 => 0,
104 34 => 0,
105 37 => 0,
106 60 => 0,
f00cefe8 107 81 => 1,
a44aaf2a 108 84 => 0,
109 87 => 0,
110 101 => 0,
111 102 => 0,
f00cefe8 112 122 => 1,
a44aaf2a 113 157 => 0,
f00cefe8 114 166 => 1,
115 169 => 1,
a44aaf2a 116 200 => 0,
f00cefe8 117 216 => 1,
118 217 => 1,
119 219 => 1,
120 241 => 1,
121 242 => 1,
122 243 => 1,
123);
124
7e17346f 125my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 126my $c = $tradition->collation;
f00cefe8 127foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 128 # Account for rows that used to be "not useful"
129 unless( exists $expected_genealogical{$row->{'id'}} ) {
130 $expected_genealogical{$row->{'id'}} = 1;
131 }
18f48b82 132 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
133 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 134 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 135 # Check that we have the right row with the right groups
136 my $rank = $row->{'id'};
137 foreach my $rdghash ( @{$row->{'readings'}} ) {
138 # Skip 'readings' that aren't really
139 next unless $c->reading( $rdghash->{'readingid'} );
140 # Check the rank
141 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
142 "Got correct reading rank" );
143 # Check the witnesses
144 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
145 my @sgrp = sort @{$rdghash->{'group'}};
146 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
147 }
f00cefe8 148}
a44aaf2a 149is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 150# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 151
152=end testing
153
154=cut
155
d71100ed 156sub run_analysis {
88a6bac5 157 my( $tradition, %opts ) = @_;
f00cefe8 158 my $c = $tradition->collation;
7e17346f 159 my $aclabel = $c->ac_label;
88a6bac5 160
161 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 162 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 163 my $collapse = Set::Scalar->new();
164 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
165 $collapse->insert( @{$opts{'merge_types'}} );
166 } elsif( $opts{'merge_types'} ) {
167 $collapse->insert( $opts{'merge_types'} );
168 }
7e17346f 169
170 # Make sure we have a lookup DB for graph calculation results; this will die
171 # if calcdir or calcdsn isn't passed.
03c0a7d7 172 my $dir;
173 if( exists $opts{'calcdir'} ) {
174 $dir = delete $opts{'calcdir'}
175 } elsif ( exists $opts{'calcdsn'} ) {
176 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
177 }
88a6bac5 178
179 # Get the stemma
180 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 181
4ce27d42 182 # Figure out which witnesses we are working with - that is, the ones that
183 # appear both in the stemma and in the tradition. All others are 'lacunose'
184 # for our purposes.
7e17346f 185 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
186 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
187 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
188 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 189
190 # Find and mark 'common' ranks for exclusion, unless they were
191 # explicitly specified.
192 unless( @ranks ) {
193 my %common_rank;
a44aaf2a 194 foreach my $rdg ( $c->common_readings ) {
88a6bac5 195 $common_rank{$rdg->rank} = 1;
196 }
197 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 198 }
7f52eac8 199
88a6bac5 200 # Group the variants to send to the solver
201 my @groups;
f629cb3b 202 my @use_ranks;
a44aaf2a 203 my %lacunae;
94654e27 204 my $moved = {};
88a6bac5 205 foreach my $rank ( @ranks ) {
7e17346f 206 my $missing = $lacunose->clone();
e23225e7 207 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 208 # Filter out any empty rankgroups
209 # (e.g. from the later rank for a transposition)
210 next unless keys %$rankgroup;
7e17346f 211 # Get the graph for this rankgroup
212 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 213 if( $opts{'exclude_type1'} ) {
214 # Check to see whether this is a "useful" group.
7e17346f 215 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 216 }
f629cb3b 217 push( @use_ranks, $rank );
7e17346f 218 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 219 $lacunae{$rank} = $missing;
d71100ed 220 }
4ce27d42 221 # Run the solver
b7bd7aa5 222 my $answer;
223 try {
224 $answer = solve_variants( $dir, @groups );
225 } catch ( Text::Tradition::Error $e ) {
226 if( $e->message =~ /IDP/ ) {
227 # Something is wrong with the solver; make the variants table anyway
228 $answer->{'variants'} = [];
229 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
230 @groups;
231 } else {
232 # Something else is wrong; error out.
233 $e->throw;
234 }
235 }
fae07016 236
88a6bac5 237 # Do further analysis on the answer
a44aaf2a 238 my $conflict_count = 0;
7e17346f 239 my $reversion_count = 0;
f629cb3b 240 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 241 my $location = $answer->{'variants'}->[$idx];
242 # Add the rank back in
94654e27 243 my $rank = $use_ranks[$idx];
244 $location->{'id'} = $rank;
7234b01d 245 # Note what our lacunae are
f629cb3b 246 my %lmiss;
7234b01d 247 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 248 $location->{'missing'} = [ keys %lmiss ];
249
88a6bac5 250 # Run the extra analysis we need.
428bcf0b 251 ## TODO We run through all the variants in this call, so
252 ## why not add the reading data there instead of here below?
7e17346f 253 my $graph = $groups[$idx]->{graph};
254 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 255
638e2a95 256 my @layerwits;
7234b01d 257 # Do the final post-analysis tidying up of the data.
a44aaf2a 258 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 259 $conflict_count++ if $rdghash->{'is_conflict'};
260 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 261 # Add the reading text back in, setting display value as needed
a44aaf2a 262 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 263 if( $rdg ) {
264 $rdghash->{'text'} = $rdg->text .
265 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 266 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
267 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 268 }
f629cb3b 269 # Remove lacunose witnesses from this reading's list now that the
7234b01d 270 # analysis is done
f629cb3b 271 my @realgroup;
7234b01d 272 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 273 $rdghash->{'group'} = \@realgroup;
638e2a95 274 # Note any layered witnesses that appear in this group
275 foreach( @realgroup ) {
276 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
277 push( @layerwits, $1 );
278 }
279 }
a44aaf2a 280 }
638e2a95 281 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 282 }
a44aaf2a 283 $answer->{'conflict_count'} = $conflict_count;
7e17346f 284 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 285
88a6bac5 286 return $answer;
d71100ed 287}
288
e23225e7 289=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 290
291Groups the variants at the given $rank of the collation, treating any
e23225e7 292relationships in the set $merge_relationship_types as equivalent.
293$lacunose should be a reference to an array, to which the sigla of lacunose
294witnesses at this rank will be appended; $transposed should be a reference
295to a hash, wherein the identities of transposed readings and their
296relatives will be stored.
7f52eac8 297
ffa22d6f 298Returns a hash $group_readings where $rdg is attested by the witnesses listed
299in $group_readings->{$rdg}.
7f52eac8 300
301=cut
302
303# Return group_readings, groups, lacunose
d1348d38 304sub group_variants {
94654e27 305 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 306 my $c = $tradition->collation;
335a62ef 307 my $aclabel = $c->ac_label;
d120c995 308 my $table = $c->alignment_table;
7f52eac8 309 # Get the alignment table readings
310 my %readings_at_rank;
7e17346f 311 my $check_for_gaps = Set::Scalar->new();
94654e27 312 my %moved_wits;
d120c995 313 my $has_transposition;
314 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 315 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 316 my $wit = $tablewit->{'witness'};
ffa22d6f 317 # Exclude the witness if it is "lacunose" which if we got here
318 # means "not in the stemma".
7e17346f 319 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 320 # Note if the witness is actually in a lacuna
7f52eac8 321 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 322 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 323 # Otherwise the witness either has a positive reading...
7f52eac8 324 } elsif( $rdg ) {
94654e27 325 # If the reading has been counted elsewhere as a transposition, ignore it.
326 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 327 # TODO Does this cope with three-way transpositions?
94654e27 328 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
329 next;
330 }
331 # Otherwise, record it...
332 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
333 # ...and grab any transpositions, and their relations.
334 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
335 foreach my $trdg ( @transp ) {
d120c995 336 next if exists $readings_at_rank{$trdg->id};
337 $has_transposition = 1;
338 my @affected_wits = _table_witnesses(
7e17346f 339 $table, $trdg, $lacunose, $aclabel );
d120c995 340 next unless @affected_wits;
341 map { $moved_wits{$_} = 1 } @affected_wits;
342 $transposed->{$trdg->id} =
7e17346f 343 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 344 $readings_at_rank{$trdg->id} = $trdg;
345 }
346 # ...or it is empty, ergo a gap.
7f52eac8 347 } else {
7e17346f 348 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 349 }
350 }
7e17346f 351 my $gap_wits = Set::Scalar->new();
352 map { _add_to_witlist( $_, $gap_wits, $aclabel )
353 unless $moved_wits{$_} } $check_for_gaps->members;
354
355 # Group the readings, collapsing groups by relationship if needed.
d120c995 356 my $grouped_readings = {};
4ce27d42 357 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 358 # Skip readings that have been collapsed into others.
d120c995 359 next if exists $grouped_readings->{$rdg->id}
360 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 361 # Get the witness list, including from readings collapsed into this one.
7e17346f 362 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
e23225e7 363 if( $collapse && $collapse->size ) {
364 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 365 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 366 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 367 push( @wits, @otherwits );
d120c995 368 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 369 }
370 }
7e17346f 371 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 372 }
7e17346f 373 if( $gap_wits->members ) {
374 $grouped_readings->{'(omitted)'} = $gap_wits;
375 }
376
7f52eac8 377 # Get rid of our collapsed readings
7e17346f 378 map { delete $grouped_readings->{$_} if(
379 $grouped_readings->{$_} eq 'COLLAPSE'
380 || $grouped_readings->{$_}->is_empty ) }
381 keys %$grouped_readings;
d120c995 382
383 # If something was transposed, check the groups for doubled-up readings
384 if( $has_transposition ) {
62a39b8f 385 # print STDERR "Group for rank $rank:\n";
386 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
387 # keys %$grouped_readings;
d120c995 388 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
389 }
7f52eac8 390
4ce27d42 391 # Return the result
d120c995 392 return $grouped_readings;
393}
394
395# Helper function to query the alignment table for all witnesses (a.c. included)
396# that have a given reading at its rank.
397sub _table_witnesses {
398 my( $table, $trdg, $lacunose, $aclabel ) = @_;
399 my $tableidx = $trdg->rank - 1;
7e17346f 400 my $has_reading = Set::Scalar->new();
d120c995 401 foreach my $row ( @{$table->{'alignment'}} ) {
402 my $wit = $row->{'witness'};
7e17346f 403 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 404 my $rdg = $row->{'tokens'}->[$tableidx];
405 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 406 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 407 if $rdg->{'t'}->id eq $trdg->id;
408 }
7e17346f 409 return $has_reading->members;
410}
411
412# Helper function to see if a witness is lacunose even if we are asking about
413# the a.c. version
414sub _is_lacunose {
415 my ( $wit, $lac, $acstr ) = @_;
416 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
417 $wit = $1;
418 }
419 return $lac->has( $wit );
d1348d38 420}
421
335a62ef 422# Helper function to ensure that X and X a.c. never appear in the same list.
423sub _add_to_witlist {
424 my( $wit, $list, $acstr ) = @_;
335a62ef 425 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 426 # Don't add X a.c. if we already have X
427 return if $list->has( $1 );
335a62ef 428 } else {
7e17346f 429 # Delete X a.c. if we are about to add X
430 $list->delete( $wit.$acstr );
335a62ef 431 }
7e17346f 432 $list->insert( $wit );
335a62ef 433}
434
d120c995 435sub _check_transposed_consistency {
436 my( $c, $rank, $transposed, $groupings ) = @_;
437 my %seen_wits;
438 my %thisrank;
439 # Note which readings are actually at this rank, and which witnesses
440 # belong to which reading.
441 foreach my $rdg ( keys %$groupings ) {
442 my $rdgobj = $c->reading( $rdg );
443 # Count '(omitted)' as a reading at this rank
444 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
445 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
446 }
447 # Our work is done if we have no witness belonging to more than one
448 # reading.
449 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
450 return unless @doubled;
451 # If we have a symmetric related transposition, drop the non-rank readings.
452 if( @doubled == scalar keys %seen_wits ) {
453 foreach my $rdg ( keys %$groupings ) {
454 if( !$thisrank{$rdg} ) {
455 my $groupstr = wit_stringify( $groupings->{$rdg} );
456 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
457 keys %thisrank;
458 delete $groupings->{$rdg};
459 # If we found a group match, assume there is a symmetry happening.
460 # TODO think more about this
62a39b8f 461 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 462 unless( $matched ) {
463 delete $transposed->{$rdg};
464 warn "Found problem in evident symmetry with reading $rdg";
465 }
466 }
467 }
468 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
469 } else {
470 foreach my $dup ( @doubled ) {
471 foreach my $rdg ( @{$seen_wits{$dup}} ) {
472 next if $thisrank{$rdg};
473 next unless exists $groupings->{$rdg};
62a39b8f 474 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 475 delete $groupings->{$rdg};
476 delete $transposed->{$rdg};
477 }
478 }
479 # and put any now-orphaned readings into an 'omitted' reading.
480 foreach my $wit ( keys %seen_wits ) {
481 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 482 $groupings->{'(omitted)'} = Set::Scalar->new()
483 unless exists $groupings->{'(omitted)'};
d120c995 484 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
485 }
486 }
487 }
488}
489
7e17346f 490# For the given grouping, return its situation graph based on the stemma.
491sub _graph_for_grouping {
492 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
493 my $acwits = [];
494 my $extant = {};
495 foreach my $gs ( values %$grouping ) {
496 map {
497 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
498 push( @$acwits, $1 ) unless $lacunose->has( $1 );
499 } else {
500 $extant->{$_} = 1 unless $lacunose->has( $_ );
501 }
502 } $gs->members;
503 }
504 my $graph;
505 try {
506 # contig contains all extant wits and all hypothetical wits
507 # needed to make up the groups.
ace5fce5 508 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
7e17346f 509 } catch ( Text::Tradition::Error $e ) {
03c0a7d7 510 throw( "Could not extend graph with given extant and a.c. witnesses: "
511 . $e->message );
7e17346f 512 } catch {
03c0a7d7 513 throw( "Could not extend graph with a.c. witnesses @$acwits" );
7e17346f 514 }
515 return $graph;
516}
88a6bac5 517
7e17346f 518=head2 solve_variants( $calcdir, @groups )
519
520Looks up the set of groups in the answers provided by the external graph solver
521service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 522
88a6bac5 523The answer has the form
524 { "variants" => [ array of variant location structures ],
525 "variant_count" => total,
526 "conflict_count" => number of conflicts detected,
527 "genealogical_count" => number of solutions found }
528
529=cut
530
531sub solve_variants {
03c0a7d7 532 my( @groups ) = @_;
533
a745c3d9 534 # Are we using a local result directory, or did we pass an empty value
535 # for the directory?
03c0a7d7 536 my $dir;
a745c3d9 537 unless( ref( $groups[0] ) eq 'HASH' ) {
03c0a7d7 538 $dir = shift @groups;
539 }
335a62ef 540
03c0a7d7 541 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
542 ## object so that we can send it off for IDP lookup.
7e17346f 543 my $variants = [];
544 my $genealogical = 0; # counter
03c0a7d7 545 # TODO Optimize for unique graph problems
554e2e7d 546 my %problems;
7e17346f 547 foreach my $graphproblem ( @groups ) {
03c0a7d7 548 # Construct the calc result key and look up its answer
549 my $problem = Text::Tradition::Analysis::Result->new(
550 graph => $graphproblem->{'graph'},
551 setlist => [ values %{$graphproblem->{'grouping'}} ] );
554e2e7d 552 if( exists $problems{$problem->object_key} ) {
553 $problem = $problems{$problem->object_key};
554 } else {
555 $problems{$problem->object_key} = $problem;
556 }
557 $graphproblem->{'object'} = $problem;
03c0a7d7 558 }
559
554e2e7d 560 my %results;
03c0a7d7 561 if( $dir ) {
562 my $scope = $dir->new_scope;
554e2e7d 563 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
564 } else {
565 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
566 [ values %problems ] );
03c0a7d7 567 # Send it off and get the result
568 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
569 my $ua = LWP::UserAgent->new();
570 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
571 'Content' => $json );
a745c3d9 572 my $answer;
03c0a7d7 573 if( $resp->is_success ) {
a745c3d9 574 $answer = decode_json( $resp->content );
03c0a7d7 575 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
576 } else {
577 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
578 . "; cannot run graph analysis" );
579 }
580 # One more sanity check
581 throw( "Something went wrong with answer symmetricity" )
554e2e7d 582 unless keys( %problems ) == @$answer;
03c0a7d7 583 # Convert the results
554e2e7d 584 foreach my $a ( @$answer ) {
585 my $r = Text::Tradition::Analysis::Result->new( $a );
586 $results{$r->object_key} = $r;
587 }
03c0a7d7 588 }
589
590 # We now have a single JSON-encoded Result object per problem sent. Fold its
591 # answers into our variant info structure.
554e2e7d 592 foreach my $graphproblem ( @groups ) {
593 my $result = $results{$graphproblem->{'object'}->object_key}
594 || $graphproblem->{'object'};
03c0a7d7 595
7e17346f 596 # Initialize the result structure for this graph problem
b7bd7aa5 597 my $vstruct;
598 if( $result->status eq 'OK' ) {
599 $vstruct = { readings => [] };
600 push( @$variants, $vstruct );
601 } else {
602 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
7e17346f 603 next;
5c44c598 604 }
b7bd7aa5 605
7e17346f 606 # 1. Did the group evaluate as genealogical?
03c0a7d7 607 $vstruct->{genealogical} = $result->is_genealogical;
608 $genealogical++ if $result->is_genealogical;
7e17346f 609
610 # 2. What are the calculated minimum groupings for each variant loc?
611 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
612 my $inputset = $graphproblem->{grouping}->{$rid};
03c0a7d7 613 my $minset = $result->minimum_grouping_for( $inputset );
7e17346f 614 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 615 }
7e17346f 616
617 # 3. What are the sources and classes calculated for each witness?
03c0a7d7 618 $vstruct->{witcopy_types} = { $result->classes };
7e17346f 619 $vstruct->{reading_roots} = {};
03c0a7d7 620 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
7e17346f 621
88a6bac5 622 }
623
624 return { 'variants' => $variants,
625 'variant_count' => scalar @$variants,
626 'genealogical_count' => $genealogical };
627}
628
b7bd7aa5 629sub _init_unsolved {
630 my( $graphproblem, $status ) = @_;
631 my $vstruct = { 'readings' => [] };
632 $vstruct->{'unsolved'} = $status;
633 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
634 push( @{$vstruct->{readings}}, { readingid => $rid,
635 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
636 }
637 return $vstruct;
638}
639
fae07016 640=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 641
fae07016 642Given the tradition, its stemma graph, and the solution from the graph solver,
643work out the rest of the information we want. For each reading we need missing,
5c44c598 644conflict, reading_parents, independent_occurrence, followed, not_followed,
645and follow_unknown. Alters the location_hash in place.
7f52eac8 646
647=cut
732152b1 648
fae07016 649sub analyze_location {
7e17346f 650 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 651 my $c = $tradition->collation;
fae07016 652
03c0a7d7 653 if( exists $variant_row->{'unsolved'} ) {
654 return;
655 }
656 my $reading_roots = delete $variant_row->{'reading_roots'};
657 my $classinfo = delete $variant_row->{'witcopy_types'};
658
fae07016 659 # Make a hash of all known node memberships, and make the subgraphs.
660 my $contig = {};
fae07016 661 my $subgraph = {};
638e2a95 662 my $acstr = $c->ac_label;
663 my @acwits;
5c44c598 664
5c44c598 665 # Note which witnesses positively belong to which group. This information
666 # comes ultimately from the IDP solver.
667 # Also make a note of the reading's roots.
fae07016 668 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
669 my $rid = $rdghash->{'readingid'};
5c44c598 670 my @roots;
638e2a95 671 foreach my $wit ( @{$rdghash->{'group'}} ) {
672 $contig->{$wit} = $rid;
673 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
674 push( @acwits, $1 );
675 }
5c44c598 676 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
677 push( @roots, $wit );
678 }
638e2a95 679 }
5c44c598 680 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 681 }
7e17346f 682
fae07016 683 # Now that we have all the node group memberships, calculate followed/
bebec0e9 684 # non-followed/unknown values for each reading. Also figure out the
685 # reading's evident parent(s).
686 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 687 my $rid = $rdghash->{'readingid'};
428bcf0b 688 my $rdg = $c->reading( $rid );
5c44c598 689 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 690 my @reversions;
691 if( $classinfo ) {
692 @reversions = grep { $classinfo->{$_} eq 'revert' }
693 $rdghash->{'group'}->members;
be590045 694 $rdghash->{'reversions'} = \@reversions;
7e17346f 695 }
5c44c598 696 my @group = @{$rdghash->{'group'}};
fae07016 697
698 # Start figuring things out.
7e17346f 699 $rdghash->{'followed'} = scalar( @group )
700 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 701 # Find the parent readings, if any, of this reading.
7e17346f 702 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 703 # Work out relationships between readings and their non-followed parent.
704 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
be590045 705 $rdghash->{'source_parents'} = $sourceparents;
e23225e7 706
707 if( @reversions ) {
708 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
709 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
710 $rdghash->{'reversion_parents'} = $revparents;
711 }
bebec0e9 712
713 # Find the number of times this reading was altered, and the number of
714 # times we're not sure.
715 my( %nofollow, %unknownfollow );
5c44c598 716 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 717 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 718 if( $reading_roots->{$wchild} && $contig->{$wchild}
719 && $contig->{$wchild} ne $rid ) {
bebec0e9 720 # It definitely changed here.
721 $nofollow{$wchild} = 1;
722 } elsif( !($contig->{$wchild}) ) {
723 # The child is a hypothetical node not definitely in
724 # any group. Answer is unknown.
725 $unknownfollow{$wchild} = 1;
5c44c598 726 } # else it is either in our group, or it is a non-root node in a
727 # known group and therefore is presumed to have its reading from
728 # its group, not this link.
bebec0e9 729 }
730 }
731 $rdghash->{'not_followed'} = keys %nofollow;
732 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 733
734 # Now say whether this reading represents a conflict.
735 unless( $variant_row->{'genealogical'} ) {
7e17346f 736 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 737 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 738 }
739 }
740}
741
742sub _find_reading_parents {
743 my( $rid, $graph, $contig, @list ) = @_;
744 my $parenthash = {};
745 foreach my $wit ( @list ) {
746 # Look in the stemma graph to find this witness's extant or known-reading
747 # immediate ancestor(s), and look up the reading that each ancestor holds.
748 my @check = $graph->predecessors( $wit );
749 while( @check ) {
750 my @next;
751 foreach my $wparent( @check ) {
752 my $preading = $contig->{$wparent};
753 if( $preading && $preading ne $rid ) {
754 $parenthash->{$preading} = 1;
755 } else {
756 push( @next, $graph->predecessors( $wparent ) );
757 }
758 }
759 @check = @next;
760 }
761 }
762 return $parenthash;
763}
764
765sub _resolve_parent_relationships {
766 my( $c, $rid, $rdg, $rdgparents ) = @_;
767 foreach my $p ( keys %$rdgparents ) {
768 # Resolve the relationship of the parent to the reading, and
769 # save it in our hash.
770 my $pobj = $c->reading( $p );
771 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
772 my $phash = { 'label' => $prep };
773 if( $pobj ) {
774 my $rel = $c->get_relationship( $p, $rid );
775 if( $rel ) {
776 _add_to_hash( $rel, $phash );
777 } elsif( $rdg ) {
778 # First check for a transposed relationship
779 if( $rdg->rank != $pobj->rank ) {
780 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
781 next unless $ti->text eq $rdg->text;
782 $rel = $c->get_relationship( $ti, $pobj );
783 if( $rel ) {
784 _add_to_hash( $rel, $phash, 1 );
785 last;
786 }
787 }
788 unless( $rel ) {
789 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
790 next unless $ti->text eq $pobj->text;
791 $rel = $c->get_relationship( $ti, $rdg );
792 if( $rel ) {
793 _add_to_hash( $rel, $phash, 1 );
794 last;
795 }
796 }
5c44c598 797 }
798 }
7e17346f 799 unless( $rel ) {
800 # and then check for sheer word similarity.
801 my $rtext = $rdg->text;
802 my $ptext = $pobj->text;
803 if( similar( $rtext, $ptext ) ) {
804 # say STDERR "Words $rtext and $ptext judged similar";
805 $phash->{relation} = { type => 'wordsimilar' };
806 }
807 }
5c44c598 808 } else {
7e17346f 809 $phash->{relation} = { type => 'deletion' };
5c44c598 810 }
7e17346f 811 # Get the attributes of the parent object while we are here
812 $phash->{'text'} = $pobj->text if $pobj;
813 $phash->{'is_nonsense'} = $pobj->is_nonsense;
814 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
815 } elsif( $p eq '(omitted)' ) {
816 $phash->{relation} = { type => 'addition' };
817 }
818 # Save it
819 $rdgparents->{$p} = $phash;
820 }
d71100ed 821}
822
428bcf0b 823sub _add_to_hash {
824 my( $rel, $phash, $is_transposed ) = @_;
825 $phash->{relation} = { type => $rel->type };
826 $phash->{relation}->{transposed} = 1 if $is_transposed;
827 $phash->{relation}->{annotation} = $rel->annotation
828 if $rel->has_annotation;
829}
830
831=head2 similar( $word1, $word2 )
832
833Use Algorithm::Diff to get a sense of how close the words are to each other.
834This will hopefully handle substitutions a bit more nicely than Levenshtein.
835
836=cut
837
838#!/usr/bin/env perl
839
840sub similar {
841 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
842 my @let1 = split( '', lc( $word1 ) );
843 my @let2 = split( '', lc( $word2 ) );
844 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
845 my $mag = 0;
846 while( $diff->Next ) {
847 if( $diff->Same ) {
848 # Take off points for longer strings
849 my $cs = $diff->Range(1) - 2;
850 $cs = 0 if $cs < 0;
851 $mag -= $cs;
852 } elsif( !$diff->Items(1) ) {
853 $mag += $diff->Range(2);
854 } elsif( !$diff->Items(2) ) {
855 $mag += $diff->Range(1);
856 } else {
857 # Split the difference for substitutions
858 my $c1 = $diff->Range(1) || 1;
859 my $c2 = $diff->Range(2) || 1;
860 my $cd = ( $c1 + $c2 ) / 2;
861 $mag += $cd;
862 }
863 }
864 return ( $mag <= length( $word1 ) / 2 );
865}
866
5be0cdeb 867sub _useful_variant {
7e17346f 868 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 869
870 # Sort by group size and return
871 my $is_useful = 0;
7e17346f 872 foreach my $rdg ( keys %$rankgroup ) {
873 my @wits = $rankgroup->{$rdg}->members;
874 if( @wits > 1 ) {
5be0cdeb 875 $is_useful++;
876 } else {
7e17346f 877 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
878 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 879 }
880 }
7e17346f 881 return $is_useful > 1;
5be0cdeb 882}
883
7f52eac8 884=head2 wit_stringify( $groups )
885
886Takes an array of witness groupings and produces a string like
887['A','B'] / ['C','D','E'] / ['F']
d71100ed 888
7f52eac8 889=cut
d71100ed 890
891sub wit_stringify {
892 my $groups = shift;
893 my @gst;
894 # If we were passed an array of witnesses instead of an array of
895 # groupings, then "group" the witnesses first.
896 unless( ref( $groups->[0] ) ) {
897 my $mkgrp = [ $groups ];
898 $groups = $mkgrp;
899 }
900 foreach my $g ( @$groups ) {
901 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
902 }
903 return join( ' / ', @gst );
904}
7f52eac8 905
7f52eac8 9061;
907
a745c3d9 908sub throw {
909 Text::Tradition::Error->throw(
910 'ident' => 'Analysis error',
911 'message' => $_[0],
912 );
913}
914
7f52eac8 915=head1 LICENSE
916
917This package is free software and is provided "as is" without express
918or implied warranty. You can redistribute it and/or modify it under
919the same terms as Perl itself.
920
921=head1 AUTHOR
922
923Tara L Andrews E<lt>aurum@cpan.orgE<gt>