split out morphology; make all tests pass apart from morphology POD
[scpubgit/stemmatology.git] / analysis / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
03c0a7d7 6use Encode qw/ decode_utf8 encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
a745c3d9 9use JSON qw/ to_json decode_json /;
03c0a7d7 10use LWP::UserAgent;
7e17346f 11use Set::Scalar;
7e17346f 12use Text::Tradition::Analysis::Result;
13use Text::Tradition::Directory;
d71100ed 14use Text::Tradition::Stemma;
d120c995 15use TryCatch;
d71100ed 16
951ddfe8 17use vars qw/ @EXPORT_OK $VERSION /;
a2cf85dd 18@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
951ddfe8 19$VERSION = "1.0";
20
d1348d38 21
03c0a7d7 22my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
7e17346f 23my $unsolved_problems = {};
5c44c598 24
7f52eac8 25=head1 NAME
26
27Text::Tradition::Analysis - functions for stemma analysis of a tradition
28
29=head1 SYNOPSIS
30
31 use Text::Tradition;
32 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
33 my $t = Text::Tradition->new(
34 'name' => 'this is a text',
35 'input' => 'TEI',
36 'file' => '/path/to/tei_parallel_seg_file.xml' );
951ddfe8 37 $t->enable_stemmata;
7f52eac8 38 $t->add_stemma( 'dotfile' => $stemmafile );
39
40 my $variant_data = run_analysis( $tradition );
41 # Recalculate rank $n treating all orthographic variants as equivalent
42 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
43
44=head1 DESCRIPTION
45
46Text::Tradition is a library for representation and analysis of collated
951ddfe8 47texts, particularly medieval ones. Where the Collation is the central feature of
48a Tradition, it may also have one or more temmata associated with it, and these stemmata may be analyzed. This package provides the following modules:
49
50=over 4
51
52=item * L<Text::Tradition::HasStemma> - a role that can be composed into Text::Tradition objects, providing the ability for Text::Tradition::Stemma objects to be associated with them.
53
54=item * L<Text::Tradition::Stemma> - an object class that represents stemma hypotheses, both rooted (with a single archetype) and unrooted (e.g. phylogenetic trees).
55
56=item * Text::Tradition::Analysis (this package). Provides functions for the analysis of a given stemma against the collation within a given Tradition.
57
58=back
7f52eac8 59
60=head1 SUBROUTINES
61
88a6bac5 62=head2 run_analysis( $tradition, %opts )
7f52eac8 63
88a6bac5 64Runs the analysis described in analyze_variant_location on every location in the
65collation of the given tradition, with the given options. These include:
7f52eac8 66
88a6bac5 67=over 4
68
69=item * stemma_id - Specify which of the tradition's stemmata to use. Default
70is 0 (i.e. the first).
71
72=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
73
74=item * merge_types - Specify a list of relationship types, where related readings
75should be treated as identical for the purposes of analysis.
76
ffa22d6f 77=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
78
88a6bac5 79=back
7f52eac8 80
81=begin testing
82
83use Text::Tradition;
84use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
85
86my $datafile = 't/data/florilegium_tei_ps.xml';
87my $tradition = Text::Tradition->new( 'input' => 'TEI',
88 'name' => 'test0',
89 'file' => $datafile );
951ddfe8 90$tradition->enable_stemmata;
7f52eac8 91my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
92is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
93
f00cefe8 94my %expected_genealogical = (
a44aaf2a 95 1 => 0,
f00cefe8 96 2 => 1,
a44aaf2a 97 3 => 0,
98 5 => 0,
99 7 => 0,
100 8 => 0,
101 10 => 0,
f00cefe8 102 13 => 1,
a44aaf2a 103 33 => 0,
104 34 => 0,
105 37 => 0,
106 60 => 0,
f00cefe8 107 81 => 1,
a44aaf2a 108 84 => 0,
109 87 => 0,
110 101 => 0,
111 102 => 0,
f00cefe8 112 122 => 1,
a44aaf2a 113 157 => 0,
f00cefe8 114 166 => 1,
115 169 => 1,
a44aaf2a 116 200 => 0,
f00cefe8 117 216 => 1,
118 217 => 1,
119 219 => 1,
120 241 => 1,
121 242 => 1,
122 243 => 1,
123);
124
7e17346f 125my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 126my $c = $tradition->collation;
f00cefe8 127foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 128 # Account for rows that used to be "not useful"
129 unless( exists $expected_genealogical{$row->{'id'}} ) {
130 $expected_genealogical{$row->{'id'}} = 1;
131 }
18f48b82 132 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
133 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 134 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 135 # Check that we have the right row with the right groups
136 my $rank = $row->{'id'};
137 foreach my $rdghash ( @{$row->{'readings'}} ) {
138 # Skip 'readings' that aren't really
139 next unless $c->reading( $rdghash->{'readingid'} );
140 # Check the rank
141 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
142 "Got correct reading rank" );
143 # Check the witnesses
144 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
145 my @sgrp = sort @{$rdghash->{'group'}};
146 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
147 }
f00cefe8 148}
a44aaf2a 149is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 150# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 151
152=end testing
153
154=cut
155
d71100ed 156sub run_analysis {
88a6bac5 157 my( $tradition, %opts ) = @_;
f00cefe8 158 my $c = $tradition->collation;
7e17346f 159 my $aclabel = $c->ac_label;
88a6bac5 160
161 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 162 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 163 my $collapse = Set::Scalar->new();
164 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
165 $collapse->insert( @{$opts{'merge_types'}} );
166 } elsif( $opts{'merge_types'} ) {
167 $collapse->insert( $opts{'merge_types'} );
168 }
7e17346f 169
170 # Make sure we have a lookup DB for graph calculation results; this will die
171 # if calcdir or calcdsn isn't passed.
03c0a7d7 172 my $dir;
173 if( exists $opts{'calcdir'} ) {
174 $dir = delete $opts{'calcdir'}
175 } elsif ( exists $opts{'calcdsn'} ) {
176 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
177 }
88a6bac5 178
179 # Get the stemma
180 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 181
4ce27d42 182 # Figure out which witnesses we are working with - that is, the ones that
183 # appear both in the stemma and in the tradition. All others are 'lacunose'
184 # for our purposes.
7e17346f 185 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
186 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
187 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
188 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 189
190 # Find and mark 'common' ranks for exclusion, unless they were
191 # explicitly specified.
192 unless( @ranks ) {
193 my %common_rank;
a44aaf2a 194 foreach my $rdg ( $c->common_readings ) {
88a6bac5 195 $common_rank{$rdg->rank} = 1;
196 }
197 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 198 }
7f52eac8 199
88a6bac5 200 # Group the variants to send to the solver
201 my @groups;
f629cb3b 202 my @use_ranks;
a44aaf2a 203 my %lacunae;
94654e27 204 my $moved = {};
88a6bac5 205 foreach my $rank ( @ranks ) {
7e17346f 206 my $missing = $lacunose->clone();
e23225e7 207 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 208 # Filter out any empty rankgroups
209 # (e.g. from the later rank for a transposition)
210 next unless keys %$rankgroup;
7e17346f 211 # Get the graph for this rankgroup
212 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 213 if( $opts{'exclude_type1'} ) {
214 # Check to see whether this is a "useful" group.
7e17346f 215 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 216 }
f629cb3b 217 push( @use_ranks, $rank );
7e17346f 218 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 219 $lacunae{$rank} = $missing;
d71100ed 220 }
4ce27d42 221 # Run the solver
b7bd7aa5 222 my $answer;
223 try {
224 $answer = solve_variants( $dir, @groups );
225 } catch ( Text::Tradition::Error $e ) {
226 if( $e->message =~ /IDP/ ) {
227 # Something is wrong with the solver; make the variants table anyway
228 $answer->{'variants'} = [];
229 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
230 @groups;
231 } else {
232 # Something else is wrong; error out.
233 $e->throw;
234 }
235 }
fae07016 236
88a6bac5 237 # Do further analysis on the answer
a44aaf2a 238 my $conflict_count = 0;
7e17346f 239 my $reversion_count = 0;
f629cb3b 240 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 241 my $location = $answer->{'variants'}->[$idx];
242 # Add the rank back in
94654e27 243 my $rank = $use_ranks[$idx];
244 $location->{'id'} = $rank;
7234b01d 245 # Note what our lacunae are
f629cb3b 246 my %lmiss;
7234b01d 247 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 248 $location->{'missing'} = [ keys %lmiss ];
249
88a6bac5 250 # Run the extra analysis we need.
428bcf0b 251 ## TODO We run through all the variants in this call, so
252 ## why not add the reading data there instead of here below?
7e17346f 253 my $graph = $groups[$idx]->{graph};
254 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 255
638e2a95 256 my @layerwits;
7234b01d 257 # Do the final post-analysis tidying up of the data.
a44aaf2a 258 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 259 $conflict_count++ if $rdghash->{'is_conflict'};
260 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 261 # Add the reading text back in, setting display value as needed
a44aaf2a 262 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 263 if( $rdg ) {
264 $rdghash->{'text'} = $rdg->text .
265 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
a445ce40 266 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
267 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
268 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
269 }
94654e27 270 }
f629cb3b 271 # Remove lacunose witnesses from this reading's list now that the
7234b01d 272 # analysis is done
f629cb3b 273 my @realgroup;
7234b01d 274 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 275 $rdghash->{'group'} = \@realgroup;
638e2a95 276 # Note any layered witnesses that appear in this group
277 foreach( @realgroup ) {
278 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
279 push( @layerwits, $1 );
280 }
281 }
a44aaf2a 282 }
638e2a95 283 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 284 }
a44aaf2a 285 $answer->{'conflict_count'} = $conflict_count;
7e17346f 286 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 287
88a6bac5 288 return $answer;
d71100ed 289}
290
e23225e7 291=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 292
293Groups the variants at the given $rank of the collation, treating any
e23225e7 294relationships in the set $merge_relationship_types as equivalent.
295$lacunose should be a reference to an array, to which the sigla of lacunose
296witnesses at this rank will be appended; $transposed should be a reference
297to a hash, wherein the identities of transposed readings and their
298relatives will be stored.
7f52eac8 299
ffa22d6f 300Returns a hash $group_readings where $rdg is attested by the witnesses listed
301in $group_readings->{$rdg}.
7f52eac8 302
303=cut
304
305# Return group_readings, groups, lacunose
d1348d38 306sub group_variants {
94654e27 307 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 308 my $c = $tradition->collation;
335a62ef 309 my $aclabel = $c->ac_label;
d120c995 310 my $table = $c->alignment_table;
7f52eac8 311 # Get the alignment table readings
312 my %readings_at_rank;
7e17346f 313 my $check_for_gaps = Set::Scalar->new();
94654e27 314 my %moved_wits;
d120c995 315 my $has_transposition;
316 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 317 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 318 my $wit = $tablewit->{'witness'};
ffa22d6f 319 # Exclude the witness if it is "lacunose" which if we got here
320 # means "not in the stemma".
7e17346f 321 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 322 # Note if the witness is actually in a lacuna
7f52eac8 323 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 324 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 325 # Otherwise the witness either has a positive reading...
7f52eac8 326 } elsif( $rdg ) {
94654e27 327 # If the reading has been counted elsewhere as a transposition, ignore it.
328 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 329 # TODO Does this cope with three-way transpositions?
94654e27 330 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
331 next;
332 }
333 # Otherwise, record it...
334 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
335 # ...and grab any transpositions, and their relations.
336 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
337 foreach my $trdg ( @transp ) {
d120c995 338 next if exists $readings_at_rank{$trdg->id};
339 $has_transposition = 1;
340 my @affected_wits = _table_witnesses(
7e17346f 341 $table, $trdg, $lacunose, $aclabel );
d120c995 342 next unless @affected_wits;
343 map { $moved_wits{$_} = 1 } @affected_wits;
344 $transposed->{$trdg->id} =
7e17346f 345 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 346 $readings_at_rank{$trdg->id} = $trdg;
347 }
348 # ...or it is empty, ergo a gap.
7f52eac8 349 } else {
7e17346f 350 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 351 }
352 }
7e17346f 353 my $gap_wits = Set::Scalar->new();
354 map { _add_to_witlist( $_, $gap_wits, $aclabel )
355 unless $moved_wits{$_} } $check_for_gaps->members;
356
357 # Group the readings, collapsing groups by relationship if needed.
d120c995 358 my $grouped_readings = {};
4ce27d42 359 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 360 # Skip readings that have been collapsed into others.
d120c995 361 next if exists $grouped_readings->{$rdg->id}
362 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 363 # Get the witness list, including from readings collapsed into this one.
7e17346f 364 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
e23225e7 365 if( $collapse && $collapse->size ) {
366 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 367 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 368 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 369 push( @wits, @otherwits );
d120c995 370 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 371 }
372 }
7e17346f 373 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 374 }
7e17346f 375 if( $gap_wits->members ) {
376 $grouped_readings->{'(omitted)'} = $gap_wits;
377 }
378
7f52eac8 379 # Get rid of our collapsed readings
7e17346f 380 map { delete $grouped_readings->{$_} if(
381 $grouped_readings->{$_} eq 'COLLAPSE'
382 || $grouped_readings->{$_}->is_empty ) }
383 keys %$grouped_readings;
d120c995 384
385 # If something was transposed, check the groups for doubled-up readings
386 if( $has_transposition ) {
62a39b8f 387 # print STDERR "Group for rank $rank:\n";
388 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
389 # keys %$grouped_readings;
d120c995 390 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
391 }
7f52eac8 392
4ce27d42 393 # Return the result
d120c995 394 return $grouped_readings;
395}
396
397# Helper function to query the alignment table for all witnesses (a.c. included)
398# that have a given reading at its rank.
399sub _table_witnesses {
400 my( $table, $trdg, $lacunose, $aclabel ) = @_;
401 my $tableidx = $trdg->rank - 1;
7e17346f 402 my $has_reading = Set::Scalar->new();
d120c995 403 foreach my $row ( @{$table->{'alignment'}} ) {
404 my $wit = $row->{'witness'};
7e17346f 405 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 406 my $rdg = $row->{'tokens'}->[$tableidx];
407 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 408 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 409 if $rdg->{'t'}->id eq $trdg->id;
410 }
7e17346f 411 return $has_reading->members;
412}
413
414# Helper function to see if a witness is lacunose even if we are asking about
415# the a.c. version
416sub _is_lacunose {
417 my ( $wit, $lac, $acstr ) = @_;
418 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
419 $wit = $1;
420 }
421 return $lac->has( $wit );
d1348d38 422}
423
335a62ef 424# Helper function to ensure that X and X a.c. never appear in the same list.
425sub _add_to_witlist {
426 my( $wit, $list, $acstr ) = @_;
335a62ef 427 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 428 # Don't add X a.c. if we already have X
429 return if $list->has( $1 );
335a62ef 430 } else {
7e17346f 431 # Delete X a.c. if we are about to add X
432 $list->delete( $wit.$acstr );
335a62ef 433 }
7e17346f 434 $list->insert( $wit );
335a62ef 435}
436
d120c995 437sub _check_transposed_consistency {
438 my( $c, $rank, $transposed, $groupings ) = @_;
439 my %seen_wits;
440 my %thisrank;
441 # Note which readings are actually at this rank, and which witnesses
442 # belong to which reading.
443 foreach my $rdg ( keys %$groupings ) {
444 my $rdgobj = $c->reading( $rdg );
445 # Count '(omitted)' as a reading at this rank
446 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
447 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
448 }
449 # Our work is done if we have no witness belonging to more than one
450 # reading.
451 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
452 return unless @doubled;
453 # If we have a symmetric related transposition, drop the non-rank readings.
454 if( @doubled == scalar keys %seen_wits ) {
455 foreach my $rdg ( keys %$groupings ) {
456 if( !$thisrank{$rdg} ) {
457 my $groupstr = wit_stringify( $groupings->{$rdg} );
458 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
459 keys %thisrank;
460 delete $groupings->{$rdg};
461 # If we found a group match, assume there is a symmetry happening.
462 # TODO think more about this
62a39b8f 463 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 464 unless( $matched ) {
465 delete $transposed->{$rdg};
466 warn "Found problem in evident symmetry with reading $rdg";
467 }
468 }
469 }
470 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
471 } else {
472 foreach my $dup ( @doubled ) {
473 foreach my $rdg ( @{$seen_wits{$dup}} ) {
474 next if $thisrank{$rdg};
475 next unless exists $groupings->{$rdg};
62a39b8f 476 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 477 delete $groupings->{$rdg};
478 delete $transposed->{$rdg};
479 }
480 }
481 # and put any now-orphaned readings into an 'omitted' reading.
482 foreach my $wit ( keys %seen_wits ) {
483 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 484 $groupings->{'(omitted)'} = Set::Scalar->new()
485 unless exists $groupings->{'(omitted)'};
d120c995 486 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
487 }
488 }
489 }
490}
491
7e17346f 492# For the given grouping, return its situation graph based on the stemma.
493sub _graph_for_grouping {
494 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
495 my $acwits = [];
496 my $extant = {};
497 foreach my $gs ( values %$grouping ) {
498 map {
499 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
500 push( @$acwits, $1 ) unless $lacunose->has( $1 );
501 } else {
502 $extant->{$_} = 1 unless $lacunose->has( $_ );
503 }
504 } $gs->members;
505 }
506 my $graph;
507 try {
508 # contig contains all extant wits and all hypothetical wits
509 # needed to make up the groups.
ace5fce5 510 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
7e17346f 511 } catch ( Text::Tradition::Error $e ) {
03c0a7d7 512 throw( "Could not extend graph with given extant and a.c. witnesses: "
513 . $e->message );
7e17346f 514 } catch {
03c0a7d7 515 throw( "Could not extend graph with a.c. witnesses @$acwits" );
7e17346f 516 }
517 return $graph;
518}
88a6bac5 519
7e17346f 520=head2 solve_variants( $calcdir, @groups )
521
522Looks up the set of groups in the answers provided by the external graph solver
523service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 524
88a6bac5 525The answer has the form
526 { "variants" => [ array of variant location structures ],
527 "variant_count" => total,
528 "conflict_count" => number of conflicts detected,
529 "genealogical_count" => number of solutions found }
530
531=cut
532
533sub solve_variants {
03c0a7d7 534 my( @groups ) = @_;
535
a745c3d9 536 # Are we using a local result directory, or did we pass an empty value
537 # for the directory?
03c0a7d7 538 my $dir;
a745c3d9 539 unless( ref( $groups[0] ) eq 'HASH' ) {
03c0a7d7 540 $dir = shift @groups;
541 }
335a62ef 542
03c0a7d7 543 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
544 ## object so that we can send it off for IDP lookup.
7e17346f 545 my $variants = [];
546 my $genealogical = 0; # counter
03c0a7d7 547 # TODO Optimize for unique graph problems
554e2e7d 548 my %problems;
7e17346f 549 foreach my $graphproblem ( @groups ) {
03c0a7d7 550 # Construct the calc result key and look up its answer
551 my $problem = Text::Tradition::Analysis::Result->new(
552 graph => $graphproblem->{'graph'},
553 setlist => [ values %{$graphproblem->{'grouping'}} ] );
554e2e7d 554 if( exists $problems{$problem->object_key} ) {
555 $problem = $problems{$problem->object_key};
556 } else {
557 $problems{$problem->object_key} = $problem;
558 }
559 $graphproblem->{'object'} = $problem;
03c0a7d7 560 }
561
554e2e7d 562 my %results;
03c0a7d7 563 if( $dir ) {
564 my $scope = $dir->new_scope;
554e2e7d 565 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
566 } else {
567 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
568 [ values %problems ] );
03c0a7d7 569 # Send it off and get the result
570 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
571 my $ua = LWP::UserAgent->new();
572 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
573 'Content' => $json );
a745c3d9 574 my $answer;
03c0a7d7 575 if( $resp->is_success ) {
a745c3d9 576 $answer = decode_json( $resp->content );
03c0a7d7 577 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
578 } else {
579 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
580 . "; cannot run graph analysis" );
581 }
582 # One more sanity check
583 throw( "Something went wrong with answer symmetricity" )
554e2e7d 584 unless keys( %problems ) == @$answer;
03c0a7d7 585 # Convert the results
554e2e7d 586 foreach my $a ( @$answer ) {
587 my $r = Text::Tradition::Analysis::Result->new( $a );
588 $results{$r->object_key} = $r;
589 }
03c0a7d7 590 }
591
592 # We now have a single JSON-encoded Result object per problem sent. Fold its
593 # answers into our variant info structure.
554e2e7d 594 foreach my $graphproblem ( @groups ) {
595 my $result = $results{$graphproblem->{'object'}->object_key}
596 || $graphproblem->{'object'};
03c0a7d7 597
7e17346f 598 # Initialize the result structure for this graph problem
b7bd7aa5 599 my $vstruct;
600 if( $result->status eq 'OK' ) {
601 $vstruct = { readings => [] };
602 push( @$variants, $vstruct );
603 } else {
604 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
7e17346f 605 next;
5c44c598 606 }
b7bd7aa5 607
7e17346f 608 # 1. Did the group evaluate as genealogical?
03c0a7d7 609 $vstruct->{genealogical} = $result->is_genealogical;
610 $genealogical++ if $result->is_genealogical;
7e17346f 611
612 # 2. What are the calculated minimum groupings for each variant loc?
613 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
614 my $inputset = $graphproblem->{grouping}->{$rid};
03c0a7d7 615 my $minset = $result->minimum_grouping_for( $inputset );
7e17346f 616 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 617 }
7e17346f 618
619 # 3. What are the sources and classes calculated for each witness?
03c0a7d7 620 $vstruct->{witcopy_types} = { $result->classes };
7e17346f 621 $vstruct->{reading_roots} = {};
03c0a7d7 622 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
7e17346f 623
88a6bac5 624 }
625
626 return { 'variants' => $variants,
627 'variant_count' => scalar @$variants,
628 'genealogical_count' => $genealogical };
629}
630
b7bd7aa5 631sub _init_unsolved {
632 my( $graphproblem, $status ) = @_;
633 my $vstruct = { 'readings' => [] };
634 $vstruct->{'unsolved'} = $status;
635 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
636 push( @{$vstruct->{readings}}, { readingid => $rid,
637 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
638 }
639 return $vstruct;
640}
641
fae07016 642=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 643
fae07016 644Given the tradition, its stemma graph, and the solution from the graph solver,
645work out the rest of the information we want. For each reading we need missing,
5c44c598 646conflict, reading_parents, independent_occurrence, followed, not_followed,
647and follow_unknown. Alters the location_hash in place.
7f52eac8 648
649=cut
732152b1 650
fae07016 651sub analyze_location {
7e17346f 652 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 653 my $c = $tradition->collation;
fae07016 654
03c0a7d7 655 if( exists $variant_row->{'unsolved'} ) {
656 return;
657 }
658 my $reading_roots = delete $variant_row->{'reading_roots'};
659 my $classinfo = delete $variant_row->{'witcopy_types'};
660
fae07016 661 # Make a hash of all known node memberships, and make the subgraphs.
662 my $contig = {};
fae07016 663 my $subgraph = {};
638e2a95 664 my $acstr = $c->ac_label;
665 my @acwits;
5c44c598 666
5c44c598 667 # Note which witnesses positively belong to which group. This information
668 # comes ultimately from the IDP solver.
669 # Also make a note of the reading's roots.
fae07016 670 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
671 my $rid = $rdghash->{'readingid'};
5c44c598 672 my @roots;
638e2a95 673 foreach my $wit ( @{$rdghash->{'group'}} ) {
674 $contig->{$wit} = $rid;
675 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
676 push( @acwits, $1 );
677 }
5c44c598 678 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
679 push( @roots, $wit );
680 }
638e2a95 681 }
5c44c598 682 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 683 }
7e17346f 684
fae07016 685 # Now that we have all the node group memberships, calculate followed/
bebec0e9 686 # non-followed/unknown values for each reading. Also figure out the
687 # reading's evident parent(s).
688 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 689 my $rid = $rdghash->{'readingid'};
428bcf0b 690 my $rdg = $c->reading( $rid );
5c44c598 691 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 692 my @reversions;
693 if( $classinfo ) {
694 @reversions = grep { $classinfo->{$_} eq 'revert' }
695 $rdghash->{'group'}->members;
be590045 696 $rdghash->{'reversions'} = \@reversions;
7e17346f 697 }
5c44c598 698 my @group = @{$rdghash->{'group'}};
fae07016 699
700 # Start figuring things out.
7e17346f 701 $rdghash->{'followed'} = scalar( @group )
702 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 703 # Find the parent readings, if any, of this reading.
7e17346f 704 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 705 # Work out relationships between readings and their non-followed parent.
706 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
be590045 707 $rdghash->{'source_parents'} = $sourceparents;
e23225e7 708
709 if( @reversions ) {
710 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
711 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
712 $rdghash->{'reversion_parents'} = $revparents;
713 }
bebec0e9 714
715 # Find the number of times this reading was altered, and the number of
716 # times we're not sure.
717 my( %nofollow, %unknownfollow );
5c44c598 718 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 719 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 720 if( $reading_roots->{$wchild} && $contig->{$wchild}
721 && $contig->{$wchild} ne $rid ) {
bebec0e9 722 # It definitely changed here.
723 $nofollow{$wchild} = 1;
724 } elsif( !($contig->{$wchild}) ) {
725 # The child is a hypothetical node not definitely in
726 # any group. Answer is unknown.
727 $unknownfollow{$wchild} = 1;
5c44c598 728 } # else it is either in our group, or it is a non-root node in a
729 # known group and therefore is presumed to have its reading from
730 # its group, not this link.
bebec0e9 731 }
732 }
733 $rdghash->{'not_followed'} = keys %nofollow;
734 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 735
736 # Now say whether this reading represents a conflict.
737 unless( $variant_row->{'genealogical'} ) {
7e17346f 738 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 739 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 740 }
741 }
742}
743
744sub _find_reading_parents {
745 my( $rid, $graph, $contig, @list ) = @_;
746 my $parenthash = {};
747 foreach my $wit ( @list ) {
748 # Look in the stemma graph to find this witness's extant or known-reading
749 # immediate ancestor(s), and look up the reading that each ancestor holds.
750 my @check = $graph->predecessors( $wit );
751 while( @check ) {
752 my @next;
753 foreach my $wparent( @check ) {
754 my $preading = $contig->{$wparent};
755 if( $preading && $preading ne $rid ) {
756 $parenthash->{$preading} = 1;
757 } else {
758 push( @next, $graph->predecessors( $wparent ) );
759 }
760 }
761 @check = @next;
762 }
763 }
764 return $parenthash;
765}
766
767sub _resolve_parent_relationships {
768 my( $c, $rid, $rdg, $rdgparents ) = @_;
769 foreach my $p ( keys %$rdgparents ) {
770 # Resolve the relationship of the parent to the reading, and
771 # save it in our hash.
772 my $pobj = $c->reading( $p );
773 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
774 my $phash = { 'label' => $prep };
775 if( $pobj ) {
776 my $rel = $c->get_relationship( $p, $rid );
777 if( $rel ) {
778 _add_to_hash( $rel, $phash );
779 } elsif( $rdg ) {
780 # First check for a transposed relationship
781 if( $rdg->rank != $pobj->rank ) {
782 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
783 next unless $ti->text eq $rdg->text;
784 $rel = $c->get_relationship( $ti, $pobj );
785 if( $rel ) {
786 _add_to_hash( $rel, $phash, 1 );
787 last;
788 }
789 }
790 unless( $rel ) {
791 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
792 next unless $ti->text eq $pobj->text;
793 $rel = $c->get_relationship( $ti, $rdg );
794 if( $rel ) {
795 _add_to_hash( $rel, $phash, 1 );
796 last;
797 }
798 }
5c44c598 799 }
800 }
7e17346f 801 unless( $rel ) {
802 # and then check for sheer word similarity.
803 my $rtext = $rdg->text;
804 my $ptext = $pobj->text;
805 if( similar( $rtext, $ptext ) ) {
806 # say STDERR "Words $rtext and $ptext judged similar";
807 $phash->{relation} = { type => 'wordsimilar' };
808 }
809 }
5c44c598 810 } else {
7e17346f 811 $phash->{relation} = { type => 'deletion' };
5c44c598 812 }
7e17346f 813 # Get the attributes of the parent object while we are here
814 $phash->{'text'} = $pobj->text if $pobj;
a445ce40 815 if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
816 $phash->{'is_nonsense'} = $pobj->is_nonsense;
817 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
818 }
7e17346f 819 } elsif( $p eq '(omitted)' ) {
820 $phash->{relation} = { type => 'addition' };
821 }
822 # Save it
823 $rdgparents->{$p} = $phash;
824 }
d71100ed 825}
826
428bcf0b 827sub _add_to_hash {
828 my( $rel, $phash, $is_transposed ) = @_;
829 $phash->{relation} = { type => $rel->type };
830 $phash->{relation}->{transposed} = 1 if $is_transposed;
831 $phash->{relation}->{annotation} = $rel->annotation
832 if $rel->has_annotation;
833}
834
835=head2 similar( $word1, $word2 )
836
837Use Algorithm::Diff to get a sense of how close the words are to each other.
838This will hopefully handle substitutions a bit more nicely than Levenshtein.
839
840=cut
841
842#!/usr/bin/env perl
843
844sub similar {
845 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
846 my @let1 = split( '', lc( $word1 ) );
847 my @let2 = split( '', lc( $word2 ) );
848 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
849 my $mag = 0;
850 while( $diff->Next ) {
851 if( $diff->Same ) {
852 # Take off points for longer strings
853 my $cs = $diff->Range(1) - 2;
854 $cs = 0 if $cs < 0;
855 $mag -= $cs;
856 } elsif( !$diff->Items(1) ) {
857 $mag += $diff->Range(2);
858 } elsif( !$diff->Items(2) ) {
859 $mag += $diff->Range(1);
860 } else {
861 # Split the difference for substitutions
862 my $c1 = $diff->Range(1) || 1;
863 my $c2 = $diff->Range(2) || 1;
864 my $cd = ( $c1 + $c2 ) / 2;
865 $mag += $cd;
866 }
867 }
868 return ( $mag <= length( $word1 ) / 2 );
869}
870
5be0cdeb 871sub _useful_variant {
7e17346f 872 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 873
874 # Sort by group size and return
875 my $is_useful = 0;
7e17346f 876 foreach my $rdg ( keys %$rankgroup ) {
877 my @wits = $rankgroup->{$rdg}->members;
878 if( @wits > 1 ) {
5be0cdeb 879 $is_useful++;
880 } else {
7e17346f 881 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
882 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 883 }
884 }
7e17346f 885 return $is_useful > 1;
5be0cdeb 886}
887
7f52eac8 888=head2 wit_stringify( $groups )
889
890Takes an array of witness groupings and produces a string like
891['A','B'] / ['C','D','E'] / ['F']
d71100ed 892
7f52eac8 893=cut
d71100ed 894
895sub wit_stringify {
896 my $groups = shift;
897 my @gst;
898 # If we were passed an array of witnesses instead of an array of
899 # groupings, then "group" the witnesses first.
900 unless( ref( $groups->[0] ) ) {
901 my $mkgrp = [ $groups ];
902 $groups = $mkgrp;
903 }
904 foreach my $g ( @$groups ) {
905 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
906 }
907 return join( ' / ', @gst );
908}
7f52eac8 909
7f52eac8 9101;
911
a745c3d9 912sub throw {
913 Text::Tradition::Error->throw(
914 'ident' => 'Analysis error',
915 'message' => $_[0],
916 );
917}
918
7f52eac8 919=head1 LICENSE
920
921This package is free software and is provided "as is" without express
922or implied warranty. You can redistribute it and/or modify it under
923the same terms as Perl itself.
924
925=head1 AUTHOR
926
927Tara L Andrews E<lt>aurum@cpan.orgE<gt>