load extensions statically to avoid bad object wrapping interactions
[scpubgit/stemmatology.git] / analysis / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
03c0a7d7 6use Encode qw/ decode_utf8 encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
a745c3d9 9use JSON qw/ to_json decode_json /;
03c0a7d7 10use LWP::UserAgent;
7e17346f 11use Set::Scalar;
7e17346f 12use Text::Tradition::Analysis::Result;
13use Text::Tradition::Directory;
d71100ed 14use Text::Tradition::Stemma;
d120c995 15use TryCatch;
d71100ed 16
951ddfe8 17use vars qw/ @EXPORT_OK $VERSION /;
a2cf85dd 18@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
37bf09f4 19$VERSION = "1.1";
951ddfe8 20
d1348d38 21
03c0a7d7 22my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
7e17346f 23my $unsolved_problems = {};
5c44c598 24
7f52eac8 25=head1 NAME
26
27Text::Tradition::Analysis - functions for stemma analysis of a tradition
28
29=head1 SYNOPSIS
30
31 use Text::Tradition;
32 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
33 my $t = Text::Tradition->new(
34 'name' => 'this is a text',
35 'input' => 'TEI',
36 'file' => '/path/to/tei_parallel_seg_file.xml' );
37 $t->add_stemma( 'dotfile' => $stemmafile );
38
39 my $variant_data = run_analysis( $tradition );
40 # Recalculate rank $n treating all orthographic variants as equivalent
41 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
42
43=head1 DESCRIPTION
44
45Text::Tradition is a library for representation and analysis of collated
951ddfe8 46texts, particularly medieval ones. Where the Collation is the central feature of
47a Tradition, it may also have one or more temmata associated with it, and these stemmata may be analyzed. This package provides the following modules:
48
49=over 4
50
51=item * L<Text::Tradition::HasStemma> - a role that can be composed into Text::Tradition objects, providing the ability for Text::Tradition::Stemma objects to be associated with them.
52
53=item * L<Text::Tradition::Stemma> - an object class that represents stemma hypotheses, both rooted (with a single archetype) and unrooted (e.g. phylogenetic trees).
54
55=item * Text::Tradition::Analysis (this package). Provides functions for the analysis of a given stemma against the collation within a given Tradition.
56
57=back
7f52eac8 58
59=head1 SUBROUTINES
60
88a6bac5 61=head2 run_analysis( $tradition, %opts )
7f52eac8 62
88a6bac5 63Runs the analysis described in analyze_variant_location on every location in the
64collation of the given tradition, with the given options. These include:
7f52eac8 65
88a6bac5 66=over 4
67
68=item * stemma_id - Specify which of the tradition's stemmata to use. Default
69is 0 (i.e. the first).
70
71=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
72
73=item * merge_types - Specify a list of relationship types, where related readings
74should be treated as identical for the purposes of analysis.
75
ffa22d6f 76=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
77
88a6bac5 78=back
7f52eac8 79
80=begin testing
81
82use Text::Tradition;
83use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
84
85my $datafile = 't/data/florilegium_tei_ps.xml';
86my $tradition = Text::Tradition->new( 'input' => 'TEI',
87 'name' => 'test0',
88 'file' => $datafile );
89my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
90is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
91
f00cefe8 92my %expected_genealogical = (
a44aaf2a 93 1 => 0,
f00cefe8 94 2 => 1,
a44aaf2a 95 3 => 0,
96 5 => 0,
97 7 => 0,
98 8 => 0,
99 10 => 0,
f00cefe8 100 13 => 1,
a44aaf2a 101 33 => 0,
102 34 => 0,
103 37 => 0,
104 60 => 0,
f00cefe8 105 81 => 1,
a44aaf2a 106 84 => 0,
107 87 => 0,
108 101 => 0,
109 102 => 0,
f00cefe8 110 122 => 1,
a44aaf2a 111 157 => 0,
f00cefe8 112 166 => 1,
113 169 => 1,
a44aaf2a 114 200 => 0,
f00cefe8 115 216 => 1,
116 217 => 1,
117 219 => 1,
118 241 => 1,
119 242 => 1,
120 243 => 1,
121);
122
7e17346f 123my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 124my $c = $tradition->collation;
f00cefe8 125foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 126 # Account for rows that used to be "not useful"
127 unless( exists $expected_genealogical{$row->{'id'}} ) {
128 $expected_genealogical{$row->{'id'}} = 1;
129 }
18f48b82 130 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
131 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 132 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 133 # Check that we have the right row with the right groups
134 my $rank = $row->{'id'};
135 foreach my $rdghash ( @{$row->{'readings'}} ) {
136 # Skip 'readings' that aren't really
137 next unless $c->reading( $rdghash->{'readingid'} );
138 # Check the rank
139 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
140 "Got correct reading rank" );
141 # Check the witnesses
142 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
143 my @sgrp = sort @{$rdghash->{'group'}};
144 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
145 }
f00cefe8 146}
a44aaf2a 147is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 148# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 149
150=end testing
151
152=cut
153
d71100ed 154sub run_analysis {
88a6bac5 155 my( $tradition, %opts ) = @_;
f00cefe8 156 my $c = $tradition->collation;
7e17346f 157 my $aclabel = $c->ac_label;
88a6bac5 158
159 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 160 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 161 my $collapse = Set::Scalar->new();
162 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
163 $collapse->insert( @{$opts{'merge_types'}} );
164 } elsif( $opts{'merge_types'} ) {
165 $collapse->insert( $opts{'merge_types'} );
166 }
7e17346f 167
168 # Make sure we have a lookup DB for graph calculation results; this will die
169 # if calcdir or calcdsn isn't passed.
03c0a7d7 170 my $dir;
171 if( exists $opts{'calcdir'} ) {
172 $dir = delete $opts{'calcdir'}
173 } elsif ( exists $opts{'calcdsn'} ) {
174 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
175 }
88a6bac5 176
177 # Get the stemma
178 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 179
4ce27d42 180 # Figure out which witnesses we are working with - that is, the ones that
181 # appear both in the stemma and in the tradition. All others are 'lacunose'
182 # for our purposes.
7e17346f 183 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
184 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
185 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
186 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 187
188 # Find and mark 'common' ranks for exclusion, unless they were
189 # explicitly specified.
190 unless( @ranks ) {
191 my %common_rank;
a44aaf2a 192 foreach my $rdg ( $c->common_readings ) {
88a6bac5 193 $common_rank{$rdg->rank} = 1;
194 }
195 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 196 }
7f52eac8 197
88a6bac5 198 # Group the variants to send to the solver
199 my @groups;
f629cb3b 200 my @use_ranks;
a44aaf2a 201 my %lacunae;
94654e27 202 my $moved = {};
88a6bac5 203 foreach my $rank ( @ranks ) {
7e17346f 204 my $missing = $lacunose->clone();
e23225e7 205 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 206 # Filter out any empty rankgroups
207 # (e.g. from the later rank for a transposition)
208 next unless keys %$rankgroup;
7e17346f 209 # Get the graph for this rankgroup
210 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 211 if( $opts{'exclude_type1'} ) {
212 # Check to see whether this is a "useful" group.
7e17346f 213 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 214 }
f629cb3b 215 push( @use_ranks, $rank );
7e17346f 216 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 217 $lacunae{$rank} = $missing;
d71100ed 218 }
4ce27d42 219 # Run the solver
b7bd7aa5 220 my $answer;
221 try {
222 $answer = solve_variants( $dir, @groups );
223 } catch ( Text::Tradition::Error $e ) {
224 if( $e->message =~ /IDP/ ) {
225 # Something is wrong with the solver; make the variants table anyway
226 $answer->{'variants'} = [];
227 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
228 @groups;
229 } else {
230 # Something else is wrong; error out.
231 $e->throw;
232 }
233 }
fae07016 234
88a6bac5 235 # Do further analysis on the answer
a44aaf2a 236 my $conflict_count = 0;
7e17346f 237 my $reversion_count = 0;
f629cb3b 238 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 239 my $location = $answer->{'variants'}->[$idx];
240 # Add the rank back in
94654e27 241 my $rank = $use_ranks[$idx];
242 $location->{'id'} = $rank;
7234b01d 243 # Note what our lacunae are
f629cb3b 244 my %lmiss;
7234b01d 245 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 246 $location->{'missing'} = [ keys %lmiss ];
247
88a6bac5 248 # Run the extra analysis we need.
428bcf0b 249 ## TODO We run through all the variants in this call, so
250 ## why not add the reading data there instead of here below?
7e17346f 251 my $graph = $groups[$idx]->{graph};
252 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 253
638e2a95 254 my @layerwits;
7234b01d 255 # Do the final post-analysis tidying up of the data.
a44aaf2a 256 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 257 $conflict_count++ if $rdghash->{'is_conflict'};
258 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 259 # Add the reading text back in, setting display value as needed
a44aaf2a 260 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 261 if( $rdg ) {
262 $rdghash->{'text'} = $rdg->text .
263 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
a445ce40 264 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
265 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
266 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
267 }
94654e27 268 }
f629cb3b 269 # Remove lacunose witnesses from this reading's list now that the
7234b01d 270 # analysis is done
f629cb3b 271 my @realgroup;
7234b01d 272 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 273 $rdghash->{'group'} = \@realgroup;
638e2a95 274 # Note any layered witnesses that appear in this group
275 foreach( @realgroup ) {
276 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
277 push( @layerwits, $1 );
278 }
279 }
a44aaf2a 280 }
638e2a95 281 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 282 }
a44aaf2a 283 $answer->{'conflict_count'} = $conflict_count;
7e17346f 284 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 285
88a6bac5 286 return $answer;
d71100ed 287}
288
e23225e7 289=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 290
291Groups the variants at the given $rank of the collation, treating any
e23225e7 292relationships in the set $merge_relationship_types as equivalent.
293$lacunose should be a reference to an array, to which the sigla of lacunose
294witnesses at this rank will be appended; $transposed should be a reference
295to a hash, wherein the identities of transposed readings and their
296relatives will be stored.
7f52eac8 297
ffa22d6f 298Returns a hash $group_readings where $rdg is attested by the witnesses listed
299in $group_readings->{$rdg}.
7f52eac8 300
301=cut
302
303# Return group_readings, groups, lacunose
d1348d38 304sub group_variants {
94654e27 305 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 306 my $c = $tradition->collation;
335a62ef 307 my $aclabel = $c->ac_label;
d120c995 308 my $table = $c->alignment_table;
7f52eac8 309 # Get the alignment table readings
310 my %readings_at_rank;
7e17346f 311 my $check_for_gaps = Set::Scalar->new();
94654e27 312 my %moved_wits;
d120c995 313 my $has_transposition;
314 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 315 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 316 my $wit = $tablewit->{'witness'};
ffa22d6f 317 # Exclude the witness if it is "lacunose" which if we got here
318 # means "not in the stemma".
7e17346f 319 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 320 # Note if the witness is actually in a lacuna
7f52eac8 321 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 322 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 323 # Otherwise the witness either has a positive reading...
7f52eac8 324 } elsif( $rdg ) {
94654e27 325 # If the reading has been counted elsewhere as a transposition, ignore it.
326 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 327 # TODO Does this cope with three-way transpositions?
94654e27 328 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
329 next;
330 }
331 # Otherwise, record it...
332 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
333 # ...and grab any transpositions, and their relations.
334 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
335 foreach my $trdg ( @transp ) {
d120c995 336 next if exists $readings_at_rank{$trdg->id};
337 $has_transposition = 1;
338 my @affected_wits = _table_witnesses(
7e17346f 339 $table, $trdg, $lacunose, $aclabel );
d120c995 340 next unless @affected_wits;
341 map { $moved_wits{$_} = 1 } @affected_wits;
342 $transposed->{$trdg->id} =
7e17346f 343 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 344 $readings_at_rank{$trdg->id} = $trdg;
345 }
346 # ...or it is empty, ergo a gap.
7f52eac8 347 } else {
7e17346f 348 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 349 }
350 }
7e17346f 351 my $gap_wits = Set::Scalar->new();
352 map { _add_to_witlist( $_, $gap_wits, $aclabel )
353 unless $moved_wits{$_} } $check_for_gaps->members;
354
355 # Group the readings, collapsing groups by relationship if needed.
d120c995 356 my $grouped_readings = {};
4ce27d42 357 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 358 # Skip readings that have been collapsed into others.
d120c995 359 next if exists $grouped_readings->{$rdg->id}
360 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 361 # Get the witness list, including from readings collapsed into this one.
7e17346f 362 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
e23225e7 363 if( $collapse && $collapse->size ) {
364 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 365 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 366 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 367 push( @wits, @otherwits );
d120c995 368 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 369 }
370 }
7e17346f 371 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 372 }
7e17346f 373 if( $gap_wits->members ) {
374 $grouped_readings->{'(omitted)'} = $gap_wits;
375 }
376
7f52eac8 377 # Get rid of our collapsed readings
7e17346f 378 map { delete $grouped_readings->{$_} if(
379 $grouped_readings->{$_} eq 'COLLAPSE'
380 || $grouped_readings->{$_}->is_empty ) }
381 keys %$grouped_readings;
d120c995 382
383 # If something was transposed, check the groups for doubled-up readings
384 if( $has_transposition ) {
62a39b8f 385 # print STDERR "Group for rank $rank:\n";
386 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
387 # keys %$grouped_readings;
d120c995 388 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
389 }
7f52eac8 390
4ce27d42 391 # Return the result
d120c995 392 return $grouped_readings;
393}
394
395# Helper function to query the alignment table for all witnesses (a.c. included)
396# that have a given reading at its rank.
397sub _table_witnesses {
398 my( $table, $trdg, $lacunose, $aclabel ) = @_;
399 my $tableidx = $trdg->rank - 1;
7e17346f 400 my $has_reading = Set::Scalar->new();
d120c995 401 foreach my $row ( @{$table->{'alignment'}} ) {
402 my $wit = $row->{'witness'};
7e17346f 403 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 404 my $rdg = $row->{'tokens'}->[$tableidx];
405 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 406 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 407 if $rdg->{'t'}->id eq $trdg->id;
408 }
7e17346f 409 return $has_reading->members;
410}
411
412# Helper function to see if a witness is lacunose even if we are asking about
413# the a.c. version
414sub _is_lacunose {
415 my ( $wit, $lac, $acstr ) = @_;
416 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
417 $wit = $1;
418 }
419 return $lac->has( $wit );
d1348d38 420}
421
335a62ef 422# Helper function to ensure that X and X a.c. never appear in the same list.
423sub _add_to_witlist {
424 my( $wit, $list, $acstr ) = @_;
335a62ef 425 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 426 # Don't add X a.c. if we already have X
427 return if $list->has( $1 );
335a62ef 428 } else {
7e17346f 429 # Delete X a.c. if we are about to add X
430 $list->delete( $wit.$acstr );
335a62ef 431 }
7e17346f 432 $list->insert( $wit );
335a62ef 433}
434
d120c995 435sub _check_transposed_consistency {
436 my( $c, $rank, $transposed, $groupings ) = @_;
437 my %seen_wits;
438 my %thisrank;
439 # Note which readings are actually at this rank, and which witnesses
440 # belong to which reading.
441 foreach my $rdg ( keys %$groupings ) {
442 my $rdgobj = $c->reading( $rdg );
443 # Count '(omitted)' as a reading at this rank
444 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
445 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
446 }
447 # Our work is done if we have no witness belonging to more than one
448 # reading.
449 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
450 return unless @doubled;
451 # If we have a symmetric related transposition, drop the non-rank readings.
452 if( @doubled == scalar keys %seen_wits ) {
453 foreach my $rdg ( keys %$groupings ) {
454 if( !$thisrank{$rdg} ) {
455 my $groupstr = wit_stringify( $groupings->{$rdg} );
456 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
457 keys %thisrank;
458 delete $groupings->{$rdg};
459 # If we found a group match, assume there is a symmetry happening.
460 # TODO think more about this
62a39b8f 461 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 462 unless( $matched ) {
463 delete $transposed->{$rdg};
464 warn "Found problem in evident symmetry with reading $rdg";
465 }
466 }
467 }
468 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
469 } else {
470 foreach my $dup ( @doubled ) {
471 foreach my $rdg ( @{$seen_wits{$dup}} ) {
472 next if $thisrank{$rdg};
473 next unless exists $groupings->{$rdg};
62a39b8f 474 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 475 delete $groupings->{$rdg};
476 delete $transposed->{$rdg};
477 }
478 }
479 # and put any now-orphaned readings into an 'omitted' reading.
480 foreach my $wit ( keys %seen_wits ) {
481 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 482 $groupings->{'(omitted)'} = Set::Scalar->new()
483 unless exists $groupings->{'(omitted)'};
d120c995 484 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
485 }
486 }
487 }
488}
489
7e17346f 490# For the given grouping, return its situation graph based on the stemma.
491sub _graph_for_grouping {
492 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
493 my $acwits = [];
494 my $extant = {};
495 foreach my $gs ( values %$grouping ) {
496 map {
497 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
498 push( @$acwits, $1 ) unless $lacunose->has( $1 );
499 } else {
500 $extant->{$_} = 1 unless $lacunose->has( $_ );
501 }
502 } $gs->members;
503 }
504 my $graph;
505 try {
506 # contig contains all extant wits and all hypothetical wits
507 # needed to make up the groups.
ace5fce5 508 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
7e17346f 509 } catch ( Text::Tradition::Error $e ) {
03c0a7d7 510 throw( "Could not extend graph with given extant and a.c. witnesses: "
511 . $e->message );
7e17346f 512 } catch {
03c0a7d7 513 throw( "Could not extend graph with a.c. witnesses @$acwits" );
7e17346f 514 }
515 return $graph;
516}
88a6bac5 517
7e17346f 518=head2 solve_variants( $calcdir, @groups )
519
520Looks up the set of groups in the answers provided by the external graph solver
521service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 522
88a6bac5 523The answer has the form
524 { "variants" => [ array of variant location structures ],
525 "variant_count" => total,
526 "conflict_count" => number of conflicts detected,
527 "genealogical_count" => number of solutions found }
528
529=cut
530
531sub solve_variants {
03c0a7d7 532 my( @groups ) = @_;
533
a745c3d9 534 # Are we using a local result directory, or did we pass an empty value
535 # for the directory?
03c0a7d7 536 my $dir;
a745c3d9 537 unless( ref( $groups[0] ) eq 'HASH' ) {
03c0a7d7 538 $dir = shift @groups;
539 }
335a62ef 540
03c0a7d7 541 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
542 ## object so that we can send it off for IDP lookup.
7e17346f 543 my $variants = [];
544 my $genealogical = 0; # counter
03c0a7d7 545 # TODO Optimize for unique graph problems
554e2e7d 546 my %problems;
7e17346f 547 foreach my $graphproblem ( @groups ) {
03c0a7d7 548 # Construct the calc result key and look up its answer
549 my $problem = Text::Tradition::Analysis::Result->new(
550 graph => $graphproblem->{'graph'},
551 setlist => [ values %{$graphproblem->{'grouping'}} ] );
554e2e7d 552 if( exists $problems{$problem->object_key} ) {
553 $problem = $problems{$problem->object_key};
554 } else {
555 $problems{$problem->object_key} = $problem;
556 }
557 $graphproblem->{'object'} = $problem;
03c0a7d7 558 }
559
554e2e7d 560 my %results;
03c0a7d7 561 if( $dir ) {
562 my $scope = $dir->new_scope;
554e2e7d 563 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
564 } else {
565 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
566 [ values %problems ] );
03c0a7d7 567 # Send it off and get the result
568 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
569 my $ua = LWP::UserAgent->new();
570 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
571 'Content' => $json );
a745c3d9 572 my $answer;
03c0a7d7 573 if( $resp->is_success ) {
a745c3d9 574 $answer = decode_json( $resp->content );
03c0a7d7 575 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
576 } else {
577 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
578 . "; cannot run graph analysis" );
579 }
580 # One more sanity check
581 throw( "Something went wrong with answer symmetricity" )
554e2e7d 582 unless keys( %problems ) == @$answer;
03c0a7d7 583 # Convert the results
554e2e7d 584 foreach my $a ( @$answer ) {
585 my $r = Text::Tradition::Analysis::Result->new( $a );
586 $results{$r->object_key} = $r;
587 }
03c0a7d7 588 }
589
590 # We now have a single JSON-encoded Result object per problem sent. Fold its
591 # answers into our variant info structure.
554e2e7d 592 foreach my $graphproblem ( @groups ) {
593 my $result = $results{$graphproblem->{'object'}->object_key}
594 || $graphproblem->{'object'};
03c0a7d7 595
7e17346f 596 # Initialize the result structure for this graph problem
b7bd7aa5 597 my $vstruct;
598 if( $result->status eq 'OK' ) {
599 $vstruct = { readings => [] };
600 push( @$variants, $vstruct );
601 } else {
602 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
7e17346f 603 next;
5c44c598 604 }
b7bd7aa5 605
7e17346f 606 # 1. Did the group evaluate as genealogical?
03c0a7d7 607 $vstruct->{genealogical} = $result->is_genealogical;
608 $genealogical++ if $result->is_genealogical;
7e17346f 609
610 # 2. What are the calculated minimum groupings for each variant loc?
611 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
612 my $inputset = $graphproblem->{grouping}->{$rid};
03c0a7d7 613 my $minset = $result->minimum_grouping_for( $inputset );
7e17346f 614 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 615 }
7e17346f 616
617 # 3. What are the sources and classes calculated for each witness?
03c0a7d7 618 $vstruct->{witcopy_types} = { $result->classes };
7e17346f 619 $vstruct->{reading_roots} = {};
03c0a7d7 620 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
7e17346f 621
88a6bac5 622 }
623
624 return { 'variants' => $variants,
625 'variant_count' => scalar @$variants,
626 'genealogical_count' => $genealogical };
627}
628
b7bd7aa5 629sub _init_unsolved {
630 my( $graphproblem, $status ) = @_;
631 my $vstruct = { 'readings' => [] };
632 $vstruct->{'unsolved'} = $status;
633 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
634 push( @{$vstruct->{readings}}, { readingid => $rid,
635 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
636 }
637 return $vstruct;
638}
639
fae07016 640=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 641
fae07016 642Given the tradition, its stemma graph, and the solution from the graph solver,
643work out the rest of the information we want. For each reading we need missing,
5c44c598 644conflict, reading_parents, independent_occurrence, followed, not_followed,
645and follow_unknown. Alters the location_hash in place.
7f52eac8 646
647=cut
732152b1 648
fae07016 649sub analyze_location {
7e17346f 650 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 651 my $c = $tradition->collation;
fae07016 652
03c0a7d7 653 if( exists $variant_row->{'unsolved'} ) {
654 return;
655 }
656 my $reading_roots = delete $variant_row->{'reading_roots'};
657 my $classinfo = delete $variant_row->{'witcopy_types'};
658
fae07016 659 # Make a hash of all known node memberships, and make the subgraphs.
660 my $contig = {};
fae07016 661 my $subgraph = {};
638e2a95 662 my $acstr = $c->ac_label;
663 my @acwits;
5c44c598 664
5c44c598 665 # Note which witnesses positively belong to which group. This information
666 # comes ultimately from the IDP solver.
667 # Also make a note of the reading's roots.
fae07016 668 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
669 my $rid = $rdghash->{'readingid'};
5c44c598 670 my @roots;
638e2a95 671 foreach my $wit ( @{$rdghash->{'group'}} ) {
672 $contig->{$wit} = $rid;
673 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
674 push( @acwits, $1 );
675 }
5c44c598 676 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
677 push( @roots, $wit );
678 }
638e2a95 679 }
5c44c598 680 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 681 }
7e17346f 682
fae07016 683 # Now that we have all the node group memberships, calculate followed/
bebec0e9 684 # non-followed/unknown values for each reading. Also figure out the
685 # reading's evident parent(s).
686 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 687 my $rid = $rdghash->{'readingid'};
428bcf0b 688 my $rdg = $c->reading( $rid );
5c44c598 689 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 690 my @reversions;
691 if( $classinfo ) {
692 @reversions = grep { $classinfo->{$_} eq 'revert' }
693 $rdghash->{'group'}->members;
be590045 694 $rdghash->{'reversions'} = \@reversions;
7e17346f 695 }
5c44c598 696 my @group = @{$rdghash->{'group'}};
fae07016 697
698 # Start figuring things out.
7e17346f 699 $rdghash->{'followed'} = scalar( @group )
700 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 701 # Find the parent readings, if any, of this reading.
7e17346f 702 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 703 # Work out relationships between readings and their non-followed parent.
704 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
be590045 705 $rdghash->{'source_parents'} = $sourceparents;
e23225e7 706
707 if( @reversions ) {
708 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
709 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
710 $rdghash->{'reversion_parents'} = $revparents;
711 }
bebec0e9 712
713 # Find the number of times this reading was altered, and the number of
714 # times we're not sure.
715 my( %nofollow, %unknownfollow );
5c44c598 716 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 717 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 718 if( $reading_roots->{$wchild} && $contig->{$wchild}
719 && $contig->{$wchild} ne $rid ) {
bebec0e9 720 # It definitely changed here.
721 $nofollow{$wchild} = 1;
722 } elsif( !($contig->{$wchild}) ) {
723 # The child is a hypothetical node not definitely in
724 # any group. Answer is unknown.
725 $unknownfollow{$wchild} = 1;
5c44c598 726 } # else it is either in our group, or it is a non-root node in a
727 # known group and therefore is presumed to have its reading from
728 # its group, not this link.
bebec0e9 729 }
730 }
731 $rdghash->{'not_followed'} = keys %nofollow;
732 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 733
734 # Now say whether this reading represents a conflict.
735 unless( $variant_row->{'genealogical'} ) {
7e17346f 736 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 737 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 738 }
739 }
740}
741
742sub _find_reading_parents {
743 my( $rid, $graph, $contig, @list ) = @_;
744 my $parenthash = {};
745 foreach my $wit ( @list ) {
746 # Look in the stemma graph to find this witness's extant or known-reading
747 # immediate ancestor(s), and look up the reading that each ancestor holds.
748 my @check = $graph->predecessors( $wit );
749 while( @check ) {
750 my @next;
751 foreach my $wparent( @check ) {
752 my $preading = $contig->{$wparent};
753 if( $preading && $preading ne $rid ) {
754 $parenthash->{$preading} = 1;
755 } else {
756 push( @next, $graph->predecessors( $wparent ) );
757 }
758 }
759 @check = @next;
760 }
761 }
762 return $parenthash;
763}
764
765sub _resolve_parent_relationships {
766 my( $c, $rid, $rdg, $rdgparents ) = @_;
767 foreach my $p ( keys %$rdgparents ) {
768 # Resolve the relationship of the parent to the reading, and
769 # save it in our hash.
770 my $pobj = $c->reading( $p );
771 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
772 my $phash = { 'label' => $prep };
773 if( $pobj ) {
774 my $rel = $c->get_relationship( $p, $rid );
775 if( $rel ) {
776 _add_to_hash( $rel, $phash );
777 } elsif( $rdg ) {
778 # First check for a transposed relationship
779 if( $rdg->rank != $pobj->rank ) {
780 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
781 next unless $ti->text eq $rdg->text;
782 $rel = $c->get_relationship( $ti, $pobj );
783 if( $rel ) {
784 _add_to_hash( $rel, $phash, 1 );
785 last;
786 }
787 }
788 unless( $rel ) {
789 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
790 next unless $ti->text eq $pobj->text;
791 $rel = $c->get_relationship( $ti, $rdg );
792 if( $rel ) {
793 _add_to_hash( $rel, $phash, 1 );
794 last;
795 }
796 }
5c44c598 797 }
798 }
7e17346f 799 unless( $rel ) {
800 # and then check for sheer word similarity.
801 my $rtext = $rdg->text;
802 my $ptext = $pobj->text;
803 if( similar( $rtext, $ptext ) ) {
804 # say STDERR "Words $rtext and $ptext judged similar";
805 $phash->{relation} = { type => 'wordsimilar' };
806 }
807 }
5c44c598 808 } else {
7e17346f 809 $phash->{relation} = { type => 'deletion' };
5c44c598 810 }
7e17346f 811 # Get the attributes of the parent object while we are here
812 $phash->{'text'} = $pobj->text if $pobj;
a445ce40 813 if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
814 $phash->{'is_nonsense'} = $pobj->is_nonsense;
815 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
816 }
7e17346f 817 } elsif( $p eq '(omitted)' ) {
818 $phash->{relation} = { type => 'addition' };
819 }
820 # Save it
821 $rdgparents->{$p} = $phash;
822 }
d71100ed 823}
824
428bcf0b 825sub _add_to_hash {
826 my( $rel, $phash, $is_transposed ) = @_;
827 $phash->{relation} = { type => $rel->type };
828 $phash->{relation}->{transposed} = 1 if $is_transposed;
829 $phash->{relation}->{annotation} = $rel->annotation
830 if $rel->has_annotation;
831}
832
833=head2 similar( $word1, $word2 )
834
835Use Algorithm::Diff to get a sense of how close the words are to each other.
836This will hopefully handle substitutions a bit more nicely than Levenshtein.
837
838=cut
839
840#!/usr/bin/env perl
841
842sub similar {
843 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
844 my @let1 = split( '', lc( $word1 ) );
845 my @let2 = split( '', lc( $word2 ) );
846 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
847 my $mag = 0;
848 while( $diff->Next ) {
849 if( $diff->Same ) {
850 # Take off points for longer strings
851 my $cs = $diff->Range(1) - 2;
852 $cs = 0 if $cs < 0;
853 $mag -= $cs;
854 } elsif( !$diff->Items(1) ) {
855 $mag += $diff->Range(2);
856 } elsif( !$diff->Items(2) ) {
857 $mag += $diff->Range(1);
858 } else {
859 # Split the difference for substitutions
860 my $c1 = $diff->Range(1) || 1;
861 my $c2 = $diff->Range(2) || 1;
862 my $cd = ( $c1 + $c2 ) / 2;
863 $mag += $cd;
864 }
865 }
866 return ( $mag <= length( $word1 ) / 2 );
867}
868
5be0cdeb 869sub _useful_variant {
7e17346f 870 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 871
872 # Sort by group size and return
873 my $is_useful = 0;
7e17346f 874 foreach my $rdg ( keys %$rankgroup ) {
875 my @wits = $rankgroup->{$rdg}->members;
876 if( @wits > 1 ) {
5be0cdeb 877 $is_useful++;
878 } else {
7e17346f 879 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
880 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 881 }
882 }
7e17346f 883 return $is_useful > 1;
5be0cdeb 884}
885
7f52eac8 886=head2 wit_stringify( $groups )
887
888Takes an array of witness groupings and produces a string like
889['A','B'] / ['C','D','E'] / ['F']
d71100ed 890
7f52eac8 891=cut
d71100ed 892
893sub wit_stringify {
894 my $groups = shift;
895 my @gst;
896 # If we were passed an array of witnesses instead of an array of
897 # groupings, then "group" the witnesses first.
898 unless( ref( $groups->[0] ) ) {
899 my $mkgrp = [ $groups ];
900 $groups = $mkgrp;
901 }
902 foreach my $g ( @$groups ) {
903 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
904 }
905 return join( ' / ', @gst );
906}
7f52eac8 907
7f52eac8 9081;
909
a745c3d9 910sub throw {
911 Text::Tradition::Error->throw(
912 'ident' => 'Analysis error',
913 'message' => $_[0],
914 );
915}
916
7f52eac8 917=head1 LICENSE
918
919This package is free software and is provided "as is" without express
920or implied warranty. You can redistribute it and/or modify it under
921the same terms as Perl itself.
922
923=head1 AUTHOR
924
925Tara L Andrews E<lt>aurum@cpan.orgE<gt>