let stexaminer run without IDP server reachable
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
03c0a7d7 6use Encode qw/ decode_utf8 encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
a745c3d9 9use JSON qw/ to_json decode_json /;
03c0a7d7 10use LWP::UserAgent;
7e17346f 11use Set::Scalar;
7e17346f 12use Text::Tradition::Analysis::Result;
13use Text::Tradition::Directory;
d71100ed 14use Text::Tradition::Stemma;
d120c995 15use TryCatch;
d71100ed 16
d1348d38 17use vars qw/ @EXPORT_OK /;
a2cf85dd 18@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 19
03c0a7d7 20my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
7e17346f 21my $unsolved_problems = {};
5c44c598 22
7f52eac8 23=head1 NAME
24
25Text::Tradition::Analysis - functions for stemma analysis of a tradition
26
27=head1 SYNOPSIS
28
29 use Text::Tradition;
30 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
31 my $t = Text::Tradition->new(
32 'name' => 'this is a text',
33 'input' => 'TEI',
34 'file' => '/path/to/tei_parallel_seg_file.xml' );
35 $t->add_stemma( 'dotfile' => $stemmafile );
36
37 my $variant_data = run_analysis( $tradition );
38 # Recalculate rank $n treating all orthographic variants as equivalent
39 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
40
41=head1 DESCRIPTION
42
43Text::Tradition is a library for representation and analysis of collated
44texts, particularly medieval ones. The Collation is the central feature of
45a Tradition, where the text, its sequence of readings, and its relationships
46between readings are actually kept.
47
48=head1 SUBROUTINES
49
88a6bac5 50=head2 run_analysis( $tradition, %opts )
7f52eac8 51
88a6bac5 52Runs the analysis described in analyze_variant_location on every location in the
53collation of the given tradition, with the given options. These include:
7f52eac8 54
88a6bac5 55=over 4
56
57=item * stemma_id - Specify which of the tradition's stemmata to use. Default
58is 0 (i.e. the first).
59
60=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
61
62=item * merge_types - Specify a list of relationship types, where related readings
63should be treated as identical for the purposes of analysis.
64
ffa22d6f 65=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
66
88a6bac5 67=back
7f52eac8 68
69=begin testing
70
71use Text::Tradition;
72use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
73
74my $datafile = 't/data/florilegium_tei_ps.xml';
75my $tradition = Text::Tradition->new( 'input' => 'TEI',
76 'name' => 'test0',
77 'file' => $datafile );
78my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
79is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
80
f00cefe8 81my %expected_genealogical = (
a44aaf2a 82 1 => 0,
f00cefe8 83 2 => 1,
a44aaf2a 84 3 => 0,
85 5 => 0,
86 7 => 0,
87 8 => 0,
88 10 => 0,
f00cefe8 89 13 => 1,
a44aaf2a 90 33 => 0,
91 34 => 0,
92 37 => 0,
93 60 => 0,
f00cefe8 94 81 => 1,
a44aaf2a 95 84 => 0,
96 87 => 0,
97 101 => 0,
98 102 => 0,
f00cefe8 99 122 => 1,
a44aaf2a 100 157 => 0,
f00cefe8 101 166 => 1,
102 169 => 1,
a44aaf2a 103 200 => 0,
f00cefe8 104 216 => 1,
105 217 => 1,
106 219 => 1,
107 241 => 1,
108 242 => 1,
109 243 => 1,
110);
111
7e17346f 112my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 113my $c = $tradition->collation;
f00cefe8 114foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 115 # Account for rows that used to be "not useful"
116 unless( exists $expected_genealogical{$row->{'id'}} ) {
117 $expected_genealogical{$row->{'id'}} = 1;
118 }
18f48b82 119 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
120 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 121 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 122 # Check that we have the right row with the right groups
123 my $rank = $row->{'id'};
124 foreach my $rdghash ( @{$row->{'readings'}} ) {
125 # Skip 'readings' that aren't really
126 next unless $c->reading( $rdghash->{'readingid'} );
127 # Check the rank
128 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
129 "Got correct reading rank" );
130 # Check the witnesses
131 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
132 my @sgrp = sort @{$rdghash->{'group'}};
133 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
134 }
f00cefe8 135}
a44aaf2a 136is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 137# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 138
139=end testing
140
141=cut
142
d71100ed 143sub run_analysis {
88a6bac5 144 my( $tradition, %opts ) = @_;
f00cefe8 145 my $c = $tradition->collation;
7e17346f 146 my $aclabel = $c->ac_label;
88a6bac5 147
148 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 149 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 150 my $collapse = Set::Scalar->new();
151 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
152 $collapse->insert( @{$opts{'merge_types'}} );
153 } elsif( $opts{'merge_types'} ) {
154 $collapse->insert( $opts{'merge_types'} );
155 }
7e17346f 156
157 # Make sure we have a lookup DB for graph calculation results; this will die
158 # if calcdir or calcdsn isn't passed.
03c0a7d7 159 my $dir;
160 if( exists $opts{'calcdir'} ) {
161 $dir = delete $opts{'calcdir'}
162 } elsif ( exists $opts{'calcdsn'} ) {
163 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
164 }
88a6bac5 165
166 # Get the stemma
167 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 168
4ce27d42 169 # Figure out which witnesses we are working with - that is, the ones that
170 # appear both in the stemma and in the tradition. All others are 'lacunose'
171 # for our purposes.
7e17346f 172 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
173 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
174 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
175 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 176
177 # Find and mark 'common' ranks for exclusion, unless they were
178 # explicitly specified.
179 unless( @ranks ) {
180 my %common_rank;
a44aaf2a 181 foreach my $rdg ( $c->common_readings ) {
88a6bac5 182 $common_rank{$rdg->rank} = 1;
183 }
184 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 185 }
7f52eac8 186
88a6bac5 187 # Group the variants to send to the solver
188 my @groups;
f629cb3b 189 my @use_ranks;
a44aaf2a 190 my %lacunae;
94654e27 191 my $moved = {};
88a6bac5 192 foreach my $rank ( @ranks ) {
7e17346f 193 my $missing = $lacunose->clone();
e23225e7 194 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 195 # Filter out any empty rankgroups
196 # (e.g. from the later rank for a transposition)
197 next unless keys %$rankgroup;
7e17346f 198 # Get the graph for this rankgroup
199 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 200 if( $opts{'exclude_type1'} ) {
201 # Check to see whether this is a "useful" group.
7e17346f 202 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 203 }
f629cb3b 204 push( @use_ranks, $rank );
7e17346f 205 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 206 $lacunae{$rank} = $missing;
d71100ed 207 }
4ce27d42 208 # Run the solver
b7bd7aa5 209 my $answer;
210 try {
211 $answer = solve_variants( $dir, @groups );
212 } catch ( Text::Tradition::Error $e ) {
213 if( $e->message =~ /IDP/ ) {
214 # Something is wrong with the solver; make the variants table anyway
215 $answer->{'variants'} = [];
216 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
217 @groups;
218 } else {
219 # Something else is wrong; error out.
220 $e->throw;
221 }
222 }
fae07016 223
88a6bac5 224 # Do further analysis on the answer
a44aaf2a 225 my $conflict_count = 0;
7e17346f 226 my $reversion_count = 0;
f629cb3b 227 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 228 my $location = $answer->{'variants'}->[$idx];
229 # Add the rank back in
94654e27 230 my $rank = $use_ranks[$idx];
231 $location->{'id'} = $rank;
7234b01d 232 # Note what our lacunae are
f629cb3b 233 my %lmiss;
7234b01d 234 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 235 $location->{'missing'} = [ keys %lmiss ];
236
88a6bac5 237 # Run the extra analysis we need.
428bcf0b 238 ## TODO We run through all the variants in this call, so
239 ## why not add the reading data there instead of here below?
7e17346f 240 my $graph = $groups[$idx]->{graph};
241 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 242
638e2a95 243 my @layerwits;
7234b01d 244 # Do the final post-analysis tidying up of the data.
a44aaf2a 245 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 246 $conflict_count++ if $rdghash->{'is_conflict'};
247 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 248 # Add the reading text back in, setting display value as needed
a44aaf2a 249 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 250 if( $rdg ) {
251 $rdghash->{'text'} = $rdg->text .
252 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 253 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
254 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 255 }
f629cb3b 256 # Remove lacunose witnesses from this reading's list now that the
7234b01d 257 # analysis is done
f629cb3b 258 my @realgroup;
7234b01d 259 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 260 $rdghash->{'group'} = \@realgroup;
638e2a95 261 # Note any layered witnesses that appear in this group
262 foreach( @realgroup ) {
263 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
264 push( @layerwits, $1 );
265 }
266 }
a44aaf2a 267 }
638e2a95 268 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 269 }
a44aaf2a 270 $answer->{'conflict_count'} = $conflict_count;
7e17346f 271 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 272
88a6bac5 273 return $answer;
d71100ed 274}
275
e23225e7 276=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 277
278Groups the variants at the given $rank of the collation, treating any
e23225e7 279relationships in the set $merge_relationship_types as equivalent.
280$lacunose should be a reference to an array, to which the sigla of lacunose
281witnesses at this rank will be appended; $transposed should be a reference
282to a hash, wherein the identities of transposed readings and their
283relatives will be stored.
7f52eac8 284
ffa22d6f 285Returns a hash $group_readings where $rdg is attested by the witnesses listed
286in $group_readings->{$rdg}.
7f52eac8 287
288=cut
289
290# Return group_readings, groups, lacunose
d1348d38 291sub group_variants {
94654e27 292 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 293 my $c = $tradition->collation;
335a62ef 294 my $aclabel = $c->ac_label;
d120c995 295 my $table = $c->alignment_table;
7f52eac8 296 # Get the alignment table readings
297 my %readings_at_rank;
7e17346f 298 my $check_for_gaps = Set::Scalar->new();
94654e27 299 my %moved_wits;
d120c995 300 my $has_transposition;
301 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 302 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 303 my $wit = $tablewit->{'witness'};
ffa22d6f 304 # Exclude the witness if it is "lacunose" which if we got here
305 # means "not in the stemma".
7e17346f 306 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 307 # Note if the witness is actually in a lacuna
7f52eac8 308 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 309 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 310 # Otherwise the witness either has a positive reading...
7f52eac8 311 } elsif( $rdg ) {
94654e27 312 # If the reading has been counted elsewhere as a transposition, ignore it.
313 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 314 # TODO Does this cope with three-way transpositions?
94654e27 315 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
316 next;
317 }
318 # Otherwise, record it...
319 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
320 # ...and grab any transpositions, and their relations.
321 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
322 foreach my $trdg ( @transp ) {
d120c995 323 next if exists $readings_at_rank{$trdg->id};
324 $has_transposition = 1;
325 my @affected_wits = _table_witnesses(
7e17346f 326 $table, $trdg, $lacunose, $aclabel );
d120c995 327 next unless @affected_wits;
328 map { $moved_wits{$_} = 1 } @affected_wits;
329 $transposed->{$trdg->id} =
7e17346f 330 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 331 $readings_at_rank{$trdg->id} = $trdg;
332 }
333 # ...or it is empty, ergo a gap.
7f52eac8 334 } else {
7e17346f 335 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 336 }
337 }
7e17346f 338 my $gap_wits = Set::Scalar->new();
339 map { _add_to_witlist( $_, $gap_wits, $aclabel )
340 unless $moved_wits{$_} } $check_for_gaps->members;
341
342 # Group the readings, collapsing groups by relationship if needed.
d120c995 343 my $grouped_readings = {};
4ce27d42 344 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 345 # Skip readings that have been collapsed into others.
d120c995 346 next if exists $grouped_readings->{$rdg->id}
347 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 348 # Get the witness list, including from readings collapsed into this one.
7e17346f 349 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
e23225e7 350 if( $collapse && $collapse->size ) {
351 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 352 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 353 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 354 push( @wits, @otherwits );
d120c995 355 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 356 }
357 }
7e17346f 358 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 359 }
7e17346f 360 if( $gap_wits->members ) {
361 $grouped_readings->{'(omitted)'} = $gap_wits;
362 }
363
7f52eac8 364 # Get rid of our collapsed readings
7e17346f 365 map { delete $grouped_readings->{$_} if(
366 $grouped_readings->{$_} eq 'COLLAPSE'
367 || $grouped_readings->{$_}->is_empty ) }
368 keys %$grouped_readings;
d120c995 369
370 # If something was transposed, check the groups for doubled-up readings
371 if( $has_transposition ) {
62a39b8f 372 # print STDERR "Group for rank $rank:\n";
373 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
374 # keys %$grouped_readings;
d120c995 375 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
376 }
7f52eac8 377
4ce27d42 378 # Return the result
d120c995 379 return $grouped_readings;
380}
381
382# Helper function to query the alignment table for all witnesses (a.c. included)
383# that have a given reading at its rank.
384sub _table_witnesses {
385 my( $table, $trdg, $lacunose, $aclabel ) = @_;
386 my $tableidx = $trdg->rank - 1;
7e17346f 387 my $has_reading = Set::Scalar->new();
d120c995 388 foreach my $row ( @{$table->{'alignment'}} ) {
389 my $wit = $row->{'witness'};
7e17346f 390 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 391 my $rdg = $row->{'tokens'}->[$tableidx];
392 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 393 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 394 if $rdg->{'t'}->id eq $trdg->id;
395 }
7e17346f 396 return $has_reading->members;
397}
398
399# Helper function to see if a witness is lacunose even if we are asking about
400# the a.c. version
401sub _is_lacunose {
402 my ( $wit, $lac, $acstr ) = @_;
403 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
404 $wit = $1;
405 }
406 return $lac->has( $wit );
d1348d38 407}
408
335a62ef 409# Helper function to ensure that X and X a.c. never appear in the same list.
410sub _add_to_witlist {
411 my( $wit, $list, $acstr ) = @_;
335a62ef 412 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 413 # Don't add X a.c. if we already have X
414 return if $list->has( $1 );
335a62ef 415 } else {
7e17346f 416 # Delete X a.c. if we are about to add X
417 $list->delete( $wit.$acstr );
335a62ef 418 }
7e17346f 419 $list->insert( $wit );
335a62ef 420}
421
d120c995 422sub _check_transposed_consistency {
423 my( $c, $rank, $transposed, $groupings ) = @_;
424 my %seen_wits;
425 my %thisrank;
426 # Note which readings are actually at this rank, and which witnesses
427 # belong to which reading.
428 foreach my $rdg ( keys %$groupings ) {
429 my $rdgobj = $c->reading( $rdg );
430 # Count '(omitted)' as a reading at this rank
431 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
432 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
433 }
434 # Our work is done if we have no witness belonging to more than one
435 # reading.
436 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
437 return unless @doubled;
438 # If we have a symmetric related transposition, drop the non-rank readings.
439 if( @doubled == scalar keys %seen_wits ) {
440 foreach my $rdg ( keys %$groupings ) {
441 if( !$thisrank{$rdg} ) {
442 my $groupstr = wit_stringify( $groupings->{$rdg} );
443 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
444 keys %thisrank;
445 delete $groupings->{$rdg};
446 # If we found a group match, assume there is a symmetry happening.
447 # TODO think more about this
62a39b8f 448 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 449 unless( $matched ) {
450 delete $transposed->{$rdg};
451 warn "Found problem in evident symmetry with reading $rdg";
452 }
453 }
454 }
455 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
456 } else {
457 foreach my $dup ( @doubled ) {
458 foreach my $rdg ( @{$seen_wits{$dup}} ) {
459 next if $thisrank{$rdg};
460 next unless exists $groupings->{$rdg};
62a39b8f 461 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 462 delete $groupings->{$rdg};
463 delete $transposed->{$rdg};
464 }
465 }
466 # and put any now-orphaned readings into an 'omitted' reading.
467 foreach my $wit ( keys %seen_wits ) {
468 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 469 $groupings->{'(omitted)'} = Set::Scalar->new()
470 unless exists $groupings->{'(omitted)'};
d120c995 471 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
472 }
473 }
474 }
475}
476
7e17346f 477# For the given grouping, return its situation graph based on the stemma.
478sub _graph_for_grouping {
479 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
480 my $acwits = [];
481 my $extant = {};
482 foreach my $gs ( values %$grouping ) {
483 map {
484 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
485 push( @$acwits, $1 ) unless $lacunose->has( $1 );
486 } else {
487 $extant->{$_} = 1 unless $lacunose->has( $_ );
488 }
489 } $gs->members;
490 }
491 my $graph;
492 try {
493 # contig contains all extant wits and all hypothetical wits
494 # needed to make up the groups.
495 $graph = $stemma->situation_graph( $extant, $acwits );
496 } catch ( Text::Tradition::Error $e ) {
03c0a7d7 497 throw( "Could not extend graph with given extant and a.c. witnesses: "
498 . $e->message );
7e17346f 499 } catch {
03c0a7d7 500 throw( "Could not extend graph with a.c. witnesses @$acwits" );
7e17346f 501 }
502 return $graph;
503}
88a6bac5 504
7e17346f 505=head2 solve_variants( $calcdir, @groups )
506
507Looks up the set of groups in the answers provided by the external graph solver
508service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 509
88a6bac5 510The answer has the form
511 { "variants" => [ array of variant location structures ],
512 "variant_count" => total,
513 "conflict_count" => number of conflicts detected,
514 "genealogical_count" => number of solutions found }
515
516=cut
517
518sub solve_variants {
03c0a7d7 519 my( @groups ) = @_;
520
a745c3d9 521 # Are we using a local result directory, or did we pass an empty value
522 # for the directory?
03c0a7d7 523 my $dir;
a745c3d9 524 unless( ref( $groups[0] ) eq 'HASH' ) {
03c0a7d7 525 $dir = shift @groups;
526 }
335a62ef 527
03c0a7d7 528 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
529 ## object so that we can send it off for IDP lookup.
7e17346f 530 my $variants = [];
531 my $genealogical = 0; # counter
03c0a7d7 532 # TODO Optimize for unique graph problems
554e2e7d 533 my %problems;
7e17346f 534 foreach my $graphproblem ( @groups ) {
03c0a7d7 535 # Construct the calc result key and look up its answer
536 my $problem = Text::Tradition::Analysis::Result->new(
537 graph => $graphproblem->{'graph'},
538 setlist => [ values %{$graphproblem->{'grouping'}} ] );
554e2e7d 539 if( exists $problems{$problem->object_key} ) {
540 $problem = $problems{$problem->object_key};
541 } else {
542 $problems{$problem->object_key} = $problem;
543 }
544 $graphproblem->{'object'} = $problem;
03c0a7d7 545 }
546
554e2e7d 547 my %results;
03c0a7d7 548 if( $dir ) {
549 my $scope = $dir->new_scope;
554e2e7d 550 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
551 } else {
552 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
553 [ values %problems ] );
03c0a7d7 554 # Send it off and get the result
555 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
556 my $ua = LWP::UserAgent->new();
557 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
558 'Content' => $json );
a745c3d9 559 my $answer;
03c0a7d7 560 if( $resp->is_success ) {
a745c3d9 561 $answer = decode_json( $resp->content );
03c0a7d7 562 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
563 } else {
564 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
565 . "; cannot run graph analysis" );
566 }
567 # One more sanity check
568 throw( "Something went wrong with answer symmetricity" )
554e2e7d 569 unless keys( %problems ) == @$answer;
03c0a7d7 570 # Convert the results
554e2e7d 571 foreach my $a ( @$answer ) {
572 my $r = Text::Tradition::Analysis::Result->new( $a );
573 $results{$r->object_key} = $r;
574 }
03c0a7d7 575 }
576
577 # We now have a single JSON-encoded Result object per problem sent. Fold its
578 # answers into our variant info structure.
554e2e7d 579 foreach my $graphproblem ( @groups ) {
580 my $result = $results{$graphproblem->{'object'}->object_key}
581 || $graphproblem->{'object'};
03c0a7d7 582
7e17346f 583 # Initialize the result structure for this graph problem
b7bd7aa5 584 my $vstruct;
585 if( $result->status eq 'OK' ) {
586 $vstruct = { readings => [] };
587 push( @$variants, $vstruct );
588 } else {
589 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
7e17346f 590 next;
5c44c598 591 }
b7bd7aa5 592
7e17346f 593 # 1. Did the group evaluate as genealogical?
03c0a7d7 594 $vstruct->{genealogical} = $result->is_genealogical;
595 $genealogical++ if $result->is_genealogical;
7e17346f 596
597 # 2. What are the calculated minimum groupings for each variant loc?
598 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
599 my $inputset = $graphproblem->{grouping}->{$rid};
03c0a7d7 600 my $minset = $result->minimum_grouping_for( $inputset );
7e17346f 601 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 602 }
7e17346f 603
604 # 3. What are the sources and classes calculated for each witness?
03c0a7d7 605 $vstruct->{witcopy_types} = { $result->classes };
7e17346f 606 $vstruct->{reading_roots} = {};
03c0a7d7 607 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
7e17346f 608
88a6bac5 609 }
610
611 return { 'variants' => $variants,
612 'variant_count' => scalar @$variants,
613 'genealogical_count' => $genealogical };
614}
615
b7bd7aa5 616sub _init_unsolved {
617 my( $graphproblem, $status ) = @_;
618 my $vstruct = { 'readings' => [] };
619 $vstruct->{'unsolved'} = $status;
620 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
621 push( @{$vstruct->{readings}}, { readingid => $rid,
622 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
623 }
624 return $vstruct;
625}
626
fae07016 627=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 628
fae07016 629Given the tradition, its stemma graph, and the solution from the graph solver,
630work out the rest of the information we want. For each reading we need missing,
5c44c598 631conflict, reading_parents, independent_occurrence, followed, not_followed,
632and follow_unknown. Alters the location_hash in place.
7f52eac8 633
634=cut
732152b1 635
fae07016 636sub analyze_location {
7e17346f 637 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 638 my $c = $tradition->collation;
fae07016 639
03c0a7d7 640 if( exists $variant_row->{'unsolved'} ) {
641 return;
642 }
643 my $reading_roots = delete $variant_row->{'reading_roots'};
644 my $classinfo = delete $variant_row->{'witcopy_types'};
645
fae07016 646 # Make a hash of all known node memberships, and make the subgraphs.
647 my $contig = {};
fae07016 648 my $subgraph = {};
638e2a95 649 my $acstr = $c->ac_label;
650 my @acwits;
5c44c598 651
5c44c598 652 # Note which witnesses positively belong to which group. This information
653 # comes ultimately from the IDP solver.
654 # Also make a note of the reading's roots.
fae07016 655 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
656 my $rid = $rdghash->{'readingid'};
5c44c598 657 my @roots;
638e2a95 658 foreach my $wit ( @{$rdghash->{'group'}} ) {
659 $contig->{$wit} = $rid;
660 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
661 push( @acwits, $1 );
662 }
5c44c598 663 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
664 push( @roots, $wit );
665 }
638e2a95 666 }
5c44c598 667 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 668 }
7e17346f 669
fae07016 670 # Now that we have all the node group memberships, calculate followed/
bebec0e9 671 # non-followed/unknown values for each reading. Also figure out the
672 # reading's evident parent(s).
673 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 674 my $rid = $rdghash->{'readingid'};
428bcf0b 675 my $rdg = $c->reading( $rid );
5c44c598 676 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 677 my @reversions;
678 if( $classinfo ) {
679 @reversions = grep { $classinfo->{$_} eq 'revert' }
680 $rdghash->{'group'}->members;
be590045 681 $rdghash->{'reversions'} = \@reversions;
7e17346f 682 }
5c44c598 683 my @group = @{$rdghash->{'group'}};
fae07016 684
685 # Start figuring things out.
7e17346f 686 $rdghash->{'followed'} = scalar( @group )
687 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 688 # Find the parent readings, if any, of this reading.
7e17346f 689 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 690 # Work out relationships between readings and their non-followed parent.
691 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
be590045 692 $rdghash->{'source_parents'} = $sourceparents;
e23225e7 693
694 if( @reversions ) {
695 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
696 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
697 $rdghash->{'reversion_parents'} = $revparents;
698 }
bebec0e9 699
700 # Find the number of times this reading was altered, and the number of
701 # times we're not sure.
702 my( %nofollow, %unknownfollow );
5c44c598 703 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 704 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 705 if( $reading_roots->{$wchild} && $contig->{$wchild}
706 && $contig->{$wchild} ne $rid ) {
bebec0e9 707 # It definitely changed here.
708 $nofollow{$wchild} = 1;
709 } elsif( !($contig->{$wchild}) ) {
710 # The child is a hypothetical node not definitely in
711 # any group. Answer is unknown.
712 $unknownfollow{$wchild} = 1;
5c44c598 713 } # else it is either in our group, or it is a non-root node in a
714 # known group and therefore is presumed to have its reading from
715 # its group, not this link.
bebec0e9 716 }
717 }
718 $rdghash->{'not_followed'} = keys %nofollow;
719 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 720
721 # Now say whether this reading represents a conflict.
722 unless( $variant_row->{'genealogical'} ) {
7e17346f 723 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 724 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 725 }
726 }
727}
728
729sub _find_reading_parents {
730 my( $rid, $graph, $contig, @list ) = @_;
731 my $parenthash = {};
732 foreach my $wit ( @list ) {
733 # Look in the stemma graph to find this witness's extant or known-reading
734 # immediate ancestor(s), and look up the reading that each ancestor holds.
735 my @check = $graph->predecessors( $wit );
736 while( @check ) {
737 my @next;
738 foreach my $wparent( @check ) {
739 my $preading = $contig->{$wparent};
740 if( $preading && $preading ne $rid ) {
741 $parenthash->{$preading} = 1;
742 } else {
743 push( @next, $graph->predecessors( $wparent ) );
744 }
745 }
746 @check = @next;
747 }
748 }
749 return $parenthash;
750}
751
752sub _resolve_parent_relationships {
753 my( $c, $rid, $rdg, $rdgparents ) = @_;
754 foreach my $p ( keys %$rdgparents ) {
755 # Resolve the relationship of the parent to the reading, and
756 # save it in our hash.
757 my $pobj = $c->reading( $p );
758 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
759 my $phash = { 'label' => $prep };
760 if( $pobj ) {
761 my $rel = $c->get_relationship( $p, $rid );
762 if( $rel ) {
763 _add_to_hash( $rel, $phash );
764 } elsif( $rdg ) {
765 # First check for a transposed relationship
766 if( $rdg->rank != $pobj->rank ) {
767 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
768 next unless $ti->text eq $rdg->text;
769 $rel = $c->get_relationship( $ti, $pobj );
770 if( $rel ) {
771 _add_to_hash( $rel, $phash, 1 );
772 last;
773 }
774 }
775 unless( $rel ) {
776 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
777 next unless $ti->text eq $pobj->text;
778 $rel = $c->get_relationship( $ti, $rdg );
779 if( $rel ) {
780 _add_to_hash( $rel, $phash, 1 );
781 last;
782 }
783 }
5c44c598 784 }
785 }
7e17346f 786 unless( $rel ) {
787 # and then check for sheer word similarity.
788 my $rtext = $rdg->text;
789 my $ptext = $pobj->text;
790 if( similar( $rtext, $ptext ) ) {
791 # say STDERR "Words $rtext and $ptext judged similar";
792 $phash->{relation} = { type => 'wordsimilar' };
793 }
794 }
5c44c598 795 } else {
7e17346f 796 $phash->{relation} = { type => 'deletion' };
5c44c598 797 }
7e17346f 798 # Get the attributes of the parent object while we are here
799 $phash->{'text'} = $pobj->text if $pobj;
800 $phash->{'is_nonsense'} = $pobj->is_nonsense;
801 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
802 } elsif( $p eq '(omitted)' ) {
803 $phash->{relation} = { type => 'addition' };
804 }
805 # Save it
806 $rdgparents->{$p} = $phash;
807 }
d71100ed 808}
809
428bcf0b 810sub _add_to_hash {
811 my( $rel, $phash, $is_transposed ) = @_;
812 $phash->{relation} = { type => $rel->type };
813 $phash->{relation}->{transposed} = 1 if $is_transposed;
814 $phash->{relation}->{annotation} = $rel->annotation
815 if $rel->has_annotation;
816}
817
818=head2 similar( $word1, $word2 )
819
820Use Algorithm::Diff to get a sense of how close the words are to each other.
821This will hopefully handle substitutions a bit more nicely than Levenshtein.
822
823=cut
824
825#!/usr/bin/env perl
826
827sub similar {
828 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
829 my @let1 = split( '', lc( $word1 ) );
830 my @let2 = split( '', lc( $word2 ) );
831 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
832 my $mag = 0;
833 while( $diff->Next ) {
834 if( $diff->Same ) {
835 # Take off points for longer strings
836 my $cs = $diff->Range(1) - 2;
837 $cs = 0 if $cs < 0;
838 $mag -= $cs;
839 } elsif( !$diff->Items(1) ) {
840 $mag += $diff->Range(2);
841 } elsif( !$diff->Items(2) ) {
842 $mag += $diff->Range(1);
843 } else {
844 # Split the difference for substitutions
845 my $c1 = $diff->Range(1) || 1;
846 my $c2 = $diff->Range(2) || 1;
847 my $cd = ( $c1 + $c2 ) / 2;
848 $mag += $cd;
849 }
850 }
851 return ( $mag <= length( $word1 ) / 2 );
852}
853
5be0cdeb 854sub _useful_variant {
7e17346f 855 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 856
857 # Sort by group size and return
858 my $is_useful = 0;
7e17346f 859 foreach my $rdg ( keys %$rankgroup ) {
860 my @wits = $rankgroup->{$rdg}->members;
861 if( @wits > 1 ) {
5be0cdeb 862 $is_useful++;
863 } else {
7e17346f 864 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
865 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 866 }
867 }
7e17346f 868 return $is_useful > 1;
5be0cdeb 869}
870
7f52eac8 871=head2 wit_stringify( $groups )
872
873Takes an array of witness groupings and produces a string like
874['A','B'] / ['C','D','E'] / ['F']
d71100ed 875
7f52eac8 876=cut
d71100ed 877
878sub wit_stringify {
879 my $groups = shift;
880 my @gst;
881 # If we were passed an array of witnesses instead of an array of
882 # groupings, then "group" the witnesses first.
883 unless( ref( $groups->[0] ) ) {
884 my $mkgrp = [ $groups ];
885 $groups = $mkgrp;
886 }
887 foreach my $g ( @$groups ) {
888 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
889 }
890 return join( ' / ', @gst );
891}
7f52eac8 892
7f52eac8 8931;
894
a745c3d9 895sub throw {
896 Text::Tradition::Error->throw(
897 'ident' => 'Analysis error',
898 'message' => $_[0],
899 );
900}
901
7f52eac8 902=head1 LICENSE
903
904This package is free software and is provided "as is" without express
905or implied warranty. You can redistribute it and/or modify it under
906the same terms as Perl itself.
907
908=head1 AUTHOR
909
910Tara L Andrews E<lt>aurum@cpan.orgE<gt>