guard against non-array values in stringify
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
03c0a7d7 6use Encode qw/ decode_utf8 encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
a745c3d9 9use JSON qw/ to_json decode_json /;
03c0a7d7 10use LWP::UserAgent;
7e17346f 11use Set::Scalar;
7e17346f 12use Text::Tradition::Analysis::Result;
13use Text::Tradition::Directory;
d71100ed 14use Text::Tradition::Stemma;
d120c995 15use TryCatch;
d71100ed 16
d1348d38 17use vars qw/ @EXPORT_OK /;
a2cf85dd 18@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 19
03c0a7d7 20my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
7e17346f 21my $unsolved_problems = {};
5c44c598 22
7f52eac8 23=head1 NAME
24
25Text::Tradition::Analysis - functions for stemma analysis of a tradition
26
27=head1 SYNOPSIS
28
29 use Text::Tradition;
30 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
31 my $t = Text::Tradition->new(
32 'name' => 'this is a text',
33 'input' => 'TEI',
34 'file' => '/path/to/tei_parallel_seg_file.xml' );
35 $t->add_stemma( 'dotfile' => $stemmafile );
36
37 my $variant_data = run_analysis( $tradition );
38 # Recalculate rank $n treating all orthographic variants as equivalent
39 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
40
41=head1 DESCRIPTION
42
43Text::Tradition is a library for representation and analysis of collated
44texts, particularly medieval ones. The Collation is the central feature of
45a Tradition, where the text, its sequence of readings, and its relationships
46between readings are actually kept.
47
48=head1 SUBROUTINES
49
88a6bac5 50=head2 run_analysis( $tradition, %opts )
7f52eac8 51
88a6bac5 52Runs the analysis described in analyze_variant_location on every location in the
53collation of the given tradition, with the given options. These include:
7f52eac8 54
88a6bac5 55=over 4
56
57=item * stemma_id - Specify which of the tradition's stemmata to use. Default
58is 0 (i.e. the first).
59
60=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
61
62=item * merge_types - Specify a list of relationship types, where related readings
63should be treated as identical for the purposes of analysis.
64
ffa22d6f 65=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
66
88a6bac5 67=back
7f52eac8 68
69=begin testing
70
71use Text::Tradition;
72use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
73
74my $datafile = 't/data/florilegium_tei_ps.xml';
75my $tradition = Text::Tradition->new( 'input' => 'TEI',
76 'name' => 'test0',
77 'file' => $datafile );
78my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
79is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
80
f00cefe8 81my %expected_genealogical = (
a44aaf2a 82 1 => 0,
f00cefe8 83 2 => 1,
a44aaf2a 84 3 => 0,
85 5 => 0,
86 7 => 0,
87 8 => 0,
88 10 => 0,
f00cefe8 89 13 => 1,
a44aaf2a 90 33 => 0,
91 34 => 0,
92 37 => 0,
93 60 => 0,
f00cefe8 94 81 => 1,
a44aaf2a 95 84 => 0,
96 87 => 0,
97 101 => 0,
98 102 => 0,
f00cefe8 99 122 => 1,
a44aaf2a 100 157 => 0,
f00cefe8 101 166 => 1,
102 169 => 1,
a44aaf2a 103 200 => 0,
f00cefe8 104 216 => 1,
105 217 => 1,
106 219 => 1,
107 241 => 1,
108 242 => 1,
109 243 => 1,
110);
111
7e17346f 112my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 113my $c = $tradition->collation;
f00cefe8 114foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 115 # Account for rows that used to be "not useful"
116 unless( exists $expected_genealogical{$row->{'id'}} ) {
117 $expected_genealogical{$row->{'id'}} = 1;
118 }
18f48b82 119 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
120 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 121 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 122 # Check that we have the right row with the right groups
123 my $rank = $row->{'id'};
124 foreach my $rdghash ( @{$row->{'readings'}} ) {
125 # Skip 'readings' that aren't really
126 next unless $c->reading( $rdghash->{'readingid'} );
127 # Check the rank
128 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
129 "Got correct reading rank" );
130 # Check the witnesses
131 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
132 my @sgrp = sort @{$rdghash->{'group'}};
133 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
134 }
f00cefe8 135}
a44aaf2a 136is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 137# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 138
139=end testing
140
141=cut
142
d71100ed 143sub run_analysis {
88a6bac5 144 my( $tradition, %opts ) = @_;
f00cefe8 145 my $c = $tradition->collation;
7e17346f 146 my $aclabel = $c->ac_label;
88a6bac5 147
148 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 149 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 150 my $collapse = Set::Scalar->new();
151 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
152 $collapse->insert( @{$opts{'merge_types'}} );
153 } elsif( $opts{'merge_types'} ) {
154 $collapse->insert( $opts{'merge_types'} );
155 }
7e17346f 156
157 # Make sure we have a lookup DB for graph calculation results; this will die
158 # if calcdir or calcdsn isn't passed.
03c0a7d7 159 my $dir;
160 if( exists $opts{'calcdir'} ) {
161 $dir = delete $opts{'calcdir'}
162 } elsif ( exists $opts{'calcdsn'} ) {
163 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
164 }
88a6bac5 165
166 # Get the stemma
167 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 168
4ce27d42 169 # Figure out which witnesses we are working with - that is, the ones that
170 # appear both in the stemma and in the tradition. All others are 'lacunose'
171 # for our purposes.
7e17346f 172 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
173 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
174 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
175 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 176
177 # Find and mark 'common' ranks for exclusion, unless they were
178 # explicitly specified.
179 unless( @ranks ) {
180 my %common_rank;
a44aaf2a 181 foreach my $rdg ( $c->common_readings ) {
88a6bac5 182 $common_rank{$rdg->rank} = 1;
183 }
184 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 185 }
7f52eac8 186
88a6bac5 187 # Group the variants to send to the solver
188 my @groups;
f629cb3b 189 my @use_ranks;
a44aaf2a 190 my %lacunae;
94654e27 191 my $moved = {};
88a6bac5 192 foreach my $rank ( @ranks ) {
7e17346f 193 my $missing = $lacunose->clone();
e23225e7 194 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 195 # Filter out any empty rankgroups
196 # (e.g. from the later rank for a transposition)
197 next unless keys %$rankgroup;
7e17346f 198 # Get the graph for this rankgroup
199 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 200 if( $opts{'exclude_type1'} ) {
201 # Check to see whether this is a "useful" group.
7e17346f 202 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 203 }
f629cb3b 204 push( @use_ranks, $rank );
7e17346f 205 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 206 $lacunae{$rank} = $missing;
d71100ed 207 }
4ce27d42 208 # Run the solver
7e17346f 209 my $answer = solve_variants( $dir, @groups );
fae07016 210
88a6bac5 211 # Do further analysis on the answer
a44aaf2a 212 my $conflict_count = 0;
7e17346f 213 my $reversion_count = 0;
f629cb3b 214 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 215 my $location = $answer->{'variants'}->[$idx];
216 # Add the rank back in
94654e27 217 my $rank = $use_ranks[$idx];
218 $location->{'id'} = $rank;
7234b01d 219 # Note what our lacunae are
f629cb3b 220 my %lmiss;
7234b01d 221 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 222 $location->{'missing'} = [ keys %lmiss ];
223
88a6bac5 224 # Run the extra analysis we need.
428bcf0b 225 ## TODO We run through all the variants in this call, so
226 ## why not add the reading data there instead of here below?
7e17346f 227 my $graph = $groups[$idx]->{graph};
228 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 229
638e2a95 230 my @layerwits;
7234b01d 231 # Do the final post-analysis tidying up of the data.
a44aaf2a 232 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 233 $conflict_count++ if $rdghash->{'is_conflict'};
234 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 235 # Add the reading text back in, setting display value as needed
a44aaf2a 236 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 237 if( $rdg ) {
238 $rdghash->{'text'} = $rdg->text .
239 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 240 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
241 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 242 }
f629cb3b 243 # Remove lacunose witnesses from this reading's list now that the
7234b01d 244 # analysis is done
f629cb3b 245 my @realgroup;
7234b01d 246 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 247 $rdghash->{'group'} = \@realgroup;
638e2a95 248 # Note any layered witnesses that appear in this group
249 foreach( @realgroup ) {
250 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
251 push( @layerwits, $1 );
252 }
253 }
a44aaf2a 254 }
638e2a95 255 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 256 }
a44aaf2a 257 $answer->{'conflict_count'} = $conflict_count;
7e17346f 258 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 259
88a6bac5 260 return $answer;
d71100ed 261}
262
e23225e7 263=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 264
265Groups the variants at the given $rank of the collation, treating any
e23225e7 266relationships in the set $merge_relationship_types as equivalent.
267$lacunose should be a reference to an array, to which the sigla of lacunose
268witnesses at this rank will be appended; $transposed should be a reference
269to a hash, wherein the identities of transposed readings and their
270relatives will be stored.
7f52eac8 271
ffa22d6f 272Returns a hash $group_readings where $rdg is attested by the witnesses listed
273in $group_readings->{$rdg}.
7f52eac8 274
275=cut
276
277# Return group_readings, groups, lacunose
d1348d38 278sub group_variants {
94654e27 279 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 280 my $c = $tradition->collation;
335a62ef 281 my $aclabel = $c->ac_label;
d120c995 282 my $table = $c->alignment_table;
7f52eac8 283 # Get the alignment table readings
284 my %readings_at_rank;
7e17346f 285 my $check_for_gaps = Set::Scalar->new();
94654e27 286 my %moved_wits;
d120c995 287 my $has_transposition;
288 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 289 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 290 my $wit = $tablewit->{'witness'};
ffa22d6f 291 # Exclude the witness if it is "lacunose" which if we got here
292 # means "not in the stemma".
7e17346f 293 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 294 # Note if the witness is actually in a lacuna
7f52eac8 295 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 296 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 297 # Otherwise the witness either has a positive reading...
7f52eac8 298 } elsif( $rdg ) {
94654e27 299 # If the reading has been counted elsewhere as a transposition, ignore it.
300 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 301 # TODO Does this cope with three-way transpositions?
94654e27 302 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
303 next;
304 }
305 # Otherwise, record it...
306 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
307 # ...and grab any transpositions, and their relations.
308 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
309 foreach my $trdg ( @transp ) {
d120c995 310 next if exists $readings_at_rank{$trdg->id};
311 $has_transposition = 1;
312 my @affected_wits = _table_witnesses(
7e17346f 313 $table, $trdg, $lacunose, $aclabel );
d120c995 314 next unless @affected_wits;
315 map { $moved_wits{$_} = 1 } @affected_wits;
316 $transposed->{$trdg->id} =
7e17346f 317 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 318 $readings_at_rank{$trdg->id} = $trdg;
319 }
320 # ...or it is empty, ergo a gap.
7f52eac8 321 } else {
7e17346f 322 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 323 }
324 }
7e17346f 325 my $gap_wits = Set::Scalar->new();
326 map { _add_to_witlist( $_, $gap_wits, $aclabel )
327 unless $moved_wits{$_} } $check_for_gaps->members;
328
329 # Group the readings, collapsing groups by relationship if needed.
d120c995 330 my $grouped_readings = {};
4ce27d42 331 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 332 # Skip readings that have been collapsed into others.
d120c995 333 next if exists $grouped_readings->{$rdg->id}
334 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 335 # Get the witness list, including from readings collapsed into this one.
7e17346f 336 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
e23225e7 337 if( $collapse && $collapse->size ) {
338 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 339 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 340 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 341 push( @wits, @otherwits );
d120c995 342 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 343 }
344 }
7e17346f 345 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 346 }
7e17346f 347 if( $gap_wits->members ) {
348 $grouped_readings->{'(omitted)'} = $gap_wits;
349 }
350
7f52eac8 351 # Get rid of our collapsed readings
7e17346f 352 map { delete $grouped_readings->{$_} if(
353 $grouped_readings->{$_} eq 'COLLAPSE'
354 || $grouped_readings->{$_}->is_empty ) }
355 keys %$grouped_readings;
d120c995 356
357 # If something was transposed, check the groups for doubled-up readings
358 if( $has_transposition ) {
62a39b8f 359 # print STDERR "Group for rank $rank:\n";
360 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
361 # keys %$grouped_readings;
d120c995 362 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
363 }
7f52eac8 364
4ce27d42 365 # Return the result
d120c995 366 return $grouped_readings;
367}
368
369# Helper function to query the alignment table for all witnesses (a.c. included)
370# that have a given reading at its rank.
371sub _table_witnesses {
372 my( $table, $trdg, $lacunose, $aclabel ) = @_;
373 my $tableidx = $trdg->rank - 1;
7e17346f 374 my $has_reading = Set::Scalar->new();
d120c995 375 foreach my $row ( @{$table->{'alignment'}} ) {
376 my $wit = $row->{'witness'};
7e17346f 377 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 378 my $rdg = $row->{'tokens'}->[$tableidx];
379 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 380 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 381 if $rdg->{'t'}->id eq $trdg->id;
382 }
7e17346f 383 return $has_reading->members;
384}
385
386# Helper function to see if a witness is lacunose even if we are asking about
387# the a.c. version
388sub _is_lacunose {
389 my ( $wit, $lac, $acstr ) = @_;
390 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
391 $wit = $1;
392 }
393 return $lac->has( $wit );
d1348d38 394}
395
335a62ef 396# Helper function to ensure that X and X a.c. never appear in the same list.
397sub _add_to_witlist {
398 my( $wit, $list, $acstr ) = @_;
335a62ef 399 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 400 # Don't add X a.c. if we already have X
401 return if $list->has( $1 );
335a62ef 402 } else {
7e17346f 403 # Delete X a.c. if we are about to add X
404 $list->delete( $wit.$acstr );
335a62ef 405 }
7e17346f 406 $list->insert( $wit );
335a62ef 407}
408
d120c995 409sub _check_transposed_consistency {
410 my( $c, $rank, $transposed, $groupings ) = @_;
411 my %seen_wits;
412 my %thisrank;
413 # Note which readings are actually at this rank, and which witnesses
414 # belong to which reading.
415 foreach my $rdg ( keys %$groupings ) {
416 my $rdgobj = $c->reading( $rdg );
417 # Count '(omitted)' as a reading at this rank
418 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
419 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
420 }
421 # Our work is done if we have no witness belonging to more than one
422 # reading.
423 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
424 return unless @doubled;
425 # If we have a symmetric related transposition, drop the non-rank readings.
426 if( @doubled == scalar keys %seen_wits ) {
427 foreach my $rdg ( keys %$groupings ) {
428 if( !$thisrank{$rdg} ) {
429 my $groupstr = wit_stringify( $groupings->{$rdg} );
430 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
431 keys %thisrank;
432 delete $groupings->{$rdg};
433 # If we found a group match, assume there is a symmetry happening.
434 # TODO think more about this
62a39b8f 435 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 436 unless( $matched ) {
437 delete $transposed->{$rdg};
438 warn "Found problem in evident symmetry with reading $rdg";
439 }
440 }
441 }
442 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
443 } else {
444 foreach my $dup ( @doubled ) {
445 foreach my $rdg ( @{$seen_wits{$dup}} ) {
446 next if $thisrank{$rdg};
447 next unless exists $groupings->{$rdg};
62a39b8f 448 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 449 delete $groupings->{$rdg};
450 delete $transposed->{$rdg};
451 }
452 }
453 # and put any now-orphaned readings into an 'omitted' reading.
454 foreach my $wit ( keys %seen_wits ) {
455 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 456 $groupings->{'(omitted)'} = Set::Scalar->new()
457 unless exists $groupings->{'(omitted)'};
d120c995 458 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
459 }
460 }
461 }
462}
463
7e17346f 464# For the given grouping, return its situation graph based on the stemma.
465sub _graph_for_grouping {
466 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
467 my $acwits = [];
468 my $extant = {};
469 foreach my $gs ( values %$grouping ) {
470 map {
471 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
472 push( @$acwits, $1 ) unless $lacunose->has( $1 );
473 } else {
474 $extant->{$_} = 1 unless $lacunose->has( $_ );
475 }
476 } $gs->members;
477 }
478 my $graph;
479 try {
480 # contig contains all extant wits and all hypothetical wits
481 # needed to make up the groups.
482 $graph = $stemma->situation_graph( $extant, $acwits );
483 } catch ( Text::Tradition::Error $e ) {
03c0a7d7 484 throw( "Could not extend graph with given extant and a.c. witnesses: "
485 . $e->message );
7e17346f 486 } catch {
03c0a7d7 487 throw( "Could not extend graph with a.c. witnesses @$acwits" );
7e17346f 488 }
489 return $graph;
490}
88a6bac5 491
7e17346f 492=head2 solve_variants( $calcdir, @groups )
493
494Looks up the set of groups in the answers provided by the external graph solver
495service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 496
88a6bac5 497The answer has the form
498 { "variants" => [ array of variant location structures ],
499 "variant_count" => total,
500 "conflict_count" => number of conflicts detected,
501 "genealogical_count" => number of solutions found }
502
503=cut
504
505sub solve_variants {
03c0a7d7 506 my( @groups ) = @_;
507
a745c3d9 508 # Are we using a local result directory, or did we pass an empty value
509 # for the directory?
03c0a7d7 510 my $dir;
a745c3d9 511 unless( ref( $groups[0] ) eq 'HASH' ) {
03c0a7d7 512 $dir = shift @groups;
513 }
335a62ef 514
03c0a7d7 515 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
516 ## object so that we can send it off for IDP lookup.
7e17346f 517 my $variants = [];
518 my $genealogical = 0; # counter
03c0a7d7 519 # TODO Optimize for unique graph problems
520 my @problems;
7e17346f 521 foreach my $graphproblem ( @groups ) {
03c0a7d7 522 # Construct the calc result key and look up its answer
523 my $problem = Text::Tradition::Analysis::Result->new(
524 graph => $graphproblem->{'graph'},
525 setlist => [ values %{$graphproblem->{'grouping'}} ] );
526 push( @problems, $problem );
527 }
528
529 my @results;
530 if( $dir ) {
531 my $scope = $dir->new_scope;
532 @results = map { $dir->lookup( $_->object_key ) || $_ } @problems;
533 } else {
534 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode( \@problems );
535 # Send it off and get the result
536 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
537 my $ua = LWP::UserAgent->new();
538 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
539 'Content' => $json );
a745c3d9 540 my $answer;
03c0a7d7 541 if( $resp->is_success ) {
a745c3d9 542 $answer = decode_json( $resp->content );
03c0a7d7 543 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
544 } else {
545 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
546 . "; cannot run graph analysis" );
547 }
548 # One more sanity check
549 throw( "Something went wrong with answer symmetricity" )
550 unless @groups == @$answer;
551 # Convert the results
552 @results = map { Text::Tradition::Analysis::Result->new( $_ ) } @$answer;
553 }
554
555 # We now have a single JSON-encoded Result object per problem sent. Fold its
556 # answers into our variant info structure.
557 foreach my $idx ( 0 .. $#groups ) {
558 my $graphproblem = $groups[$idx];
559 my $result = $results[$idx];
560
7e17346f 561 # Initialize the result structure for this graph problem
562 my $vstruct = { readings => [] };
563 push( @$variants, $vstruct );
5c44c598 564
03c0a7d7 565 # 0. Do we have a calculated result at all?
566 unless( $result->status eq 'OK' ) {
567 $vstruct->{'unsolved'} = $result->status;
7e17346f 568 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
569 push( @{$vstruct->{readings}}, { readingid => $rid,
570 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
5c44c598 571 }
7e17346f 572 next;
5c44c598 573 }
574
7e17346f 575 # 1. Did the group evaluate as genealogical?
03c0a7d7 576 $vstruct->{genealogical} = $result->is_genealogical;
577 $genealogical++ if $result->is_genealogical;
7e17346f 578
579 # 2. What are the calculated minimum groupings for each variant loc?
580 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
581 my $inputset = $graphproblem->{grouping}->{$rid};
03c0a7d7 582 my $minset = $result->minimum_grouping_for( $inputset );
7e17346f 583 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 584 }
7e17346f 585
586 # 3. What are the sources and classes calculated for each witness?
03c0a7d7 587 $vstruct->{witcopy_types} = { $result->classes };
7e17346f 588 $vstruct->{reading_roots} = {};
03c0a7d7 589 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
7e17346f 590
88a6bac5 591 }
592
593 return { 'variants' => $variants,
594 'variant_count' => scalar @$variants,
595 'genealogical_count' => $genealogical };
596}
597
fae07016 598=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 599
fae07016 600Given the tradition, its stemma graph, and the solution from the graph solver,
601work out the rest of the information we want. For each reading we need missing,
5c44c598 602conflict, reading_parents, independent_occurrence, followed, not_followed,
603and follow_unknown. Alters the location_hash in place.
7f52eac8 604
605=cut
732152b1 606
fae07016 607sub analyze_location {
7e17346f 608 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 609 my $c = $tradition->collation;
fae07016 610
03c0a7d7 611 if( exists $variant_row->{'unsolved'} ) {
612 return;
613 }
614 my $reading_roots = delete $variant_row->{'reading_roots'};
615 my $classinfo = delete $variant_row->{'witcopy_types'};
616
fae07016 617 # Make a hash of all known node memberships, and make the subgraphs.
618 my $contig = {};
fae07016 619 my $subgraph = {};
638e2a95 620 my $acstr = $c->ac_label;
621 my @acwits;
5c44c598 622
5c44c598 623 # Note which witnesses positively belong to which group. This information
624 # comes ultimately from the IDP solver.
625 # Also make a note of the reading's roots.
fae07016 626 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
627 my $rid = $rdghash->{'readingid'};
5c44c598 628 my @roots;
638e2a95 629 foreach my $wit ( @{$rdghash->{'group'}} ) {
630 $contig->{$wit} = $rid;
631 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
632 push( @acwits, $1 );
633 }
5c44c598 634 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
635 push( @roots, $wit );
636 }
638e2a95 637 }
5c44c598 638 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 639 }
7e17346f 640
fae07016 641 # Now that we have all the node group memberships, calculate followed/
bebec0e9 642 # non-followed/unknown values for each reading. Also figure out the
643 # reading's evident parent(s).
644 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 645 my $rid = $rdghash->{'readingid'};
428bcf0b 646 my $rdg = $c->reading( $rid );
5c44c598 647 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 648 my @reversions;
649 if( $classinfo ) {
650 @reversions = grep { $classinfo->{$_} eq 'revert' }
651 $rdghash->{'group'}->members;
be590045 652 $rdghash->{'reversions'} = \@reversions;
7e17346f 653 }
5c44c598 654 my @group = @{$rdghash->{'group'}};
fae07016 655
656 # Start figuring things out.
7e17346f 657 $rdghash->{'followed'} = scalar( @group )
658 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 659 # Find the parent readings, if any, of this reading.
7e17346f 660 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 661 # Work out relationships between readings and their non-followed parent.
662 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
be590045 663 $rdghash->{'source_parents'} = $sourceparents;
e23225e7 664
665 if( @reversions ) {
666 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
667 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
668 $rdghash->{'reversion_parents'} = $revparents;
669 }
bebec0e9 670
671 # Find the number of times this reading was altered, and the number of
672 # times we're not sure.
673 my( %nofollow, %unknownfollow );
5c44c598 674 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 675 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 676 if( $reading_roots->{$wchild} && $contig->{$wchild}
677 && $contig->{$wchild} ne $rid ) {
bebec0e9 678 # It definitely changed here.
679 $nofollow{$wchild} = 1;
680 } elsif( !($contig->{$wchild}) ) {
681 # The child is a hypothetical node not definitely in
682 # any group. Answer is unknown.
683 $unknownfollow{$wchild} = 1;
5c44c598 684 } # else it is either in our group, or it is a non-root node in a
685 # known group and therefore is presumed to have its reading from
686 # its group, not this link.
bebec0e9 687 }
688 }
689 $rdghash->{'not_followed'} = keys %nofollow;
690 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 691
692 # Now say whether this reading represents a conflict.
693 unless( $variant_row->{'genealogical'} ) {
7e17346f 694 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 695 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 696 }
697 }
698}
699
700sub _find_reading_parents {
701 my( $rid, $graph, $contig, @list ) = @_;
702 my $parenthash = {};
703 foreach my $wit ( @list ) {
704 # Look in the stemma graph to find this witness's extant or known-reading
705 # immediate ancestor(s), and look up the reading that each ancestor holds.
706 my @check = $graph->predecessors( $wit );
707 while( @check ) {
708 my @next;
709 foreach my $wparent( @check ) {
710 my $preading = $contig->{$wparent};
711 if( $preading && $preading ne $rid ) {
712 $parenthash->{$preading} = 1;
713 } else {
714 push( @next, $graph->predecessors( $wparent ) );
715 }
716 }
717 @check = @next;
718 }
719 }
720 return $parenthash;
721}
722
723sub _resolve_parent_relationships {
724 my( $c, $rid, $rdg, $rdgparents ) = @_;
725 foreach my $p ( keys %$rdgparents ) {
726 # Resolve the relationship of the parent to the reading, and
727 # save it in our hash.
728 my $pobj = $c->reading( $p );
729 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
730 my $phash = { 'label' => $prep };
731 if( $pobj ) {
732 my $rel = $c->get_relationship( $p, $rid );
733 if( $rel ) {
734 _add_to_hash( $rel, $phash );
735 } elsif( $rdg ) {
736 # First check for a transposed relationship
737 if( $rdg->rank != $pobj->rank ) {
738 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
739 next unless $ti->text eq $rdg->text;
740 $rel = $c->get_relationship( $ti, $pobj );
741 if( $rel ) {
742 _add_to_hash( $rel, $phash, 1 );
743 last;
744 }
745 }
746 unless( $rel ) {
747 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
748 next unless $ti->text eq $pobj->text;
749 $rel = $c->get_relationship( $ti, $rdg );
750 if( $rel ) {
751 _add_to_hash( $rel, $phash, 1 );
752 last;
753 }
754 }
5c44c598 755 }
756 }
7e17346f 757 unless( $rel ) {
758 # and then check for sheer word similarity.
759 my $rtext = $rdg->text;
760 my $ptext = $pobj->text;
761 if( similar( $rtext, $ptext ) ) {
762 # say STDERR "Words $rtext and $ptext judged similar";
763 $phash->{relation} = { type => 'wordsimilar' };
764 }
765 }
5c44c598 766 } else {
7e17346f 767 $phash->{relation} = { type => 'deletion' };
5c44c598 768 }
7e17346f 769 # Get the attributes of the parent object while we are here
770 $phash->{'text'} = $pobj->text if $pobj;
771 $phash->{'is_nonsense'} = $pobj->is_nonsense;
772 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
773 } elsif( $p eq '(omitted)' ) {
774 $phash->{relation} = { type => 'addition' };
775 }
776 # Save it
777 $rdgparents->{$p} = $phash;
778 }
d71100ed 779}
780
428bcf0b 781sub _add_to_hash {
782 my( $rel, $phash, $is_transposed ) = @_;
783 $phash->{relation} = { type => $rel->type };
784 $phash->{relation}->{transposed} = 1 if $is_transposed;
785 $phash->{relation}->{annotation} = $rel->annotation
786 if $rel->has_annotation;
787}
788
789=head2 similar( $word1, $word2 )
790
791Use Algorithm::Diff to get a sense of how close the words are to each other.
792This will hopefully handle substitutions a bit more nicely than Levenshtein.
793
794=cut
795
796#!/usr/bin/env perl
797
798sub similar {
799 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
800 my @let1 = split( '', lc( $word1 ) );
801 my @let2 = split( '', lc( $word2 ) );
802 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
803 my $mag = 0;
804 while( $diff->Next ) {
805 if( $diff->Same ) {
806 # Take off points for longer strings
807 my $cs = $diff->Range(1) - 2;
808 $cs = 0 if $cs < 0;
809 $mag -= $cs;
810 } elsif( !$diff->Items(1) ) {
811 $mag += $diff->Range(2);
812 } elsif( !$diff->Items(2) ) {
813 $mag += $diff->Range(1);
814 } else {
815 # Split the difference for substitutions
816 my $c1 = $diff->Range(1) || 1;
817 my $c2 = $diff->Range(2) || 1;
818 my $cd = ( $c1 + $c2 ) / 2;
819 $mag += $cd;
820 }
821 }
822 return ( $mag <= length( $word1 ) / 2 );
823}
824
6d25a3a0 825sub _prune_group {
5c44c598 826 my( $group, $graph ) = @_;
827 my $relevant = {};
828 # Record the existence of the vertices in the group
829 map { $relevant->{$_} = 1 } @$group;
6d25a3a0 830 # Make our subgraph
5c44c598 831 my $subgraph = $graph->deep_copy;
832 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
6d25a3a0 833 $subgraph->vertices;
6d25a3a0 834 # Now prune and return the remaining vertices.
5c44c598 835 _prune_subtree( $subgraph );
836 # Return the list of vertices and the list of roots.
837 my $pruned_group = [ sort $subgraph->vertices ];
838 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
839 return( $pruned_group, $pruned_roots );
6d25a3a0 840}
841
7f52eac8 842sub _prune_subtree {
5c44c598 843 my( $tree ) = @_;
94654e27 844
845 # Delete lacunose witnesses that have no successors
5c44c598 846 my @orphan_hypotheticals;
847 my $ctr = 0;
848 do {
03c0a7d7 849 throw( "Infinite loop on leaves" ) if $ctr > 100;
5c44c598 850 @orphan_hypotheticals =
851 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
852 $tree->successorless_vertices;
853 $tree->delete_vertices( @orphan_hypotheticals );
854 $ctr++;
855 } while( @orphan_hypotheticals );
94654e27 856
857 # Delete lacunose roots that have a single successor
858 my @redundant_root;
859 $ctr = 0;
860 do {
03c0a7d7 861 throw( "Infinite loop on roots" ) if $ctr > 100;
5c44c598 862 @redundant_root =
863 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
864 && $tree->successors( $_ ) == 1 }
865 $tree->predecessorless_vertices;
94654e27 866 $tree->delete_vertices( @redundant_root );
867 $ctr++;
868 } while( @redundant_root );
869}
870
5be0cdeb 871sub _useful_variant {
7e17346f 872 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 873
874 # Sort by group size and return
875 my $is_useful = 0;
7e17346f 876 foreach my $rdg ( keys %$rankgroup ) {
877 my @wits = $rankgroup->{$rdg}->members;
878 if( @wits > 1 ) {
5be0cdeb 879 $is_useful++;
880 } else {
7e17346f 881 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
882 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 883 }
884 }
7e17346f 885 return $is_useful > 1;
5be0cdeb 886}
887
7f52eac8 888=head2 wit_stringify( $groups )
889
890Takes an array of witness groupings and produces a string like
891['A','B'] / ['C','D','E'] / ['F']
d71100ed 892
7f52eac8 893=cut
d71100ed 894
895sub wit_stringify {
896 my $groups = shift;
897 my @gst;
898 # If we were passed an array of witnesses instead of an array of
899 # groupings, then "group" the witnesses first.
900 unless( ref( $groups->[0] ) ) {
901 my $mkgrp = [ $groups ];
902 $groups = $mkgrp;
903 }
904 foreach my $g ( @$groups ) {
905 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
906 }
907 return join( ' / ', @gst );
908}
7f52eac8 909
7f52eac8 9101;
911
a745c3d9 912sub throw {
913 Text::Tradition::Error->throw(
914 'ident' => 'Analysis error',
915 'message' => $_[0],
916 );
917}
918
7f52eac8 919=head1 LICENSE
920
921This package is free software and is provided "as is" without express
922or implied warranty. You can redistribute it and/or modify it under
923the same terms as Perl itself.
924
925=head1 AUTHOR
926
927Tara L Andrews E<lt>aurum@cpan.orgE<gt>