allow Graph init arg; allow multiple problems in problem_json
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
e4386ba9 6use Benchmark;
7e17346f 7use Digest::MD5 qw/ md5_hex /;
88a6bac5 8use Encode qw/ encode_utf8 /;
d1348d38 9use Exporter 'import';
b4cb2d60 10use Graph;
7e17346f 11use JSON qw/ to_json /;
12use Set::Scalar;
d71100ed 13use Text::Tradition;
7e17346f 14use Text::Tradition::Analysis::Result;
15use Text::Tradition::Directory;
d71100ed 16use Text::Tradition::Stemma;
d120c995 17use TryCatch;
d71100ed 18
d1348d38 19use vars qw/ @EXPORT_OK /;
a2cf85dd 20@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 21
7e17346f 22my $unsolved_problems = {};
5c44c598 23
7f52eac8 24=head1 NAME
25
26Text::Tradition::Analysis - functions for stemma analysis of a tradition
27
28=head1 SYNOPSIS
29
30 use Text::Tradition;
31 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
32 my $t = Text::Tradition->new(
33 'name' => 'this is a text',
34 'input' => 'TEI',
35 'file' => '/path/to/tei_parallel_seg_file.xml' );
36 $t->add_stemma( 'dotfile' => $stemmafile );
37
38 my $variant_data = run_analysis( $tradition );
39 # Recalculate rank $n treating all orthographic variants as equivalent
40 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
41
42=head1 DESCRIPTION
43
44Text::Tradition is a library for representation and analysis of collated
45texts, particularly medieval ones. The Collation is the central feature of
46a Tradition, where the text, its sequence of readings, and its relationships
47between readings are actually kept.
48
49=head1 SUBROUTINES
50
88a6bac5 51=head2 run_analysis( $tradition, %opts )
7f52eac8 52
88a6bac5 53Runs the analysis described in analyze_variant_location on every location in the
54collation of the given tradition, with the given options. These include:
7f52eac8 55
88a6bac5 56=over 4
57
58=item * stemma_id - Specify which of the tradition's stemmata to use. Default
59is 0 (i.e. the first).
60
61=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
62
63=item * merge_types - Specify a list of relationship types, where related readings
64should be treated as identical for the purposes of analysis.
65
ffa22d6f 66=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
67
88a6bac5 68=back
7f52eac8 69
70=begin testing
71
72use Text::Tradition;
73use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
74
75my $datafile = 't/data/florilegium_tei_ps.xml';
76my $tradition = Text::Tradition->new( 'input' => 'TEI',
77 'name' => 'test0',
78 'file' => $datafile );
79my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
80is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
81
f00cefe8 82my %expected_genealogical = (
a44aaf2a 83 1 => 0,
f00cefe8 84 2 => 1,
a44aaf2a 85 3 => 0,
86 5 => 0,
87 7 => 0,
88 8 => 0,
89 10 => 0,
f00cefe8 90 13 => 1,
a44aaf2a 91 33 => 0,
92 34 => 0,
93 37 => 0,
94 60 => 0,
f00cefe8 95 81 => 1,
a44aaf2a 96 84 => 0,
97 87 => 0,
98 101 => 0,
99 102 => 0,
f00cefe8 100 122 => 1,
a44aaf2a 101 157 => 0,
f00cefe8 102 166 => 1,
103 169 => 1,
a44aaf2a 104 200 => 0,
f00cefe8 105 216 => 1,
106 217 => 1,
107 219 => 1,
108 241 => 1,
109 242 => 1,
110 243 => 1,
111);
112
7e17346f 113my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 114my $c = $tradition->collation;
f00cefe8 115foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 116 # Account for rows that used to be "not useful"
117 unless( exists $expected_genealogical{$row->{'id'}} ) {
118 $expected_genealogical{$row->{'id'}} = 1;
119 }
18f48b82 120 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
121 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 122 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 123 # Check that we have the right row with the right groups
124 my $rank = $row->{'id'};
125 foreach my $rdghash ( @{$row->{'readings'}} ) {
126 # Skip 'readings' that aren't really
127 next unless $c->reading( $rdghash->{'readingid'} );
128 # Check the rank
129 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
130 "Got correct reading rank" );
131 # Check the witnesses
132 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
133 my @sgrp = sort @{$rdghash->{'group'}};
134 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
135 }
f00cefe8 136}
a44aaf2a 137is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 138# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 139
140=end testing
141
142=cut
143
d71100ed 144sub run_analysis {
88a6bac5 145 my( $tradition, %opts ) = @_;
f00cefe8 146 my $c = $tradition->collation;
7e17346f 147 my $aclabel = $c->ac_label;
88a6bac5 148
149 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 150 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
151 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
7e17346f 152
153 # Make sure we have a lookup DB for graph calculation results; this will die
154 # if calcdir or calcdsn isn't passed.
155 my $dir = $opts{'calcdir'} ? delete $opts{'calcdir'}
156 : Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
88a6bac5 157
158 # Get the stemma
159 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 160
4ce27d42 161 # Figure out which witnesses we are working with - that is, the ones that
162 # appear both in the stemma and in the tradition. All others are 'lacunose'
163 # for our purposes.
7e17346f 164 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
165 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
166 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
167 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 168
169 # Find and mark 'common' ranks for exclusion, unless they were
170 # explicitly specified.
171 unless( @ranks ) {
172 my %common_rank;
a44aaf2a 173 foreach my $rdg ( $c->common_readings ) {
88a6bac5 174 $common_rank{$rdg->rank} = 1;
175 }
176 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 177 }
7f52eac8 178
88a6bac5 179 # Group the variants to send to the solver
180 my @groups;
f629cb3b 181 my @use_ranks;
a44aaf2a 182 my %lacunae;
94654e27 183 my $moved = {};
88a6bac5 184 foreach my $rank ( @ranks ) {
7e17346f 185 my $missing = $lacunose->clone();
94654e27 186 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
187 # Filter out any empty rankgroups
188 # (e.g. from the later rank for a transposition)
189 next unless keys %$rankgroup;
7e17346f 190 # Get the graph for this rankgroup
191 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 192 if( $opts{'exclude_type1'} ) {
193 # Check to see whether this is a "useful" group.
7e17346f 194 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 195 }
f629cb3b 196 push( @use_ranks, $rank );
7e17346f 197 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 198 $lacunae{$rank} = $missing;
d71100ed 199 }
4ce27d42 200 # Run the solver
7e17346f 201 my $answer = solve_variants( $dir, @groups );
fae07016 202
88a6bac5 203 # Do further analysis on the answer
a44aaf2a 204 my $conflict_count = 0;
7e17346f 205 my $reversion_count = 0;
f629cb3b 206 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 207 my $location = $answer->{'variants'}->[$idx];
208 # Add the rank back in
94654e27 209 my $rank = $use_ranks[$idx];
210 $location->{'id'} = $rank;
7234b01d 211 # Note what our lacunae are
f629cb3b 212 my %lmiss;
7234b01d 213 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 214 $location->{'missing'} = [ keys %lmiss ];
215
88a6bac5 216 # Run the extra analysis we need.
428bcf0b 217 ## TODO We run through all the variants in this call, so
218 ## why not add the reading data there instead of here below?
7e17346f 219 my $graph = $groups[$idx]->{graph};
220 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 221
638e2a95 222 my @layerwits;
7234b01d 223 # Do the final post-analysis tidying up of the data.
a44aaf2a 224 foreach my $rdghash ( @{$location->{'readings'}} ) {
225 $conflict_count++
226 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
7e17346f 227 $reversion_count++ if $rdghash->{'reverted'};
94654e27 228 # Add the reading text back in, setting display value as needed
a44aaf2a 229 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 230 if( $rdg ) {
231 $rdghash->{'text'} = $rdg->text .
232 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 233 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
234 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 235 }
f629cb3b 236 # Remove lacunose witnesses from this reading's list now that the
7234b01d 237 # analysis is done
f629cb3b 238 my @realgroup;
7234b01d 239 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 240 $rdghash->{'group'} = \@realgroup;
638e2a95 241 # Note any layered witnesses that appear in this group
242 foreach( @realgroup ) {
243 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
244 push( @layerwits, $1 );
245 }
246 }
a44aaf2a 247 }
638e2a95 248 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 249 }
a44aaf2a 250 $answer->{'conflict_count'} = $conflict_count;
7e17346f 251 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 252
88a6bac5 253 return $answer;
d71100ed 254}
255
7f52eac8 256=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
257
258Groups the variants at the given $rank of the collation, treating any
259relationships in @merge_relationship_types as equivalent. $lacunose should
260be a reference to an array, to which the sigla of lacunose witnesses at this
94654e27 261rank will be appended; $transposed should be a reference to a hash, wherein
262the identities of transposed readings and their relatives will be stored.
7f52eac8 263
ffa22d6f 264Returns a hash $group_readings where $rdg is attested by the witnesses listed
265in $group_readings->{$rdg}.
7f52eac8 266
267=cut
268
269# Return group_readings, groups, lacunose
d1348d38 270sub group_variants {
94654e27 271 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 272 my $c = $tradition->collation;
335a62ef 273 my $aclabel = $c->ac_label;
d120c995 274 my $table = $c->alignment_table;
7f52eac8 275 # Get the alignment table readings
276 my %readings_at_rank;
7e17346f 277 my $check_for_gaps = Set::Scalar->new();
94654e27 278 my %moved_wits;
d120c995 279 my $has_transposition;
280 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 281 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 282 my $wit = $tablewit->{'witness'};
ffa22d6f 283 # Exclude the witness if it is "lacunose" which if we got here
284 # means "not in the stemma".
7e17346f 285 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 286 # Note if the witness is actually in a lacuna
7f52eac8 287 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 288 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 289 # Otherwise the witness either has a positive reading...
7f52eac8 290 } elsif( $rdg ) {
94654e27 291 # If the reading has been counted elsewhere as a transposition, ignore it.
292 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 293 # TODO Does this cope with three-way transpositions?
94654e27 294 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
295 next;
296 }
297 # Otherwise, record it...
298 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
299 # ...and grab any transpositions, and their relations.
300 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
301 foreach my $trdg ( @transp ) {
d120c995 302 next if exists $readings_at_rank{$trdg->id};
303 $has_transposition = 1;
304 my @affected_wits = _table_witnesses(
7e17346f 305 $table, $trdg, $lacunose, $aclabel );
d120c995 306 next unless @affected_wits;
307 map { $moved_wits{$_} = 1 } @affected_wits;
308 $transposed->{$trdg->id} =
7e17346f 309 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 310 $readings_at_rank{$trdg->id} = $trdg;
311 }
312 # ...or it is empty, ergo a gap.
7f52eac8 313 } else {
7e17346f 314 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 315 }
316 }
7e17346f 317 my $gap_wits = Set::Scalar->new();
318 map { _add_to_witlist( $_, $gap_wits, $aclabel )
319 unless $moved_wits{$_} } $check_for_gaps->members;
320
321 # Group the readings, collapsing groups by relationship if needed.
d120c995 322 my $grouped_readings = {};
4ce27d42 323 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 324 # Skip readings that have been collapsed into others.
d120c995 325 next if exists $grouped_readings->{$rdg->id}
326 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 327 # Get the witness list, including from readings collapsed into this one.
7e17346f 328 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
d120c995 329 if( $collapse && @$collapse ) {
7f52eac8 330 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
331 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 332 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 333 push( @wits, @otherwits );
d120c995 334 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 335 }
336 }
7e17346f 337 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 338 }
7e17346f 339 if( $gap_wits->members ) {
340 $grouped_readings->{'(omitted)'} = $gap_wits;
341 }
342
7f52eac8 343 # Get rid of our collapsed readings
7e17346f 344 map { delete $grouped_readings->{$_} if(
345 $grouped_readings->{$_} eq 'COLLAPSE'
346 || $grouped_readings->{$_}->is_empty ) }
347 keys %$grouped_readings;
d120c995 348
349 # If something was transposed, check the groups for doubled-up readings
350 if( $has_transposition ) {
62a39b8f 351 # print STDERR "Group for rank $rank:\n";
352 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
353 # keys %$grouped_readings;
d120c995 354 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
355 }
7f52eac8 356
4ce27d42 357 # Return the result
d120c995 358 return $grouped_readings;
359}
360
361# Helper function to query the alignment table for all witnesses (a.c. included)
362# that have a given reading at its rank.
363sub _table_witnesses {
364 my( $table, $trdg, $lacunose, $aclabel ) = @_;
365 my $tableidx = $trdg->rank - 1;
7e17346f 366 my $has_reading = Set::Scalar->new();
d120c995 367 foreach my $row ( @{$table->{'alignment'}} ) {
368 my $wit = $row->{'witness'};
7e17346f 369 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 370 my $rdg = $row->{'tokens'}->[$tableidx];
371 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 372 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 373 if $rdg->{'t'}->id eq $trdg->id;
374 }
7e17346f 375 return $has_reading->members;
376}
377
378# Helper function to see if a witness is lacunose even if we are asking about
379# the a.c. version
380sub _is_lacunose {
381 my ( $wit, $lac, $acstr ) = @_;
382 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
383 $wit = $1;
384 }
385 return $lac->has( $wit );
d1348d38 386}
387
335a62ef 388# Helper function to ensure that X and X a.c. never appear in the same list.
389sub _add_to_witlist {
390 my( $wit, $list, $acstr ) = @_;
335a62ef 391 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 392 # Don't add X a.c. if we already have X
393 return if $list->has( $1 );
335a62ef 394 } else {
7e17346f 395 # Delete X a.c. if we are about to add X
396 $list->delete( $wit.$acstr );
335a62ef 397 }
7e17346f 398 $list->insert( $wit );
335a62ef 399}
400
d120c995 401sub _check_transposed_consistency {
402 my( $c, $rank, $transposed, $groupings ) = @_;
403 my %seen_wits;
404 my %thisrank;
405 # Note which readings are actually at this rank, and which witnesses
406 # belong to which reading.
407 foreach my $rdg ( keys %$groupings ) {
408 my $rdgobj = $c->reading( $rdg );
409 # Count '(omitted)' as a reading at this rank
410 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
411 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
412 }
413 # Our work is done if we have no witness belonging to more than one
414 # reading.
415 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
416 return unless @doubled;
417 # If we have a symmetric related transposition, drop the non-rank readings.
418 if( @doubled == scalar keys %seen_wits ) {
419 foreach my $rdg ( keys %$groupings ) {
420 if( !$thisrank{$rdg} ) {
421 my $groupstr = wit_stringify( $groupings->{$rdg} );
422 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
423 keys %thisrank;
424 delete $groupings->{$rdg};
425 # If we found a group match, assume there is a symmetry happening.
426 # TODO think more about this
62a39b8f 427 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 428 unless( $matched ) {
429 delete $transposed->{$rdg};
430 warn "Found problem in evident symmetry with reading $rdg";
431 }
432 }
433 }
434 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
435 } else {
436 foreach my $dup ( @doubled ) {
437 foreach my $rdg ( @{$seen_wits{$dup}} ) {
438 next if $thisrank{$rdg};
439 next unless exists $groupings->{$rdg};
62a39b8f 440 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 441 delete $groupings->{$rdg};
442 delete $transposed->{$rdg};
443 }
444 }
445 # and put any now-orphaned readings into an 'omitted' reading.
446 foreach my $wit ( keys %seen_wits ) {
447 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 448 $groupings->{'(omitted)'} = Set::Scalar->new()
449 unless exists $groupings->{'(omitted)'};
d120c995 450 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
451 }
452 }
453 }
454}
455
7e17346f 456# For the given grouping, return its situation graph based on the stemma.
457sub _graph_for_grouping {
458 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
459 my $acwits = [];
460 my $extant = {};
461 foreach my $gs ( values %$grouping ) {
462 map {
463 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
464 push( @$acwits, $1 ) unless $lacunose->has( $1 );
465 } else {
466 $extant->{$_} = 1 unless $lacunose->has( $_ );
467 }
468 } $gs->members;
469 }
470 my $graph;
471 try {
472 # contig contains all extant wits and all hypothetical wits
473 # needed to make up the groups.
474 $graph = $stemma->situation_graph( $extant, $acwits );
475 } catch ( Text::Tradition::Error $e ) {
476 die "Could not extend graph with given extant and a.c. witnesses: "
477 . $e->message;
478 } catch {
479 die "Could not extend graph with a.c. witnesses @$acwits";
480 }
481 return $graph;
482}
88a6bac5 483
7e17346f 484=head2 solve_variants( $calcdir, @groups )
485
486Looks up the set of groups in the answers provided by the external graph solver
487service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 488
489The JSON has the form
490 { "graph": [ stemmagraph DOT string without newlines ],
491 "groupings": [ array of arrays of groups, one per rank ] }
492
493The answer has the form
494 { "variants" => [ array of variant location structures ],
495 "variant_count" => total,
496 "conflict_count" => number of conflicts detected,
497 "genealogical_count" => number of solutions found }
498
499=cut
500
501sub solve_variants {
7e17346f 502 my( $dir, @groups ) = @_;
335a62ef 503
7e17346f 504 ## For each graph/group combo, look it up in the DB.
335a62ef 505 ## Witness map is a HACK to get around limitations in node names from IDP
506 my $witness_map = {};
507 ## Variables to store answers as they come back
7e17346f 508 my $variants = [];
509 my $genealogical = 0; # counter
510 foreach my $graphproblem ( @groups ) {
511 # Initialize the result structure for this graph problem
512 my $vstruct = { readings => [] };
513 push( @$variants, $vstruct );
5c44c598 514
7e17346f 515 # Construct the calc result key and look up its answer
516 my $reqkey = _get_calc_key( $graphproblem );
517 my $scope = $dir->new_scope;
518 my $answer = $dir->lookup( $reqkey );
519 unless( $answer ) {
520 #warn "No answer found for graph problem $reqkey, moving on";
521 # Record the unsolved problem so that we can go get a solution
522 _save_problem( $graphproblem );
523 # Put just the request, with no real result, into vstruct
524 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
525 push( @{$vstruct->{readings}}, { readingid => $rid,
526 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
5c44c598 527 }
7e17346f 528 next;
5c44c598 529 }
7e17346f 530
531 ## The answer is a Text::Tradition::Analysis::Result containing a bunch
532 ## of information about this variant set. Record the information therein.
5c44c598 533
7e17346f 534 # 1. Did the group evaluate as genealogical?
535 $vstruct->{genealogical} = $answer->is_genealogical;
536 $genealogical++ if $answer->is_genealogical;
537
538 # 2. What are the calculated minimum groupings for each variant loc?
539 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
540 my $inputset = $graphproblem->{grouping}->{$rid};
541 my $minset = $answer->minimum_grouping_for( $inputset );
542 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 543 }
7e17346f 544
545 # 3. What are the sources and classes calculated for each witness?
546 $vstruct->{witcopy_types} = { $answer->classes };
547 $vstruct->{reading_roots} = {};
548 map { $vstruct->{reading_roots}->{$_} = 1 } $answer->sources;
549
88a6bac5 550 }
551
7e17346f 552 # Spit out any unsolved problems we encountered
553 _list_unsolved();
554
88a6bac5 555 return { 'variants' => $variants,
556 'variant_count' => scalar @$variants,
557 'genealogical_count' => $genealogical };
558}
559
7e17346f 560sub _get_calc_key {
561 my( $graphproblem ) = @_;
562 my $graph = $graphproblem->{graph};
563 my $grouping = [ values %{$graphproblem->{grouping}} ];
564 my $key = Text::Tradition::Analysis::Result::string_from_graph_problem(
565 $graph, $grouping );
566 return md5_hex( encode_utf8( $key ) );
5c44c598 567}
568
7e17346f 569sub _save_problem {
570 my( $graphproblem ) = @_;
571 my $graphstr = Text::Tradition::Stemma::editable_graph(
572 $graphproblem->{graph}, { 'linesep' => ' ' } );
573 unless( exists $unsolved_problems->{$graphstr} ) {
574 $unsolved_problems->{$graphstr} = {};
b4cb2d60 575 }
7e17346f 576 my $grouping = [];
577 foreach my $set ( sort { Text::Tradition::Analysis::Result::by_size_and_alpha( $a, $b ) } values %{$graphproblem->{grouping}} ) {
578 push( @$grouping, [ sort $set->members ] );
b4cb2d60 579 }
7e17346f 580 $unsolved_problems->{$graphstr}->{wit_stringify( $grouping )} = $grouping;
b4cb2d60 581}
582
7e17346f 583sub _list_unsolved {
584 #say STDERR "Problems needing a solution:";
585 foreach my $graphstr ( keys %$unsolved_problems ) {
586 my $struct = { graph => $graphstr, groupings => [] };
587 foreach my $gp ( values %{$unsolved_problems->{$graphstr}} ) {
588 push( @{$struct->{groupings}}, $gp );
b4cb2d60 589 }
7e17346f 590 my $json = to_json( $struct );
591 say STDERR "$json";
b4cb2d60 592 }
b4cb2d60 593}
594
fae07016 595=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 596
fae07016 597Given the tradition, its stemma graph, and the solution from the graph solver,
598work out the rest of the information we want. For each reading we need missing,
5c44c598 599conflict, reading_parents, independent_occurrence, followed, not_followed,
600and follow_unknown. Alters the location_hash in place.
7f52eac8 601
602=cut
732152b1 603
fae07016 604sub analyze_location {
7e17346f 605 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 606 my $c = $tradition->collation;
fae07016 607
608 # Make a hash of all known node memberships, and make the subgraphs.
609 my $contig = {};
610 my $reading_roots = {};
611 my $subgraph = {};
638e2a95 612 my $acstr = $c->ac_label;
613 my @acwits;
5c44c598 614
615 my $NO_IDP;
616 if( exists $variant_row->{'reading_roots'} ) {
617 $reading_roots = delete $variant_row->{'reading_roots'};
618 } else {
619 warn "No reading source information from IDP - proceed at your own risk";
620 $NO_IDP = 1;
621 }
7e17346f 622 my $classinfo = delete $variant_row->{'witcopy_types'};
5c44c598 623
624 # Note which witnesses positively belong to which group. This information
625 # comes ultimately from the IDP solver.
626 # Also make a note of the reading's roots.
fae07016 627 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
628 my $rid = $rdghash->{'readingid'};
5c44c598 629 my @roots;
638e2a95 630 foreach my $wit ( @{$rdghash->{'group'}} ) {
631 $contig->{$wit} = $rid;
632 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
633 push( @acwits, $1 );
634 }
5c44c598 635 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
636 push( @roots, $wit );
637 }
638e2a95 638 }
5c44c598 639 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 640 }
7e17346f 641
fae07016 642 # Now that we have all the node group memberships, calculate followed/
bebec0e9 643 # non-followed/unknown values for each reading. Also figure out the
644 # reading's evident parent(s).
645 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 646 my $rid = $rdghash->{'readingid'};
428bcf0b 647 my $rdg = $c->reading( $rid );
5c44c598 648 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 649 my @reversions;
650 if( $classinfo ) {
651 @reversions = grep { $classinfo->{$_} eq 'revert' }
652 $rdghash->{'group'}->members;
653 }
5c44c598 654 my @group = @{$rdghash->{'group'}};
fae07016 655
656 # Start figuring things out.
7e17346f 657 $rdghash->{'followed'} = scalar( @group )
658 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 659 # Find the parent readings, if any, of this reading.
7e17346f 660 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
661 my $revertparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
662 # Work out relationships between readings and their non-followed parent.
663 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
664 _resolve_parent_relationships( $c, $rid, $rdg, $revertparents );
94654e27 665
7e17346f 666 $rdghash->{'reading_parents'} = $sourceparents;
667 $rdghash->{'reversion_parents'} = $revertparents;
bebec0e9 668
669 # Find the number of times this reading was altered, and the number of
670 # times we're not sure.
671 my( %nofollow, %unknownfollow );
5c44c598 672 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 673 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 674 if( $reading_roots->{$wchild} && $contig->{$wchild}
675 && $contig->{$wchild} ne $rid ) {
bebec0e9 676 # It definitely changed here.
677 $nofollow{$wchild} = 1;
678 } elsif( !($contig->{$wchild}) ) {
679 # The child is a hypothetical node not definitely in
680 # any group. Answer is unknown.
681 $unknownfollow{$wchild} = 1;
5c44c598 682 } # else it is either in our group, or it is a non-root node in a
683 # known group and therefore is presumed to have its reading from
684 # its group, not this link.
bebec0e9 685 }
686 }
687 $rdghash->{'not_followed'} = keys %nofollow;
688 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 689
690 # Now say whether this reading represents a conflict.
691 unless( $variant_row->{'genealogical'} ) {
7e17346f 692 my @reversions;
693 if( $classinfo ) {
5c44c598 694 # We have tested for reversions. Use the information.
7e17346f 695 @reversions =
696 $rdghash->{'reversions'} = \@reversions if @reversions;
697 }
698 $rdghash->{'is_conflict'} = @roots != 1;
699 $rdghash->{'is_reverted'} = !!@reversions;
700 }
701 }
702}
703
704sub _find_reading_parents {
705 my( $rid, $graph, $contig, @list ) = @_;
706 my $parenthash = {};
707 foreach my $wit ( @list ) {
708 # Look in the stemma graph to find this witness's extant or known-reading
709 # immediate ancestor(s), and look up the reading that each ancestor holds.
710 my @check = $graph->predecessors( $wit );
711 while( @check ) {
712 my @next;
713 foreach my $wparent( @check ) {
714 my $preading = $contig->{$wparent};
715 if( $preading && $preading ne $rid ) {
716 $parenthash->{$preading} = 1;
717 } else {
718 push( @next, $graph->predecessors( $wparent ) );
719 }
720 }
721 @check = @next;
722 }
723 }
724 return $parenthash;
725}
726
727sub _resolve_parent_relationships {
728 my( $c, $rid, $rdg, $rdgparents ) = @_;
729 foreach my $p ( keys %$rdgparents ) {
730 # Resolve the relationship of the parent to the reading, and
731 # save it in our hash.
732 my $pobj = $c->reading( $p );
733 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
734 my $phash = { 'label' => $prep };
735 if( $pobj ) {
736 my $rel = $c->get_relationship( $p, $rid );
737 if( $rel ) {
738 _add_to_hash( $rel, $phash );
739 } elsif( $rdg ) {
740 # First check for a transposed relationship
741 if( $rdg->rank != $pobj->rank ) {
742 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
743 next unless $ti->text eq $rdg->text;
744 $rel = $c->get_relationship( $ti, $pobj );
745 if( $rel ) {
746 _add_to_hash( $rel, $phash, 1 );
747 last;
748 }
749 }
750 unless( $rel ) {
751 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
752 next unless $ti->text eq $pobj->text;
753 $rel = $c->get_relationship( $ti, $rdg );
754 if( $rel ) {
755 _add_to_hash( $rel, $phash, 1 );
756 last;
757 }
758 }
5c44c598 759 }
760 }
7e17346f 761 unless( $rel ) {
762 # and then check for sheer word similarity.
763 my $rtext = $rdg->text;
764 my $ptext = $pobj->text;
765 if( similar( $rtext, $ptext ) ) {
766 # say STDERR "Words $rtext and $ptext judged similar";
767 $phash->{relation} = { type => 'wordsimilar' };
768 }
769 }
5c44c598 770 } else {
7e17346f 771 $phash->{relation} = { type => 'deletion' };
5c44c598 772 }
7e17346f 773 # Get the attributes of the parent object while we are here
774 $phash->{'text'} = $pobj->text if $pobj;
775 $phash->{'is_nonsense'} = $pobj->is_nonsense;
776 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
777 } elsif( $p eq '(omitted)' ) {
778 $phash->{relation} = { type => 'addition' };
779 }
780 # Save it
781 $rdgparents->{$p} = $phash;
782 }
d71100ed 783}
784
428bcf0b 785sub _add_to_hash {
786 my( $rel, $phash, $is_transposed ) = @_;
787 $phash->{relation} = { type => $rel->type };
788 $phash->{relation}->{transposed} = 1 if $is_transposed;
789 $phash->{relation}->{annotation} = $rel->annotation
790 if $rel->has_annotation;
791}
792
793=head2 similar( $word1, $word2 )
794
795Use Algorithm::Diff to get a sense of how close the words are to each other.
796This will hopefully handle substitutions a bit more nicely than Levenshtein.
797
798=cut
799
800#!/usr/bin/env perl
801
802sub similar {
803 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
804 my @let1 = split( '', lc( $word1 ) );
805 my @let2 = split( '', lc( $word2 ) );
806 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
807 my $mag = 0;
808 while( $diff->Next ) {
809 if( $diff->Same ) {
810 # Take off points for longer strings
811 my $cs = $diff->Range(1) - 2;
812 $cs = 0 if $cs < 0;
813 $mag -= $cs;
814 } elsif( !$diff->Items(1) ) {
815 $mag += $diff->Range(2);
816 } elsif( !$diff->Items(2) ) {
817 $mag += $diff->Range(1);
818 } else {
819 # Split the difference for substitutions
820 my $c1 = $diff->Range(1) || 1;
821 my $c2 = $diff->Range(2) || 1;
822 my $cd = ( $c1 + $c2 ) / 2;
823 $mag += $cd;
824 }
825 }
826 return ( $mag <= length( $word1 ) / 2 );
827}
828
6d25a3a0 829sub _prune_group {
5c44c598 830 my( $group, $graph ) = @_;
831 my $relevant = {};
832 # Record the existence of the vertices in the group
833 map { $relevant->{$_} = 1 } @$group;
6d25a3a0 834 # Make our subgraph
5c44c598 835 my $subgraph = $graph->deep_copy;
836 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
6d25a3a0 837 $subgraph->vertices;
6d25a3a0 838 # Now prune and return the remaining vertices.
5c44c598 839 _prune_subtree( $subgraph );
840 # Return the list of vertices and the list of roots.
841 my $pruned_group = [ sort $subgraph->vertices ];
842 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
843 return( $pruned_group, $pruned_roots );
6d25a3a0 844}
845
7f52eac8 846sub _prune_subtree {
5c44c598 847 my( $tree ) = @_;
94654e27 848
849 # Delete lacunose witnesses that have no successors
5c44c598 850 my @orphan_hypotheticals;
851 my $ctr = 0;
852 do {
853 die "Infinite loop on leaves" if $ctr > 100;
854 @orphan_hypotheticals =
855 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
856 $tree->successorless_vertices;
857 $tree->delete_vertices( @orphan_hypotheticals );
858 $ctr++;
859 } while( @orphan_hypotheticals );
94654e27 860
861 # Delete lacunose roots that have a single successor
862 my @redundant_root;
863 $ctr = 0;
864 do {
5c44c598 865 die "Infinite loop on roots" if $ctr > 100;
866 @redundant_root =
867 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
868 && $tree->successors( $_ ) == 1 }
869 $tree->predecessorless_vertices;
94654e27 870 $tree->delete_vertices( @redundant_root );
871 $ctr++;
872 } while( @redundant_root );
873}
874
5be0cdeb 875sub _useful_variant {
7e17346f 876 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 877
878 # Sort by group size and return
879 my $is_useful = 0;
7e17346f 880 foreach my $rdg ( keys %$rankgroup ) {
881 my @wits = $rankgroup->{$rdg}->members;
882 if( @wits > 1 ) {
5be0cdeb 883 $is_useful++;
884 } else {
7e17346f 885 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
886 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 887 }
888 }
7e17346f 889 return $is_useful > 1;
5be0cdeb 890}
891
7f52eac8 892=head2 wit_stringify( $groups )
893
894Takes an array of witness groupings and produces a string like
895['A','B'] / ['C','D','E'] / ['F']
d71100ed 896
7f52eac8 897=cut
d71100ed 898
899sub wit_stringify {
900 my $groups = shift;
901 my @gst;
902 # If we were passed an array of witnesses instead of an array of
903 # groupings, then "group" the witnesses first.
904 unless( ref( $groups->[0] ) ) {
905 my $mkgrp = [ $groups ];
906 $groups = $mkgrp;
907 }
908 foreach my $g ( @$groups ) {
909 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
910 }
911 return join( ' / ', @gst );
912}
7f52eac8 913
7f52eac8 9141;
915
916=head1 LICENSE
917
918This package is free software and is provided "as is" without express
919or implied warranty. You can redistribute it and/or modify it under
920the same terms as Perl itself.
921
922=head1 AUTHOR
923
924Tara L Andrews E<lt>aurum@cpan.orgE<gt>