distinguish source vs reversion parents; fix collapse functionality
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
e4386ba9 6use Benchmark;
7e17346f 7use Digest::MD5 qw/ md5_hex /;
88a6bac5 8use Encode qw/ encode_utf8 /;
d1348d38 9use Exporter 'import';
b4cb2d60 10use Graph;
7e17346f 11use JSON qw/ to_json /;
12use Set::Scalar;
d71100ed 13use Text::Tradition;
7e17346f 14use Text::Tradition::Analysis::Result;
15use Text::Tradition::Directory;
d71100ed 16use Text::Tradition::Stemma;
d120c995 17use TryCatch;
d71100ed 18
d1348d38 19use vars qw/ @EXPORT_OK /;
a2cf85dd 20@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 21
7e17346f 22my $unsolved_problems = {};
5c44c598 23
7f52eac8 24=head1 NAME
25
26Text::Tradition::Analysis - functions for stemma analysis of a tradition
27
28=head1 SYNOPSIS
29
30 use Text::Tradition;
31 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
32 my $t = Text::Tradition->new(
33 'name' => 'this is a text',
34 'input' => 'TEI',
35 'file' => '/path/to/tei_parallel_seg_file.xml' );
36 $t->add_stemma( 'dotfile' => $stemmafile );
37
38 my $variant_data = run_analysis( $tradition );
39 # Recalculate rank $n treating all orthographic variants as equivalent
40 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
41
42=head1 DESCRIPTION
43
44Text::Tradition is a library for representation and analysis of collated
45texts, particularly medieval ones. The Collation is the central feature of
46a Tradition, where the text, its sequence of readings, and its relationships
47between readings are actually kept.
48
49=head1 SUBROUTINES
50
88a6bac5 51=head2 run_analysis( $tradition, %opts )
7f52eac8 52
88a6bac5 53Runs the analysis described in analyze_variant_location on every location in the
54collation of the given tradition, with the given options. These include:
7f52eac8 55
88a6bac5 56=over 4
57
58=item * stemma_id - Specify which of the tradition's stemmata to use. Default
59is 0 (i.e. the first).
60
61=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
62
63=item * merge_types - Specify a list of relationship types, where related readings
64should be treated as identical for the purposes of analysis.
65
ffa22d6f 66=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
67
88a6bac5 68=back
7f52eac8 69
70=begin testing
71
72use Text::Tradition;
73use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
74
75my $datafile = 't/data/florilegium_tei_ps.xml';
76my $tradition = Text::Tradition->new( 'input' => 'TEI',
77 'name' => 'test0',
78 'file' => $datafile );
79my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
80is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
81
f00cefe8 82my %expected_genealogical = (
a44aaf2a 83 1 => 0,
f00cefe8 84 2 => 1,
a44aaf2a 85 3 => 0,
86 5 => 0,
87 7 => 0,
88 8 => 0,
89 10 => 0,
f00cefe8 90 13 => 1,
a44aaf2a 91 33 => 0,
92 34 => 0,
93 37 => 0,
94 60 => 0,
f00cefe8 95 81 => 1,
a44aaf2a 96 84 => 0,
97 87 => 0,
98 101 => 0,
99 102 => 0,
f00cefe8 100 122 => 1,
a44aaf2a 101 157 => 0,
f00cefe8 102 166 => 1,
103 169 => 1,
a44aaf2a 104 200 => 0,
f00cefe8 105 216 => 1,
106 217 => 1,
107 219 => 1,
108 241 => 1,
109 242 => 1,
110 243 => 1,
111);
112
7e17346f 113my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 114my $c = $tradition->collation;
f00cefe8 115foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 116 # Account for rows that used to be "not useful"
117 unless( exists $expected_genealogical{$row->{'id'}} ) {
118 $expected_genealogical{$row->{'id'}} = 1;
119 }
18f48b82 120 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
121 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 122 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 123 # Check that we have the right row with the right groups
124 my $rank = $row->{'id'};
125 foreach my $rdghash ( @{$row->{'readings'}} ) {
126 # Skip 'readings' that aren't really
127 next unless $c->reading( $rdghash->{'readingid'} );
128 # Check the rank
129 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
130 "Got correct reading rank" );
131 # Check the witnesses
132 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
133 my @sgrp = sort @{$rdghash->{'group'}};
134 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
135 }
f00cefe8 136}
a44aaf2a 137is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 138# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 139
140=end testing
141
142=cut
143
d71100ed 144sub run_analysis {
88a6bac5 145 my( $tradition, %opts ) = @_;
f00cefe8 146 my $c = $tradition->collation;
7e17346f 147 my $aclabel = $c->ac_label;
88a6bac5 148
149 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 150 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 151 my $collapse = Set::Scalar->new();
152 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
153 $collapse->insert( @{$opts{'merge_types'}} );
154 } elsif( $opts{'merge_types'} ) {
155 $collapse->insert( $opts{'merge_types'} );
156 }
7e17346f 157
158 # Make sure we have a lookup DB for graph calculation results; this will die
159 # if calcdir or calcdsn isn't passed.
160 my $dir = $opts{'calcdir'} ? delete $opts{'calcdir'}
161 : Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
88a6bac5 162
163 # Get the stemma
164 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 165
4ce27d42 166 # Figure out which witnesses we are working with - that is, the ones that
167 # appear both in the stemma and in the tradition. All others are 'lacunose'
168 # for our purposes.
7e17346f 169 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
170 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
171 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
172 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 173
174 # Find and mark 'common' ranks for exclusion, unless they were
175 # explicitly specified.
176 unless( @ranks ) {
177 my %common_rank;
a44aaf2a 178 foreach my $rdg ( $c->common_readings ) {
88a6bac5 179 $common_rank{$rdg->rank} = 1;
180 }
181 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 182 }
7f52eac8 183
88a6bac5 184 # Group the variants to send to the solver
185 my @groups;
f629cb3b 186 my @use_ranks;
a44aaf2a 187 my %lacunae;
94654e27 188 my $moved = {};
88a6bac5 189 foreach my $rank ( @ranks ) {
7e17346f 190 my $missing = $lacunose->clone();
e23225e7 191 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 192 # Filter out any empty rankgroups
193 # (e.g. from the later rank for a transposition)
194 next unless keys %$rankgroup;
7e17346f 195 # Get the graph for this rankgroup
196 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 197 if( $opts{'exclude_type1'} ) {
198 # Check to see whether this is a "useful" group.
7e17346f 199 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 200 }
f629cb3b 201 push( @use_ranks, $rank );
7e17346f 202 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 203 $lacunae{$rank} = $missing;
d71100ed 204 }
4ce27d42 205 # Run the solver
7e17346f 206 my $answer = solve_variants( $dir, @groups );
fae07016 207
88a6bac5 208 # Do further analysis on the answer
a44aaf2a 209 my $conflict_count = 0;
7e17346f 210 my $reversion_count = 0;
f629cb3b 211 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 212 my $location = $answer->{'variants'}->[$idx];
213 # Add the rank back in
94654e27 214 my $rank = $use_ranks[$idx];
215 $location->{'id'} = $rank;
7234b01d 216 # Note what our lacunae are
f629cb3b 217 my %lmiss;
7234b01d 218 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 219 $location->{'missing'} = [ keys %lmiss ];
220
88a6bac5 221 # Run the extra analysis we need.
428bcf0b 222 ## TODO We run through all the variants in this call, so
223 ## why not add the reading data there instead of here below?
7e17346f 224 my $graph = $groups[$idx]->{graph};
225 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 226
638e2a95 227 my @layerwits;
7234b01d 228 # Do the final post-analysis tidying up of the data.
a44aaf2a 229 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 230 $conflict_count++ if $rdghash->{'is_conflict'};
231 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 232 # Add the reading text back in, setting display value as needed
a44aaf2a 233 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 234 if( $rdg ) {
235 $rdghash->{'text'} = $rdg->text .
236 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 237 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
238 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 239 }
f629cb3b 240 # Remove lacunose witnesses from this reading's list now that the
7234b01d 241 # analysis is done
f629cb3b 242 my @realgroup;
7234b01d 243 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 244 $rdghash->{'group'} = \@realgroup;
638e2a95 245 # Note any layered witnesses that appear in this group
246 foreach( @realgroup ) {
247 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
248 push( @layerwits, $1 );
249 }
250 }
a44aaf2a 251 }
638e2a95 252 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 253 }
a44aaf2a 254 $answer->{'conflict_count'} = $conflict_count;
7e17346f 255 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 256
88a6bac5 257 return $answer;
d71100ed 258}
259
e23225e7 260=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 261
262Groups the variants at the given $rank of the collation, treating any
e23225e7 263relationships in the set $merge_relationship_types as equivalent.
264$lacunose should be a reference to an array, to which the sigla of lacunose
265witnesses at this rank will be appended; $transposed should be a reference
266to a hash, wherein the identities of transposed readings and their
267relatives will be stored.
7f52eac8 268
ffa22d6f 269Returns a hash $group_readings where $rdg is attested by the witnesses listed
270in $group_readings->{$rdg}.
7f52eac8 271
272=cut
273
274# Return group_readings, groups, lacunose
d1348d38 275sub group_variants {
94654e27 276 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 277 my $c = $tradition->collation;
335a62ef 278 my $aclabel = $c->ac_label;
d120c995 279 my $table = $c->alignment_table;
7f52eac8 280 # Get the alignment table readings
281 my %readings_at_rank;
7e17346f 282 my $check_for_gaps = Set::Scalar->new();
94654e27 283 my %moved_wits;
d120c995 284 my $has_transposition;
285 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 286 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 287 my $wit = $tablewit->{'witness'};
ffa22d6f 288 # Exclude the witness if it is "lacunose" which if we got here
289 # means "not in the stemma".
7e17346f 290 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 291 # Note if the witness is actually in a lacuna
7f52eac8 292 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 293 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 294 # Otherwise the witness either has a positive reading...
7f52eac8 295 } elsif( $rdg ) {
94654e27 296 # If the reading has been counted elsewhere as a transposition, ignore it.
297 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 298 # TODO Does this cope with three-way transpositions?
94654e27 299 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
300 next;
301 }
302 # Otherwise, record it...
303 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
304 # ...and grab any transpositions, and their relations.
305 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
306 foreach my $trdg ( @transp ) {
d120c995 307 next if exists $readings_at_rank{$trdg->id};
308 $has_transposition = 1;
309 my @affected_wits = _table_witnesses(
7e17346f 310 $table, $trdg, $lacunose, $aclabel );
d120c995 311 next unless @affected_wits;
312 map { $moved_wits{$_} = 1 } @affected_wits;
313 $transposed->{$trdg->id} =
7e17346f 314 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 315 $readings_at_rank{$trdg->id} = $trdg;
316 }
317 # ...or it is empty, ergo a gap.
7f52eac8 318 } else {
7e17346f 319 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 320 }
321 }
7e17346f 322 my $gap_wits = Set::Scalar->new();
323 map { _add_to_witlist( $_, $gap_wits, $aclabel )
324 unless $moved_wits{$_} } $check_for_gaps->members;
325
326 # Group the readings, collapsing groups by relationship if needed.
d120c995 327 my $grouped_readings = {};
4ce27d42 328 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 329 # Skip readings that have been collapsed into others.
d120c995 330 next if exists $grouped_readings->{$rdg->id}
331 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 332 # Get the witness list, including from readings collapsed into this one.
7e17346f 333 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
e23225e7 334 if( $collapse && $collapse->size ) {
335 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 336 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 337 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 338 push( @wits, @otherwits );
d120c995 339 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 340 }
341 }
7e17346f 342 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 343 }
7e17346f 344 if( $gap_wits->members ) {
345 $grouped_readings->{'(omitted)'} = $gap_wits;
346 }
347
7f52eac8 348 # Get rid of our collapsed readings
7e17346f 349 map { delete $grouped_readings->{$_} if(
350 $grouped_readings->{$_} eq 'COLLAPSE'
351 || $grouped_readings->{$_}->is_empty ) }
352 keys %$grouped_readings;
d120c995 353
354 # If something was transposed, check the groups for doubled-up readings
355 if( $has_transposition ) {
62a39b8f 356 # print STDERR "Group for rank $rank:\n";
357 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
358 # keys %$grouped_readings;
d120c995 359 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
360 }
7f52eac8 361
4ce27d42 362 # Return the result
d120c995 363 return $grouped_readings;
364}
365
366# Helper function to query the alignment table for all witnesses (a.c. included)
367# that have a given reading at its rank.
368sub _table_witnesses {
369 my( $table, $trdg, $lacunose, $aclabel ) = @_;
370 my $tableidx = $trdg->rank - 1;
7e17346f 371 my $has_reading = Set::Scalar->new();
d120c995 372 foreach my $row ( @{$table->{'alignment'}} ) {
373 my $wit = $row->{'witness'};
7e17346f 374 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 375 my $rdg = $row->{'tokens'}->[$tableidx];
376 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 377 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 378 if $rdg->{'t'}->id eq $trdg->id;
379 }
7e17346f 380 return $has_reading->members;
381}
382
383# Helper function to see if a witness is lacunose even if we are asking about
384# the a.c. version
385sub _is_lacunose {
386 my ( $wit, $lac, $acstr ) = @_;
387 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
388 $wit = $1;
389 }
390 return $lac->has( $wit );
d1348d38 391}
392
335a62ef 393# Helper function to ensure that X and X a.c. never appear in the same list.
394sub _add_to_witlist {
395 my( $wit, $list, $acstr ) = @_;
335a62ef 396 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 397 # Don't add X a.c. if we already have X
398 return if $list->has( $1 );
335a62ef 399 } else {
7e17346f 400 # Delete X a.c. if we are about to add X
401 $list->delete( $wit.$acstr );
335a62ef 402 }
7e17346f 403 $list->insert( $wit );
335a62ef 404}
405
d120c995 406sub _check_transposed_consistency {
407 my( $c, $rank, $transposed, $groupings ) = @_;
408 my %seen_wits;
409 my %thisrank;
410 # Note which readings are actually at this rank, and which witnesses
411 # belong to which reading.
412 foreach my $rdg ( keys %$groupings ) {
413 my $rdgobj = $c->reading( $rdg );
414 # Count '(omitted)' as a reading at this rank
415 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
416 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
417 }
418 # Our work is done if we have no witness belonging to more than one
419 # reading.
420 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
421 return unless @doubled;
422 # If we have a symmetric related transposition, drop the non-rank readings.
423 if( @doubled == scalar keys %seen_wits ) {
424 foreach my $rdg ( keys %$groupings ) {
425 if( !$thisrank{$rdg} ) {
426 my $groupstr = wit_stringify( $groupings->{$rdg} );
427 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
428 keys %thisrank;
429 delete $groupings->{$rdg};
430 # If we found a group match, assume there is a symmetry happening.
431 # TODO think more about this
62a39b8f 432 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 433 unless( $matched ) {
434 delete $transposed->{$rdg};
435 warn "Found problem in evident symmetry with reading $rdg";
436 }
437 }
438 }
439 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
440 } else {
441 foreach my $dup ( @doubled ) {
442 foreach my $rdg ( @{$seen_wits{$dup}} ) {
443 next if $thisrank{$rdg};
444 next unless exists $groupings->{$rdg};
62a39b8f 445 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 446 delete $groupings->{$rdg};
447 delete $transposed->{$rdg};
448 }
449 }
450 # and put any now-orphaned readings into an 'omitted' reading.
451 foreach my $wit ( keys %seen_wits ) {
452 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 453 $groupings->{'(omitted)'} = Set::Scalar->new()
454 unless exists $groupings->{'(omitted)'};
d120c995 455 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
456 }
457 }
458 }
459}
460
7e17346f 461# For the given grouping, return its situation graph based on the stemma.
462sub _graph_for_grouping {
463 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
464 my $acwits = [];
465 my $extant = {};
466 foreach my $gs ( values %$grouping ) {
467 map {
468 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
469 push( @$acwits, $1 ) unless $lacunose->has( $1 );
470 } else {
471 $extant->{$_} = 1 unless $lacunose->has( $_ );
472 }
473 } $gs->members;
474 }
475 my $graph;
476 try {
477 # contig contains all extant wits and all hypothetical wits
478 # needed to make up the groups.
479 $graph = $stemma->situation_graph( $extant, $acwits );
480 } catch ( Text::Tradition::Error $e ) {
481 die "Could not extend graph with given extant and a.c. witnesses: "
482 . $e->message;
483 } catch {
484 die "Could not extend graph with a.c. witnesses @$acwits";
485 }
486 return $graph;
487}
88a6bac5 488
7e17346f 489=head2 solve_variants( $calcdir, @groups )
490
491Looks up the set of groups in the answers provided by the external graph solver
492service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 493
494The JSON has the form
495 { "graph": [ stemmagraph DOT string without newlines ],
496 "groupings": [ array of arrays of groups, one per rank ] }
497
498The answer has the form
499 { "variants" => [ array of variant location structures ],
500 "variant_count" => total,
501 "conflict_count" => number of conflicts detected,
502 "genealogical_count" => number of solutions found }
503
504=cut
505
506sub solve_variants {
7e17346f 507 my( $dir, @groups ) = @_;
335a62ef 508
7e17346f 509 ## For each graph/group combo, look it up in the DB.
335a62ef 510 ## Witness map is a HACK to get around limitations in node names from IDP
511 my $witness_map = {};
512 ## Variables to store answers as they come back
7e17346f 513 my $variants = [];
514 my $genealogical = 0; # counter
515 foreach my $graphproblem ( @groups ) {
516 # Initialize the result structure for this graph problem
517 my $vstruct = { readings => [] };
518 push( @$variants, $vstruct );
5c44c598 519
7e17346f 520 # Construct the calc result key and look up its answer
521 my $reqkey = _get_calc_key( $graphproblem );
522 my $scope = $dir->new_scope;
523 my $answer = $dir->lookup( $reqkey );
524 unless( $answer ) {
525 #warn "No answer found for graph problem $reqkey, moving on";
526 # Record the unsolved problem so that we can go get a solution
527 _save_problem( $graphproblem );
528 # Put just the request, with no real result, into vstruct
529 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
530 push( @{$vstruct->{readings}}, { readingid => $rid,
531 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
5c44c598 532 }
7e17346f 533 next;
5c44c598 534 }
7e17346f 535
536 ## The answer is a Text::Tradition::Analysis::Result containing a bunch
537 ## of information about this variant set. Record the information therein.
5c44c598 538
7e17346f 539 # 1. Did the group evaluate as genealogical?
540 $vstruct->{genealogical} = $answer->is_genealogical;
541 $genealogical++ if $answer->is_genealogical;
542
543 # 2. What are the calculated minimum groupings for each variant loc?
544 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
545 my $inputset = $graphproblem->{grouping}->{$rid};
546 my $minset = $answer->minimum_grouping_for( $inputset );
547 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 548 }
7e17346f 549
550 # 3. What are the sources and classes calculated for each witness?
551 $vstruct->{witcopy_types} = { $answer->classes };
552 $vstruct->{reading_roots} = {};
553 map { $vstruct->{reading_roots}->{$_} = 1 } $answer->sources;
554
88a6bac5 555 }
556
7e17346f 557 # Spit out any unsolved problems we encountered
e23225e7 558 # _list_unsolved();
7e17346f 559
88a6bac5 560 return { 'variants' => $variants,
561 'variant_count' => scalar @$variants,
562 'genealogical_count' => $genealogical };
563}
564
7e17346f 565sub _get_calc_key {
566 my( $graphproblem ) = @_;
567 my $graph = $graphproblem->{graph};
568 my $grouping = [ values %{$graphproblem->{grouping}} ];
569 my $key = Text::Tradition::Analysis::Result::string_from_graph_problem(
570 $graph, $grouping );
571 return md5_hex( encode_utf8( $key ) );
5c44c598 572}
573
7e17346f 574sub _save_problem {
575 my( $graphproblem ) = @_;
e23225e7 576 my $problem = Text::Tradition::Analysis::Result->new(
577 graph => $graphproblem->{graph},
578 setlist => [ values %{$graphproblem->{grouping}} ]
579 );
580 my $key = _get_calc_key( $graphproblem );
581 my( $str ) = $problem->problem_json;
582 say STDERR "Stashing unsolved problem $str at key $key";
583 $unsolved_problems->{$key} = $problem;
b4cb2d60 584}
585
7e17346f 586sub _list_unsolved {
587 #say STDERR "Problems needing a solution:";
e23225e7 588 my @problems = values %$unsolved_problems;
589 return unless @problems;
590 my $first = shift @problems;
591 map { say STDERR $_ } $first->problem_json( @problems );
b4cb2d60 592}
593
fae07016 594=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 595
fae07016 596Given the tradition, its stemma graph, and the solution from the graph solver,
597work out the rest of the information we want. For each reading we need missing,
5c44c598 598conflict, reading_parents, independent_occurrence, followed, not_followed,
599and follow_unknown. Alters the location_hash in place.
7f52eac8 600
601=cut
732152b1 602
fae07016 603sub analyze_location {
7e17346f 604 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 605 my $c = $tradition->collation;
fae07016 606
607 # Make a hash of all known node memberships, and make the subgraphs.
608 my $contig = {};
609 my $reading_roots = {};
610 my $subgraph = {};
638e2a95 611 my $acstr = $c->ac_label;
612 my @acwits;
5c44c598 613
614 my $NO_IDP;
615 if( exists $variant_row->{'reading_roots'} ) {
616 $reading_roots = delete $variant_row->{'reading_roots'};
617 } else {
618 warn "No reading source information from IDP - proceed at your own risk";
619 $NO_IDP = 1;
620 }
7e17346f 621 my $classinfo = delete $variant_row->{'witcopy_types'};
5c44c598 622
623 # Note which witnesses positively belong to which group. This information
624 # comes ultimately from the IDP solver.
625 # Also make a note of the reading's roots.
fae07016 626 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
627 my $rid = $rdghash->{'readingid'};
5c44c598 628 my @roots;
638e2a95 629 foreach my $wit ( @{$rdghash->{'group'}} ) {
630 $contig->{$wit} = $rid;
631 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
632 push( @acwits, $1 );
633 }
5c44c598 634 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
635 push( @roots, $wit );
636 }
638e2a95 637 }
5c44c598 638 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 639 }
7e17346f 640
fae07016 641 # Now that we have all the node group memberships, calculate followed/
bebec0e9 642 # non-followed/unknown values for each reading. Also figure out the
643 # reading's evident parent(s).
644 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 645 my $rid = $rdghash->{'readingid'};
428bcf0b 646 my $rdg = $c->reading( $rid );
5c44c598 647 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 648 my @reversions;
649 if( $classinfo ) {
650 @reversions = grep { $classinfo->{$_} eq 'revert' }
651 $rdghash->{'group'}->members;
652 }
5c44c598 653 my @group = @{$rdghash->{'group'}};
fae07016 654
655 # Start figuring things out.
7e17346f 656 $rdghash->{'followed'} = scalar( @group )
657 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 658 # Find the parent readings, if any, of this reading.
7e17346f 659 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 660 # Work out relationships between readings and their non-followed parent.
661 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
7e17346f 662 $rdghash->{'reading_parents'} = $sourceparents;
e23225e7 663
664 if( @reversions ) {
665 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
666 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
667 $rdghash->{'reversion_parents'} = $revparents;
668 }
bebec0e9 669
670 # Find the number of times this reading was altered, and the number of
671 # times we're not sure.
672 my( %nofollow, %unknownfollow );
5c44c598 673 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 674 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 675 if( $reading_roots->{$wchild} && $contig->{$wchild}
676 && $contig->{$wchild} ne $rid ) {
bebec0e9 677 # It definitely changed here.
678 $nofollow{$wchild} = 1;
679 } elsif( !($contig->{$wchild}) ) {
680 # The child is a hypothetical node not definitely in
681 # any group. Answer is unknown.
682 $unknownfollow{$wchild} = 1;
5c44c598 683 } # else it is either in our group, or it is a non-root node in a
684 # known group and therefore is presumed to have its reading from
685 # its group, not this link.
bebec0e9 686 }
687 }
688 $rdghash->{'not_followed'} = keys %nofollow;
689 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 690
691 # Now say whether this reading represents a conflict.
692 unless( $variant_row->{'genealogical'} ) {
7e17346f 693 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 694 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 695 }
696 }
697}
698
699sub _find_reading_parents {
700 my( $rid, $graph, $contig, @list ) = @_;
701 my $parenthash = {};
702 foreach my $wit ( @list ) {
703 # Look in the stemma graph to find this witness's extant or known-reading
704 # immediate ancestor(s), and look up the reading that each ancestor holds.
705 my @check = $graph->predecessors( $wit );
706 while( @check ) {
707 my @next;
708 foreach my $wparent( @check ) {
709 my $preading = $contig->{$wparent};
710 if( $preading && $preading ne $rid ) {
711 $parenthash->{$preading} = 1;
712 } else {
713 push( @next, $graph->predecessors( $wparent ) );
714 }
715 }
716 @check = @next;
717 }
718 }
719 return $parenthash;
720}
721
722sub _resolve_parent_relationships {
723 my( $c, $rid, $rdg, $rdgparents ) = @_;
724 foreach my $p ( keys %$rdgparents ) {
725 # Resolve the relationship of the parent to the reading, and
726 # save it in our hash.
727 my $pobj = $c->reading( $p );
728 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
729 my $phash = { 'label' => $prep };
730 if( $pobj ) {
731 my $rel = $c->get_relationship( $p, $rid );
732 if( $rel ) {
733 _add_to_hash( $rel, $phash );
734 } elsif( $rdg ) {
735 # First check for a transposed relationship
736 if( $rdg->rank != $pobj->rank ) {
737 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
738 next unless $ti->text eq $rdg->text;
739 $rel = $c->get_relationship( $ti, $pobj );
740 if( $rel ) {
741 _add_to_hash( $rel, $phash, 1 );
742 last;
743 }
744 }
745 unless( $rel ) {
746 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
747 next unless $ti->text eq $pobj->text;
748 $rel = $c->get_relationship( $ti, $rdg );
749 if( $rel ) {
750 _add_to_hash( $rel, $phash, 1 );
751 last;
752 }
753 }
5c44c598 754 }
755 }
7e17346f 756 unless( $rel ) {
757 # and then check for sheer word similarity.
758 my $rtext = $rdg->text;
759 my $ptext = $pobj->text;
760 if( similar( $rtext, $ptext ) ) {
761 # say STDERR "Words $rtext and $ptext judged similar";
762 $phash->{relation} = { type => 'wordsimilar' };
763 }
764 }
5c44c598 765 } else {
7e17346f 766 $phash->{relation} = { type => 'deletion' };
5c44c598 767 }
7e17346f 768 # Get the attributes of the parent object while we are here
769 $phash->{'text'} = $pobj->text if $pobj;
770 $phash->{'is_nonsense'} = $pobj->is_nonsense;
771 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
772 } elsif( $p eq '(omitted)' ) {
773 $phash->{relation} = { type => 'addition' };
774 }
775 # Save it
776 $rdgparents->{$p} = $phash;
777 }
d71100ed 778}
779
428bcf0b 780sub _add_to_hash {
781 my( $rel, $phash, $is_transposed ) = @_;
782 $phash->{relation} = { type => $rel->type };
783 $phash->{relation}->{transposed} = 1 if $is_transposed;
784 $phash->{relation}->{annotation} = $rel->annotation
785 if $rel->has_annotation;
786}
787
788=head2 similar( $word1, $word2 )
789
790Use Algorithm::Diff to get a sense of how close the words are to each other.
791This will hopefully handle substitutions a bit more nicely than Levenshtein.
792
793=cut
794
795#!/usr/bin/env perl
796
797sub similar {
798 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
799 my @let1 = split( '', lc( $word1 ) );
800 my @let2 = split( '', lc( $word2 ) );
801 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
802 my $mag = 0;
803 while( $diff->Next ) {
804 if( $diff->Same ) {
805 # Take off points for longer strings
806 my $cs = $diff->Range(1) - 2;
807 $cs = 0 if $cs < 0;
808 $mag -= $cs;
809 } elsif( !$diff->Items(1) ) {
810 $mag += $diff->Range(2);
811 } elsif( !$diff->Items(2) ) {
812 $mag += $diff->Range(1);
813 } else {
814 # Split the difference for substitutions
815 my $c1 = $diff->Range(1) || 1;
816 my $c2 = $diff->Range(2) || 1;
817 my $cd = ( $c1 + $c2 ) / 2;
818 $mag += $cd;
819 }
820 }
821 return ( $mag <= length( $word1 ) / 2 );
822}
823
6d25a3a0 824sub _prune_group {
5c44c598 825 my( $group, $graph ) = @_;
826 my $relevant = {};
827 # Record the existence of the vertices in the group
828 map { $relevant->{$_} = 1 } @$group;
6d25a3a0 829 # Make our subgraph
5c44c598 830 my $subgraph = $graph->deep_copy;
831 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
6d25a3a0 832 $subgraph->vertices;
6d25a3a0 833 # Now prune and return the remaining vertices.
5c44c598 834 _prune_subtree( $subgraph );
835 # Return the list of vertices and the list of roots.
836 my $pruned_group = [ sort $subgraph->vertices ];
837 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
838 return( $pruned_group, $pruned_roots );
6d25a3a0 839}
840
7f52eac8 841sub _prune_subtree {
5c44c598 842 my( $tree ) = @_;
94654e27 843
844 # Delete lacunose witnesses that have no successors
5c44c598 845 my @orphan_hypotheticals;
846 my $ctr = 0;
847 do {
848 die "Infinite loop on leaves" if $ctr > 100;
849 @orphan_hypotheticals =
850 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
851 $tree->successorless_vertices;
852 $tree->delete_vertices( @orphan_hypotheticals );
853 $ctr++;
854 } while( @orphan_hypotheticals );
94654e27 855
856 # Delete lacunose roots that have a single successor
857 my @redundant_root;
858 $ctr = 0;
859 do {
5c44c598 860 die "Infinite loop on roots" if $ctr > 100;
861 @redundant_root =
862 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
863 && $tree->successors( $_ ) == 1 }
864 $tree->predecessorless_vertices;
94654e27 865 $tree->delete_vertices( @redundant_root );
866 $ctr++;
867 } while( @redundant_root );
868}
869
5be0cdeb 870sub _useful_variant {
7e17346f 871 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 872
873 # Sort by group size and return
874 my $is_useful = 0;
7e17346f 875 foreach my $rdg ( keys %$rankgroup ) {
876 my @wits = $rankgroup->{$rdg}->members;
877 if( @wits > 1 ) {
5be0cdeb 878 $is_useful++;
879 } else {
7e17346f 880 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
881 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 882 }
883 }
7e17346f 884 return $is_useful > 1;
5be0cdeb 885}
886
7f52eac8 887=head2 wit_stringify( $groups )
888
889Takes an array of witness groupings and produces a string like
890['A','B'] / ['C','D','E'] / ['F']
d71100ed 891
7f52eac8 892=cut
d71100ed 893
894sub wit_stringify {
895 my $groups = shift;
896 my @gst;
897 # If we were passed an array of witnesses instead of an array of
898 # groupings, then "group" the witnesses first.
899 unless( ref( $groups->[0] ) ) {
900 my $mkgrp = [ $groups ];
901 $groups = $mkgrp;
902 }
903 foreach my $g ( @$groups ) {
904 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
905 }
906 return join( ' / ', @gst );
907}
7f52eac8 908
7f52eac8 9091;
910
911=head1 LICENSE
912
913This package is free software and is provided "as is" without express
914or implied warranty. You can redistribute it and/or modify it under
915the same terms as Perl itself.
916
917=head1 AUTHOR
918
919Tara L Andrews E<lt>aurum@cpan.orgE<gt>