small nomenclature rationalization; save reversion roots
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
e4386ba9 6use Benchmark;
7e17346f 7use Digest::MD5 qw/ md5_hex /;
88a6bac5 8use Encode qw/ encode_utf8 /;
d1348d38 9use Exporter 'import';
b4cb2d60 10use Graph;
7e17346f 11use JSON qw/ to_json /;
12use Set::Scalar;
d71100ed 13use Text::Tradition;
7e17346f 14use Text::Tradition::Analysis::Result;
15use Text::Tradition::Directory;
d71100ed 16use Text::Tradition::Stemma;
d120c995 17use TryCatch;
d71100ed 18
d1348d38 19use vars qw/ @EXPORT_OK /;
a2cf85dd 20@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 21
7e17346f 22my $unsolved_problems = {};
5c44c598 23
7f52eac8 24=head1 NAME
25
26Text::Tradition::Analysis - functions for stemma analysis of a tradition
27
28=head1 SYNOPSIS
29
30 use Text::Tradition;
31 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
32 my $t = Text::Tradition->new(
33 'name' => 'this is a text',
34 'input' => 'TEI',
35 'file' => '/path/to/tei_parallel_seg_file.xml' );
36 $t->add_stemma( 'dotfile' => $stemmafile );
37
38 my $variant_data = run_analysis( $tradition );
39 # Recalculate rank $n treating all orthographic variants as equivalent
40 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
41
42=head1 DESCRIPTION
43
44Text::Tradition is a library for representation and analysis of collated
45texts, particularly medieval ones. The Collation is the central feature of
46a Tradition, where the text, its sequence of readings, and its relationships
47between readings are actually kept.
48
49=head1 SUBROUTINES
50
88a6bac5 51=head2 run_analysis( $tradition, %opts )
7f52eac8 52
88a6bac5 53Runs the analysis described in analyze_variant_location on every location in the
54collation of the given tradition, with the given options. These include:
7f52eac8 55
88a6bac5 56=over 4
57
58=item * stemma_id - Specify which of the tradition's stemmata to use. Default
59is 0 (i.e. the first).
60
61=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
62
63=item * merge_types - Specify a list of relationship types, where related readings
64should be treated as identical for the purposes of analysis.
65
ffa22d6f 66=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
67
88a6bac5 68=back
7f52eac8 69
70=begin testing
71
72use Text::Tradition;
73use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
74
75my $datafile = 't/data/florilegium_tei_ps.xml';
76my $tradition = Text::Tradition->new( 'input' => 'TEI',
77 'name' => 'test0',
78 'file' => $datafile );
79my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
80is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
81
f00cefe8 82my %expected_genealogical = (
a44aaf2a 83 1 => 0,
f00cefe8 84 2 => 1,
a44aaf2a 85 3 => 0,
86 5 => 0,
87 7 => 0,
88 8 => 0,
89 10 => 0,
f00cefe8 90 13 => 1,
a44aaf2a 91 33 => 0,
92 34 => 0,
93 37 => 0,
94 60 => 0,
f00cefe8 95 81 => 1,
a44aaf2a 96 84 => 0,
97 87 => 0,
98 101 => 0,
99 102 => 0,
f00cefe8 100 122 => 1,
a44aaf2a 101 157 => 0,
f00cefe8 102 166 => 1,
103 169 => 1,
a44aaf2a 104 200 => 0,
f00cefe8 105 216 => 1,
106 217 => 1,
107 219 => 1,
108 241 => 1,
109 242 => 1,
110 243 => 1,
111);
112
7e17346f 113my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
7234b01d 114my $c = $tradition->collation;
f00cefe8 115foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 116 # Account for rows that used to be "not useful"
117 unless( exists $expected_genealogical{$row->{'id'}} ) {
118 $expected_genealogical{$row->{'id'}} = 1;
119 }
18f48b82 120 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
121 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 122 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 123 # Check that we have the right row with the right groups
124 my $rank = $row->{'id'};
125 foreach my $rdghash ( @{$row->{'readings'}} ) {
126 # Skip 'readings' that aren't really
127 next unless $c->reading( $rdghash->{'readingid'} );
128 # Check the rank
129 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
130 "Got correct reading rank" );
131 # Check the witnesses
132 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
133 my @sgrp = sort @{$rdghash->{'group'}};
134 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
135 }
f00cefe8 136}
a44aaf2a 137is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 138# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 139
140=end testing
141
142=cut
143
d71100ed 144sub run_analysis {
88a6bac5 145 my( $tradition, %opts ) = @_;
f00cefe8 146 my $c = $tradition->collation;
7e17346f 147 my $aclabel = $c->ac_label;
88a6bac5 148
149 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 150 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
e23225e7 151 my $collapse = Set::Scalar->new();
152 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
153 $collapse->insert( @{$opts{'merge_types'}} );
154 } elsif( $opts{'merge_types'} ) {
155 $collapse->insert( $opts{'merge_types'} );
156 }
7e17346f 157
158 # Make sure we have a lookup DB for graph calculation results; this will die
159 # if calcdir or calcdsn isn't passed.
160 my $dir = $opts{'calcdir'} ? delete $opts{'calcdir'}
161 : Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
88a6bac5 162
163 # Get the stemma
164 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 165
4ce27d42 166 # Figure out which witnesses we are working with - that is, the ones that
167 # appear both in the stemma and in the tradition. All others are 'lacunose'
168 # for our purposes.
7e17346f 169 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
170 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
171 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
172 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
88a6bac5 173
174 # Find and mark 'common' ranks for exclusion, unless they were
175 # explicitly specified.
176 unless( @ranks ) {
177 my %common_rank;
a44aaf2a 178 foreach my $rdg ( $c->common_readings ) {
88a6bac5 179 $common_rank{$rdg->rank} = 1;
180 }
181 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 182 }
7f52eac8 183
88a6bac5 184 # Group the variants to send to the solver
185 my @groups;
f629cb3b 186 my @use_ranks;
a44aaf2a 187 my %lacunae;
94654e27 188 my $moved = {};
88a6bac5 189 foreach my $rank ( @ranks ) {
7e17346f 190 my $missing = $lacunose->clone();
e23225e7 191 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
94654e27 192 # Filter out any empty rankgroups
193 # (e.g. from the later rank for a transposition)
194 next unless keys %$rankgroup;
7e17346f 195 # Get the graph for this rankgroup
196 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
ffa22d6f 197 if( $opts{'exclude_type1'} ) {
198 # Check to see whether this is a "useful" group.
7e17346f 199 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
ffa22d6f 200 }
f629cb3b 201 push( @use_ranks, $rank );
7e17346f 202 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
a44aaf2a 203 $lacunae{$rank} = $missing;
d71100ed 204 }
4ce27d42 205 # Run the solver
7e17346f 206 my $answer = solve_variants( $dir, @groups );
fae07016 207
88a6bac5 208 # Do further analysis on the answer
a44aaf2a 209 my $conflict_count = 0;
7e17346f 210 my $reversion_count = 0;
f629cb3b 211 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 212 my $location = $answer->{'variants'}->[$idx];
213 # Add the rank back in
94654e27 214 my $rank = $use_ranks[$idx];
215 $location->{'id'} = $rank;
7234b01d 216 # Note what our lacunae are
f629cb3b 217 my %lmiss;
7234b01d 218 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 219 $location->{'missing'} = [ keys %lmiss ];
220
88a6bac5 221 # Run the extra analysis we need.
428bcf0b 222 ## TODO We run through all the variants in this call, so
223 ## why not add the reading data there instead of here below?
7e17346f 224 my $graph = $groups[$idx]->{graph};
225 analyze_location( $tradition, $graph, $location, \%lmiss );
7234b01d 226
638e2a95 227 my @layerwits;
7234b01d 228 # Do the final post-analysis tidying up of the data.
a44aaf2a 229 foreach my $rdghash ( @{$location->{'readings'}} ) {
e23225e7 230 $conflict_count++ if $rdghash->{'is_conflict'};
231 $reversion_count++ if $rdghash->{'is_reverted'};
94654e27 232 # Add the reading text back in, setting display value as needed
a44aaf2a 233 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 234 if( $rdg ) {
235 $rdghash->{'text'} = $rdg->text .
236 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 237 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
238 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 239 }
f629cb3b 240 # Remove lacunose witnesses from this reading's list now that the
7234b01d 241 # analysis is done
f629cb3b 242 my @realgroup;
7234b01d 243 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 244 $rdghash->{'group'} = \@realgroup;
638e2a95 245 # Note any layered witnesses that appear in this group
246 foreach( @realgroup ) {
247 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
248 push( @layerwits, $1 );
249 }
250 }
a44aaf2a 251 }
638e2a95 252 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 253 }
a44aaf2a 254 $answer->{'conflict_count'} = $conflict_count;
7e17346f 255 $answer->{'reversion_count'} = $reversion_count;
f00cefe8 256
88a6bac5 257 return $answer;
d71100ed 258}
259
e23225e7 260=head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
7f52eac8 261
262Groups the variants at the given $rank of the collation, treating any
e23225e7 263relationships in the set $merge_relationship_types as equivalent.
264$lacunose should be a reference to an array, to which the sigla of lacunose
265witnesses at this rank will be appended; $transposed should be a reference
266to a hash, wherein the identities of transposed readings and their
267relatives will be stored.
7f52eac8 268
ffa22d6f 269Returns a hash $group_readings where $rdg is attested by the witnesses listed
270in $group_readings->{$rdg}.
7f52eac8 271
272=cut
273
274# Return group_readings, groups, lacunose
d1348d38 275sub group_variants {
94654e27 276 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 277 my $c = $tradition->collation;
335a62ef 278 my $aclabel = $c->ac_label;
d120c995 279 my $table = $c->alignment_table;
7f52eac8 280 # Get the alignment table readings
281 my %readings_at_rank;
7e17346f 282 my $check_for_gaps = Set::Scalar->new();
94654e27 283 my %moved_wits;
d120c995 284 my $has_transposition;
285 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 286 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 287 my $wit = $tablewit->{'witness'};
ffa22d6f 288 # Exclude the witness if it is "lacunose" which if we got here
289 # means "not in the stemma".
7e17346f 290 next if _is_lacunose( $wit, $lacunose, $aclabel );
94654e27 291 # Note if the witness is actually in a lacuna
7f52eac8 292 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 293 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 294 # Otherwise the witness either has a positive reading...
7f52eac8 295 } elsif( $rdg ) {
94654e27 296 # If the reading has been counted elsewhere as a transposition, ignore it.
297 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 298 # TODO Does this cope with three-way transpositions?
94654e27 299 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
300 next;
301 }
302 # Otherwise, record it...
303 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
304 # ...and grab any transpositions, and their relations.
305 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
306 foreach my $trdg ( @transp ) {
d120c995 307 next if exists $readings_at_rank{$trdg->id};
308 $has_transposition = 1;
309 my @affected_wits = _table_witnesses(
7e17346f 310 $table, $trdg, $lacunose, $aclabel );
d120c995 311 next unless @affected_wits;
312 map { $moved_wits{$_} = 1 } @affected_wits;
313 $transposed->{$trdg->id} =
7e17346f 314 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
94654e27 315 $readings_at_rank{$trdg->id} = $trdg;
316 }
317 # ...or it is empty, ergo a gap.
7f52eac8 318 } else {
7e17346f 319 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
7f52eac8 320 }
321 }
7e17346f 322 my $gap_wits = Set::Scalar->new();
323 map { _add_to_witlist( $_, $gap_wits, $aclabel )
324 unless $moved_wits{$_} } $check_for_gaps->members;
325
326 # Group the readings, collapsing groups by relationship if needed.
d120c995 327 my $grouped_readings = {};
4ce27d42 328 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 329 # Skip readings that have been collapsed into others.
d120c995 330 next if exists $grouped_readings->{$rdg->id}
331 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 332 # Get the witness list, including from readings collapsed into this one.
7e17346f 333 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
e23225e7 334 if( $collapse && $collapse->size ) {
335 my $filter = sub { $collapse->has( $_[0]->type ) };
7f52eac8 336 foreach my $other ( $rdg->related_readings( $filter ) ) {
7e17346f 337 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
fae07016 338 push( @wits, @otherwits );
d120c995 339 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 340 }
341 }
7e17346f 342 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
7f52eac8 343 }
7e17346f 344 if( $gap_wits->members ) {
345 $grouped_readings->{'(omitted)'} = $gap_wits;
346 }
347
7f52eac8 348 # Get rid of our collapsed readings
7e17346f 349 map { delete $grouped_readings->{$_} if(
350 $grouped_readings->{$_} eq 'COLLAPSE'
351 || $grouped_readings->{$_}->is_empty ) }
352 keys %$grouped_readings;
d120c995 353
354 # If something was transposed, check the groups for doubled-up readings
355 if( $has_transposition ) {
62a39b8f 356 # print STDERR "Group for rank $rank:\n";
357 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
358 # keys %$grouped_readings;
d120c995 359 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
360 }
7f52eac8 361
4ce27d42 362 # Return the result
d120c995 363 return $grouped_readings;
364}
365
366# Helper function to query the alignment table for all witnesses (a.c. included)
367# that have a given reading at its rank.
368sub _table_witnesses {
369 my( $table, $trdg, $lacunose, $aclabel ) = @_;
370 my $tableidx = $trdg->rank - 1;
7e17346f 371 my $has_reading = Set::Scalar->new();
d120c995 372 foreach my $row ( @{$table->{'alignment'}} ) {
373 my $wit = $row->{'witness'};
7e17346f 374 next if _is_lacunose( $wit, $lacunose, $aclabel );
d120c995 375 my $rdg = $row->{'tokens'}->[$tableidx];
376 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
7e17346f 377 _add_to_witlist( $wit, $has_reading, $aclabel )
d120c995 378 if $rdg->{'t'}->id eq $trdg->id;
379 }
7e17346f 380 return $has_reading->members;
381}
382
383# Helper function to see if a witness is lacunose even if we are asking about
384# the a.c. version
385sub _is_lacunose {
386 my ( $wit, $lac, $acstr ) = @_;
387 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
388 $wit = $1;
389 }
390 return $lac->has( $wit );
d1348d38 391}
392
335a62ef 393# Helper function to ensure that X and X a.c. never appear in the same list.
394sub _add_to_witlist {
395 my( $wit, $list, $acstr ) = @_;
335a62ef 396 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
7e17346f 397 # Don't add X a.c. if we already have X
398 return if $list->has( $1 );
335a62ef 399 } else {
7e17346f 400 # Delete X a.c. if we are about to add X
401 $list->delete( $wit.$acstr );
335a62ef 402 }
7e17346f 403 $list->insert( $wit );
335a62ef 404}
405
d120c995 406sub _check_transposed_consistency {
407 my( $c, $rank, $transposed, $groupings ) = @_;
408 my %seen_wits;
409 my %thisrank;
410 # Note which readings are actually at this rank, and which witnesses
411 # belong to which reading.
412 foreach my $rdg ( keys %$groupings ) {
413 my $rdgobj = $c->reading( $rdg );
414 # Count '(omitted)' as a reading at this rank
415 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
416 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
417 }
418 # Our work is done if we have no witness belonging to more than one
419 # reading.
420 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
421 return unless @doubled;
422 # If we have a symmetric related transposition, drop the non-rank readings.
423 if( @doubled == scalar keys %seen_wits ) {
424 foreach my $rdg ( keys %$groupings ) {
425 if( !$thisrank{$rdg} ) {
426 my $groupstr = wit_stringify( $groupings->{$rdg} );
427 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
428 keys %thisrank;
429 delete $groupings->{$rdg};
430 # If we found a group match, assume there is a symmetry happening.
431 # TODO think more about this
62a39b8f 432 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 433 unless( $matched ) {
434 delete $transposed->{$rdg};
435 warn "Found problem in evident symmetry with reading $rdg";
436 }
437 }
438 }
439 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
440 } else {
441 foreach my $dup ( @doubled ) {
442 foreach my $rdg ( @{$seen_wits{$dup}} ) {
443 next if $thisrank{$rdg};
444 next unless exists $groupings->{$rdg};
62a39b8f 445 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 446 delete $groupings->{$rdg};
447 delete $transposed->{$rdg};
448 }
449 }
450 # and put any now-orphaned readings into an 'omitted' reading.
451 foreach my $wit ( keys %seen_wits ) {
452 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
7e17346f 453 $groupings->{'(omitted)'} = Set::Scalar->new()
454 unless exists $groupings->{'(omitted)'};
d120c995 455 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
456 }
457 }
458 }
459}
460
7e17346f 461# For the given grouping, return its situation graph based on the stemma.
462sub _graph_for_grouping {
463 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
464 my $acwits = [];
465 my $extant = {};
466 foreach my $gs ( values %$grouping ) {
467 map {
468 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
469 push( @$acwits, $1 ) unless $lacunose->has( $1 );
470 } else {
471 $extant->{$_} = 1 unless $lacunose->has( $_ );
472 }
473 } $gs->members;
474 }
475 my $graph;
476 try {
477 # contig contains all extant wits and all hypothetical wits
478 # needed to make up the groups.
479 $graph = $stemma->situation_graph( $extant, $acwits );
480 } catch ( Text::Tradition::Error $e ) {
481 die "Could not extend graph with given extant and a.c. witnesses: "
482 . $e->message;
483 } catch {
484 die "Could not extend graph with a.c. witnesses @$acwits";
485 }
486 return $graph;
487}
88a6bac5 488
7e17346f 489=head2 solve_variants( $calcdir, @groups )
490
491Looks up the set of groups in the answers provided by the external graph solver
492service and returns a cleaned-up answer, adding the rank IDs back where they belong.
88a6bac5 493
494The JSON has the form
495 { "graph": [ stemmagraph DOT string without newlines ],
496 "groupings": [ array of arrays of groups, one per rank ] }
497
498The answer has the form
499 { "variants" => [ array of variant location structures ],
500 "variant_count" => total,
501 "conflict_count" => number of conflicts detected,
502 "genealogical_count" => number of solutions found }
503
504=cut
505
506sub solve_variants {
7e17346f 507 my( $dir, @groups ) = @_;
335a62ef 508
7e17346f 509 ## For each graph/group combo, look it up in the DB.
335a62ef 510 ## Witness map is a HACK to get around limitations in node names from IDP
511 my $witness_map = {};
512 ## Variables to store answers as they come back
7e17346f 513 my $variants = [];
514 my $genealogical = 0; # counter
515 foreach my $graphproblem ( @groups ) {
516 # Initialize the result structure for this graph problem
517 my $vstruct = { readings => [] };
518 push( @$variants, $vstruct );
5c44c598 519
7e17346f 520 # Construct the calc result key and look up its answer
521 my $reqkey = _get_calc_key( $graphproblem );
522 my $scope = $dir->new_scope;
523 my $answer = $dir->lookup( $reqkey );
524 unless( $answer ) {
525 #warn "No answer found for graph problem $reqkey, moving on";
526 # Record the unsolved problem so that we can go get a solution
527 _save_problem( $graphproblem );
528 # Put just the request, with no real result, into vstruct
529 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
530 push( @{$vstruct->{readings}}, { readingid => $rid,
531 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
5c44c598 532 }
7e17346f 533 next;
5c44c598 534 }
7e17346f 535
536 ## The answer is a Text::Tradition::Analysis::Result containing a bunch
537 ## of information about this variant set. Record the information therein.
5c44c598 538
7e17346f 539 # 1. Did the group evaluate as genealogical?
540 $vstruct->{genealogical} = $answer->is_genealogical;
541 $genealogical++ if $answer->is_genealogical;
542
543 # 2. What are the calculated minimum groupings for each variant loc?
544 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
545 my $inputset = $graphproblem->{grouping}->{$rid};
546 my $minset = $answer->minimum_grouping_for( $inputset );
547 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
6d25a3a0 548 }
7e17346f 549
550 # 3. What are the sources and classes calculated for each witness?
551 $vstruct->{witcopy_types} = { $answer->classes };
552 $vstruct->{reading_roots} = {};
553 map { $vstruct->{reading_roots}->{$_} = 1 } $answer->sources;
554
88a6bac5 555 }
556
7e17346f 557 # Spit out any unsolved problems we encountered
e23225e7 558 # _list_unsolved();
7e17346f 559
88a6bac5 560 return { 'variants' => $variants,
561 'variant_count' => scalar @$variants,
562 'genealogical_count' => $genealogical };
563}
564
7e17346f 565sub _get_calc_key {
566 my( $graphproblem ) = @_;
567 my $graph = $graphproblem->{graph};
568 my $grouping = [ values %{$graphproblem->{grouping}} ];
569 my $key = Text::Tradition::Analysis::Result::string_from_graph_problem(
570 $graph, $grouping );
571 return md5_hex( encode_utf8( $key ) );
5c44c598 572}
573
7e17346f 574sub _save_problem {
575 my( $graphproblem ) = @_;
e23225e7 576 my $problem = Text::Tradition::Analysis::Result->new(
577 graph => $graphproblem->{graph},
578 setlist => [ values %{$graphproblem->{grouping}} ]
579 );
580 my $key = _get_calc_key( $graphproblem );
581 my( $str ) = $problem->problem_json;
582 say STDERR "Stashing unsolved problem $str at key $key";
583 $unsolved_problems->{$key} = $problem;
b4cb2d60 584}
585
7e17346f 586sub _list_unsolved {
587 #say STDERR "Problems needing a solution:";
e23225e7 588 my @problems = values %$unsolved_problems;
589 return unless @problems;
590 my $first = shift @problems;
591 map { say STDERR $_ } $first->problem_json( @problems );
b4cb2d60 592}
593
fae07016 594=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 595
fae07016 596Given the tradition, its stemma graph, and the solution from the graph solver,
597work out the rest of the information we want. For each reading we need missing,
5c44c598 598conflict, reading_parents, independent_occurrence, followed, not_followed,
599and follow_unknown. Alters the location_hash in place.
7f52eac8 600
601=cut
732152b1 602
fae07016 603sub analyze_location {
7e17346f 604 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
94654e27 605 my $c = $tradition->collation;
fae07016 606
607 # Make a hash of all known node memberships, and make the subgraphs.
608 my $contig = {};
609 my $reading_roots = {};
610 my $subgraph = {};
638e2a95 611 my $acstr = $c->ac_label;
612 my @acwits;
5c44c598 613
614 my $NO_IDP;
615 if( exists $variant_row->{'reading_roots'} ) {
616 $reading_roots = delete $variant_row->{'reading_roots'};
617 } else {
618 warn "No reading source information from IDP - proceed at your own risk";
619 $NO_IDP = 1;
620 }
7e17346f 621 my $classinfo = delete $variant_row->{'witcopy_types'};
5c44c598 622
623 # Note which witnesses positively belong to which group. This information
624 # comes ultimately from the IDP solver.
625 # Also make a note of the reading's roots.
fae07016 626 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
627 my $rid = $rdghash->{'readingid'};
5c44c598 628 my @roots;
638e2a95 629 foreach my $wit ( @{$rdghash->{'group'}} ) {
630 $contig->{$wit} = $rid;
631 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
632 push( @acwits, $1 );
633 }
5c44c598 634 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
635 push( @roots, $wit );
636 }
638e2a95 637 }
5c44c598 638 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 639 }
7e17346f 640
fae07016 641 # Now that we have all the node group memberships, calculate followed/
bebec0e9 642 # non-followed/unknown values for each reading. Also figure out the
643 # reading's evident parent(s).
644 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 645 my $rid = $rdghash->{'readingid'};
428bcf0b 646 my $rdg = $c->reading( $rid );
5c44c598 647 my @roots = @{$rdghash->{'independent_occurrence'}};
7e17346f 648 my @reversions;
649 if( $classinfo ) {
650 @reversions = grep { $classinfo->{$_} eq 'revert' }
651 $rdghash->{'group'}->members;
be590045 652 $rdghash->{'reversions'} = \@reversions;
7e17346f 653 }
5c44c598 654 my @group = @{$rdghash->{'group'}};
fae07016 655
656 # Start figuring things out.
7e17346f 657 $rdghash->{'followed'} = scalar( @group )
658 - ( scalar( @roots ) + scalar( @reversions ) );
bebec0e9 659 # Find the parent readings, if any, of this reading.
7e17346f 660 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
7e17346f 661 # Work out relationships between readings and their non-followed parent.
662 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
be590045 663 $rdghash->{'source_parents'} = $sourceparents;
e23225e7 664
665 if( @reversions ) {
666 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
667 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
668 $rdghash->{'reversion_parents'} = $revparents;
669 }
bebec0e9 670
671 # Find the number of times this reading was altered, and the number of
672 # times we're not sure.
673 my( %nofollow, %unknownfollow );
5c44c598 674 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 675 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 676 if( $reading_roots->{$wchild} && $contig->{$wchild}
677 && $contig->{$wchild} ne $rid ) {
bebec0e9 678 # It definitely changed here.
679 $nofollow{$wchild} = 1;
680 } elsif( !($contig->{$wchild}) ) {
681 # The child is a hypothetical node not definitely in
682 # any group. Answer is unknown.
683 $unknownfollow{$wchild} = 1;
5c44c598 684 } # else it is either in our group, or it is a non-root node in a
685 # known group and therefore is presumed to have its reading from
686 # its group, not this link.
bebec0e9 687 }
688 }
689 $rdghash->{'not_followed'} = keys %nofollow;
690 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 691
692 # Now say whether this reading represents a conflict.
693 unless( $variant_row->{'genealogical'} ) {
7e17346f 694 $rdghash->{'is_conflict'} = @roots != 1;
e23225e7 695 $rdghash->{'is_reverted'} = scalar @reversions;
7e17346f 696 }
697 }
698}
699
700sub _find_reading_parents {
701 my( $rid, $graph, $contig, @list ) = @_;
702 my $parenthash = {};
703 foreach my $wit ( @list ) {
704 # Look in the stemma graph to find this witness's extant or known-reading
705 # immediate ancestor(s), and look up the reading that each ancestor holds.
706 my @check = $graph->predecessors( $wit );
707 while( @check ) {
708 my @next;
709 foreach my $wparent( @check ) {
710 my $preading = $contig->{$wparent};
711 if( $preading && $preading ne $rid ) {
712 $parenthash->{$preading} = 1;
713 } else {
714 push( @next, $graph->predecessors( $wparent ) );
715 }
716 }
717 @check = @next;
718 }
719 }
720 return $parenthash;
721}
722
723sub _resolve_parent_relationships {
724 my( $c, $rid, $rdg, $rdgparents ) = @_;
725 foreach my $p ( keys %$rdgparents ) {
726 # Resolve the relationship of the parent to the reading, and
727 # save it in our hash.
728 my $pobj = $c->reading( $p );
729 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
730 my $phash = { 'label' => $prep };
731 if( $pobj ) {
732 my $rel = $c->get_relationship( $p, $rid );
733 if( $rel ) {
734 _add_to_hash( $rel, $phash );
735 } elsif( $rdg ) {
736 # First check for a transposed relationship
737 if( $rdg->rank != $pobj->rank ) {
738 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
739 next unless $ti->text eq $rdg->text;
740 $rel = $c->get_relationship( $ti, $pobj );
741 if( $rel ) {
742 _add_to_hash( $rel, $phash, 1 );
743 last;
744 }
745 }
746 unless( $rel ) {
747 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
748 next unless $ti->text eq $pobj->text;
749 $rel = $c->get_relationship( $ti, $rdg );
750 if( $rel ) {
751 _add_to_hash( $rel, $phash, 1 );
752 last;
753 }
754 }
5c44c598 755 }
756 }
7e17346f 757 unless( $rel ) {
758 # and then check for sheer word similarity.
759 my $rtext = $rdg->text;
760 my $ptext = $pobj->text;
761 if( similar( $rtext, $ptext ) ) {
762 # say STDERR "Words $rtext and $ptext judged similar";
763 $phash->{relation} = { type => 'wordsimilar' };
764 }
765 }
5c44c598 766 } else {
7e17346f 767 $phash->{relation} = { type => 'deletion' };
5c44c598 768 }
7e17346f 769 # Get the attributes of the parent object while we are here
770 $phash->{'text'} = $pobj->text if $pobj;
771 $phash->{'is_nonsense'} = $pobj->is_nonsense;
772 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
773 } elsif( $p eq '(omitted)' ) {
774 $phash->{relation} = { type => 'addition' };
775 }
776 # Save it
777 $rdgparents->{$p} = $phash;
778 }
d71100ed 779}
780
428bcf0b 781sub _add_to_hash {
782 my( $rel, $phash, $is_transposed ) = @_;
783 $phash->{relation} = { type => $rel->type };
784 $phash->{relation}->{transposed} = 1 if $is_transposed;
785 $phash->{relation}->{annotation} = $rel->annotation
786 if $rel->has_annotation;
787}
788
789=head2 similar( $word1, $word2 )
790
791Use Algorithm::Diff to get a sense of how close the words are to each other.
792This will hopefully handle substitutions a bit more nicely than Levenshtein.
793
794=cut
795
796#!/usr/bin/env perl
797
798sub similar {
799 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
800 my @let1 = split( '', lc( $word1 ) );
801 my @let2 = split( '', lc( $word2 ) );
802 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
803 my $mag = 0;
804 while( $diff->Next ) {
805 if( $diff->Same ) {
806 # Take off points for longer strings
807 my $cs = $diff->Range(1) - 2;
808 $cs = 0 if $cs < 0;
809 $mag -= $cs;
810 } elsif( !$diff->Items(1) ) {
811 $mag += $diff->Range(2);
812 } elsif( !$diff->Items(2) ) {
813 $mag += $diff->Range(1);
814 } else {
815 # Split the difference for substitutions
816 my $c1 = $diff->Range(1) || 1;
817 my $c2 = $diff->Range(2) || 1;
818 my $cd = ( $c1 + $c2 ) / 2;
819 $mag += $cd;
820 }
821 }
822 return ( $mag <= length( $word1 ) / 2 );
823}
824
6d25a3a0 825sub _prune_group {
5c44c598 826 my( $group, $graph ) = @_;
827 my $relevant = {};
828 # Record the existence of the vertices in the group
829 map { $relevant->{$_} = 1 } @$group;
6d25a3a0 830 # Make our subgraph
5c44c598 831 my $subgraph = $graph->deep_copy;
832 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
6d25a3a0 833 $subgraph->vertices;
6d25a3a0 834 # Now prune and return the remaining vertices.
5c44c598 835 _prune_subtree( $subgraph );
836 # Return the list of vertices and the list of roots.
837 my $pruned_group = [ sort $subgraph->vertices ];
838 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
839 return( $pruned_group, $pruned_roots );
6d25a3a0 840}
841
7f52eac8 842sub _prune_subtree {
5c44c598 843 my( $tree ) = @_;
94654e27 844
845 # Delete lacunose witnesses that have no successors
5c44c598 846 my @orphan_hypotheticals;
847 my $ctr = 0;
848 do {
849 die "Infinite loop on leaves" if $ctr > 100;
850 @orphan_hypotheticals =
851 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
852 $tree->successorless_vertices;
853 $tree->delete_vertices( @orphan_hypotheticals );
854 $ctr++;
855 } while( @orphan_hypotheticals );
94654e27 856
857 # Delete lacunose roots that have a single successor
858 my @redundant_root;
859 $ctr = 0;
860 do {
5c44c598 861 die "Infinite loop on roots" if $ctr > 100;
862 @redundant_root =
863 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
864 && $tree->successors( $_ ) == 1 }
865 $tree->predecessorless_vertices;
94654e27 866 $tree->delete_vertices( @redundant_root );
867 $ctr++;
868 } while( @redundant_root );
869}
870
5be0cdeb 871sub _useful_variant {
7e17346f 872 my( $rankgroup, $rankgraph, $acstr ) = @_;
5be0cdeb 873
874 # Sort by group size and return
875 my $is_useful = 0;
7e17346f 876 foreach my $rdg ( keys %$rankgroup ) {
877 my @wits = $rankgroup->{$rdg}->members;
878 if( @wits > 1 ) {
5be0cdeb 879 $is_useful++;
880 } else {
7e17346f 881 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
882 || $wits[0] =~ /\Q$acstr\E$/ );
5be0cdeb 883 }
884 }
7e17346f 885 return $is_useful > 1;
5be0cdeb 886}
887
7f52eac8 888=head2 wit_stringify( $groups )
889
890Takes an array of witness groupings and produces a string like
891['A','B'] / ['C','D','E'] / ['F']
d71100ed 892
7f52eac8 893=cut
d71100ed 894
895sub wit_stringify {
896 my $groups = shift;
897 my @gst;
898 # If we were passed an array of witnesses instead of an array of
899 # groupings, then "group" the witnesses first.
900 unless( ref( $groups->[0] ) ) {
901 my $mkgrp = [ $groups ];
902 $groups = $mkgrp;
903 }
904 foreach my $g ( @$groups ) {
905 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
906 }
907 return join( ' / ', @gst );
908}
7f52eac8 909
7f52eac8 9101;
911
912=head1 LICENSE
913
914This package is free software and is provided "as is" without express
915or implied warranty. You can redistribute it and/or modify it under
916the same terms as Perl itself.
917
918=head1 AUTHOR
919
920Tara L Andrews E<lt>aurum@cpan.orgE<gt>