minor tweaks to display logic
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
d120c995 13use TryCatch;
d71100ed 14
d1348d38 15use vars qw/ @EXPORT_OK /;
a2cf85dd 16@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 17
7f52eac8 18=head1 NAME
19
20Text::Tradition::Analysis - functions for stemma analysis of a tradition
21
22=head1 SYNOPSIS
23
24 use Text::Tradition;
25 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
26 my $t = Text::Tradition->new(
27 'name' => 'this is a text',
28 'input' => 'TEI',
29 'file' => '/path/to/tei_parallel_seg_file.xml' );
30 $t->add_stemma( 'dotfile' => $stemmafile );
31
32 my $variant_data = run_analysis( $tradition );
33 # Recalculate rank $n treating all orthographic variants as equivalent
34 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
35
36=head1 DESCRIPTION
37
38Text::Tradition is a library for representation and analysis of collated
39texts, particularly medieval ones. The Collation is the central feature of
40a Tradition, where the text, its sequence of readings, and its relationships
41between readings are actually kept.
42
43=head1 SUBROUTINES
44
88a6bac5 45=head2 run_analysis( $tradition, %opts )
7f52eac8 46
88a6bac5 47Runs the analysis described in analyze_variant_location on every location in the
48collation of the given tradition, with the given options. These include:
7f52eac8 49
88a6bac5 50=over 4
51
52=item * stemma_id - Specify which of the tradition's stemmata to use. Default
53is 0 (i.e. the first).
54
55=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
56
57=item * merge_types - Specify a list of relationship types, where related readings
58should be treated as identical for the purposes of analysis.
59
ffa22d6f 60=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
61
88a6bac5 62=back
7f52eac8 63
64=begin testing
65
66use Text::Tradition;
67use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
68
69my $datafile = 't/data/florilegium_tei_ps.xml';
70my $tradition = Text::Tradition->new( 'input' => 'TEI',
71 'name' => 'test0',
72 'file' => $datafile );
73my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
74is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
75
f00cefe8 76my %expected_genealogical = (
a44aaf2a 77 1 => 0,
f00cefe8 78 2 => 1,
a44aaf2a 79 3 => 0,
80 5 => 0,
81 7 => 0,
82 8 => 0,
83 10 => 0,
f00cefe8 84 13 => 1,
a44aaf2a 85 33 => 0,
86 34 => 0,
87 37 => 0,
88 60 => 0,
f00cefe8 89 81 => 1,
a44aaf2a 90 84 => 0,
91 87 => 0,
92 101 => 0,
93 102 => 0,
f00cefe8 94 122 => 1,
a44aaf2a 95 157 => 0,
f00cefe8 96 166 => 1,
97 169 => 1,
a44aaf2a 98 200 => 0,
f00cefe8 99 216 => 1,
100 217 => 1,
101 219 => 1,
102 241 => 1,
103 242 => 1,
104 243 => 1,
105);
106
7f52eac8 107my $data = run_analysis( $tradition );
7234b01d 108my $c = $tradition->collation;
f00cefe8 109foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 110 # Account for rows that used to be "not useful"
111 unless( exists $expected_genealogical{$row->{'id'}} ) {
112 $expected_genealogical{$row->{'id'}} = 1;
113 }
18f48b82 114 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
115 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 116 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 117 # Check that we have the right row with the right groups
118 my $rank = $row->{'id'};
119 foreach my $rdghash ( @{$row->{'readings'}} ) {
120 # Skip 'readings' that aren't really
121 next unless $c->reading( $rdghash->{'readingid'} );
122 # Check the rank
123 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
124 "Got correct reading rank" );
125 # Check the witnesses
126 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
127 my @sgrp = sort @{$rdghash->{'group'}};
128 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
129 }
f00cefe8 130}
a44aaf2a 131is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 132# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 133
134=end testing
135
136=cut
137
d71100ed 138sub run_analysis {
88a6bac5 139 my( $tradition, %opts ) = @_;
f00cefe8 140 my $c = $tradition->collation;
88a6bac5 141
142 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 143 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
144 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 145
146 # Get the stemma
147 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 148
4ce27d42 149 # Figure out which witnesses we are working with - that is, the ones that
150 # appear both in the stemma and in the tradition. All others are 'lacunose'
151 # for our purposes.
88a6bac5 152 my @lacunose = $stemma->hypotheticals;
fae07016 153 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
fae07016 154 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 155
156 # Find and mark 'common' ranks for exclusion, unless they were
157 # explicitly specified.
158 unless( @ranks ) {
159 my %common_rank;
a44aaf2a 160 foreach my $rdg ( $c->common_readings ) {
88a6bac5 161 $common_rank{$rdg->rank} = 1;
162 }
163 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 164 }
7f52eac8 165
88a6bac5 166 # Group the variants to send to the solver
167 my @groups;
f629cb3b 168 my @use_ranks;
a44aaf2a 169 my %lacunae;
94654e27 170 my $moved = {};
88a6bac5 171 foreach my $rank ( @ranks ) {
a44aaf2a 172 my $missing = [ @lacunose ];
94654e27 173 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
174 # Filter out any empty rankgroups
175 # (e.g. from the later rank for a transposition)
176 next unless keys %$rankgroup;
ffa22d6f 177 if( $opts{'exclude_type1'} ) {
178 # Check to see whether this is a "useful" group.
179 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
180 $stemma->graph, $c->ac_label );
181 next unless @$rdgs;
182 }
f629cb3b 183 push( @use_ranks, $rank );
ffa22d6f 184 push( @groups, $rankgroup );
a44aaf2a 185 $lacunae{$rank} = $missing;
d71100ed 186 }
4ce27d42 187 # Run the solver
e59b8faa 188 my $answer = solve_variants( $stemma, @groups );
fae07016 189
88a6bac5 190 # Do further analysis on the answer
a44aaf2a 191 my $conflict_count = 0;
7234b01d 192 my $aclabel = $c->ac_label;
f629cb3b 193 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 194 my $location = $answer->{'variants'}->[$idx];
195 # Add the rank back in
94654e27 196 my $rank = $use_ranks[$idx];
197 $location->{'id'} = $rank;
7234b01d 198 # Note what our lacunae are
f629cb3b 199 my %lmiss;
7234b01d 200 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 201 $location->{'missing'} = [ keys %lmiss ];
202
88a6bac5 203 # Run the extra analysis we need.
638e2a95 204 analyze_location( $tradition, $stemma, $location, \%lmiss );
7234b01d 205
638e2a95 206 my @layerwits;
7234b01d 207 # Do the final post-analysis tidying up of the data.
a44aaf2a 208 foreach my $rdghash ( @{$location->{'readings'}} ) {
209 $conflict_count++
210 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
94654e27 211 # Add the reading text back in, setting display value as needed
a44aaf2a 212 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 213 if( $rdg ) {
214 $rdghash->{'text'} = $rdg->text .
215 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
216 }
f629cb3b 217 # Remove lacunose witnesses from this reading's list now that the
7234b01d 218 # analysis is done
f629cb3b 219 my @realgroup;
7234b01d 220 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 221 $rdghash->{'group'} = \@realgroup;
638e2a95 222 # Note any layered witnesses that appear in this group
223 foreach( @realgroup ) {
224 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
225 push( @layerwits, $1 );
226 }
227 }
a44aaf2a 228 }
638e2a95 229 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 230 }
a44aaf2a 231 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 232
88a6bac5 233 return $answer;
d71100ed 234}
235
7f52eac8 236=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
237
238Groups the variants at the given $rank of the collation, treating any
239relationships in @merge_relationship_types as equivalent. $lacunose should
240be a reference to an array, to which the sigla of lacunose witnesses at this
94654e27 241rank will be appended; $transposed should be a reference to a hash, wherein
242the identities of transposed readings and their relatives will be stored.
7f52eac8 243
ffa22d6f 244Returns a hash $group_readings where $rdg is attested by the witnesses listed
245in $group_readings->{$rdg}.
7f52eac8 246
247=cut
248
249# Return group_readings, groups, lacunose
d1348d38 250sub group_variants {
94654e27 251 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 252 my $c = $tradition->collation;
335a62ef 253 my $aclabel = $c->ac_label;
d120c995 254 my $table = $c->alignment_table;
7f52eac8 255 # Get the alignment table readings
256 my %readings_at_rank;
d120c995 257 my %is_lacunose; # lookup table for witnesses not in stemma
258 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
94654e27 259 my @check_for_gaps;
260 my %moved_wits;
d120c995 261 my $has_transposition;
262 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 263 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 264 my $wit = $tablewit->{'witness'};
ffa22d6f 265 # Exclude the witness if it is "lacunose" which if we got here
266 # means "not in the stemma".
267 next if $is_lacunose{$wit};
94654e27 268 # Note if the witness is actually in a lacuna
7f52eac8 269 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 270 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 271 # Otherwise the witness either has a positive reading...
7f52eac8 272 } elsif( $rdg ) {
94654e27 273 # If the reading has been counted elsewhere as a transposition, ignore it.
274 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 275 # TODO Does this cope with three-way transpositions?
94654e27 276 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
277 next;
278 }
279 # Otherwise, record it...
280 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
281 # ...and grab any transpositions, and their relations.
282 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
283 foreach my $trdg ( @transp ) {
d120c995 284 next if exists $readings_at_rank{$trdg->id};
285 $has_transposition = 1;
286 my @affected_wits = _table_witnesses(
287 $table, $trdg, \%is_lacunose, $aclabel );
288 next unless @affected_wits;
289 map { $moved_wits{$_} = 1 } @affected_wits;
290 $transposed->{$trdg->id} =
291 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
94654e27 292 $readings_at_rank{$trdg->id} = $trdg;
293 }
294 # ...or it is empty, ergo a gap.
7f52eac8 295 } else {
d120c995 296 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
7f52eac8 297 }
298 }
94654e27 299 my @gap_wits;
300 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
301 unless $moved_wits{$_} } @check_for_gaps;
7f52eac8 302 # Group the readings, collapsing groups by relationship if needed
d120c995 303 my $grouped_readings = {};
4ce27d42 304 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 305 # Skip readings that have been collapsed into others.
d120c995 306 next if exists $grouped_readings->{$rdg->id}
307 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 308 # Get the witness list, including from readings collapsed into this one.
d120c995 309 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
310 if( $collapse && @$collapse ) {
7f52eac8 311 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
312 foreach my $other ( $rdg->related_readings( $filter ) ) {
d120c995 313 my @otherwits = _table_witnesses(
314 $table, $other, \%is_lacunose, $aclabel );
fae07016 315 push( @wits, @otherwits );
d120c995 316 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 317 }
318 }
d120c995 319 $grouped_readings->{$rdg->id} = \@wits;
7f52eac8 320 }
d120c995 321 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
7f52eac8 322 # Get rid of our collapsed readings
d120c995 323 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
324 keys %$grouped_readings
7f52eac8 325 if $collapse;
d120c995 326
327 # If something was transposed, check the groups for doubled-up readings
328 if( $has_transposition ) {
329 print STDERR "Group for rank $rank:\n";
330 map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
331 keys %$grouped_readings;
332 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
333 }
7f52eac8 334
4ce27d42 335 # Return the result
d120c995 336 return $grouped_readings;
337}
338
339# Helper function to query the alignment table for all witnesses (a.c. included)
340# that have a given reading at its rank.
341sub _table_witnesses {
342 my( $table, $trdg, $lacunose, $aclabel ) = @_;
343 my $tableidx = $trdg->rank - 1;
344 my @has_reading;
345 foreach my $row ( @{$table->{'alignment'}} ) {
346 my $wit = $row->{'witness'};
347 next if $lacunose->{$wit};
348 my $rdg = $row->{'tokens'}->[$tableidx];
349 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
350 _add_to_witlist( $wit, \@has_reading, $aclabel )
351 if $rdg->{'t'}->id eq $trdg->id;
352 }
353 return @has_reading;
d1348d38 354}
355
335a62ef 356# Helper function to ensure that X and X a.c. never appear in the same list.
357sub _add_to_witlist {
358 my( $wit, $list, $acstr ) = @_;
359 my %inlist;
360 my $idx = 0;
361 map { $inlist{$_} = $idx++ } @$list;
362 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
363 my $acwit = $1;
364 unless( exists $inlist{$acwit} ) {
365 push( @$list, $acwit.$acstr );
366 }
367 } else {
368 if( exists( $inlist{$wit.$acstr} ) ) {
369 # Replace the a.c. version with the main witness
370 my $i = $inlist{$wit.$acstr};
371 $list->[$i] = $wit;
372 } else {
373 push( @$list, $wit );
374 }
375 }
376}
377
d120c995 378sub _check_transposed_consistency {
379 my( $c, $rank, $transposed, $groupings ) = @_;
380 my %seen_wits;
381 my %thisrank;
382 # Note which readings are actually at this rank, and which witnesses
383 # belong to which reading.
384 foreach my $rdg ( keys %$groupings ) {
385 my $rdgobj = $c->reading( $rdg );
386 # Count '(omitted)' as a reading at this rank
387 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
388 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
389 }
390 # Our work is done if we have no witness belonging to more than one
391 # reading.
392 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
393 return unless @doubled;
394 # If we have a symmetric related transposition, drop the non-rank readings.
395 if( @doubled == scalar keys %seen_wits ) {
396 foreach my $rdg ( keys %$groupings ) {
397 if( !$thisrank{$rdg} ) {
398 my $groupstr = wit_stringify( $groupings->{$rdg} );
399 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
400 keys %thisrank;
401 delete $groupings->{$rdg};
402 # If we found a group match, assume there is a symmetry happening.
403 # TODO think more about this
404 print STDERR "*** Deleting symmetric reading $rdg\n";
405 unless( $matched ) {
406 delete $transposed->{$rdg};
407 warn "Found problem in evident symmetry with reading $rdg";
408 }
409 }
410 }
411 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
412 } else {
413 foreach my $dup ( @doubled ) {
414 foreach my $rdg ( @{$seen_wits{$dup}} ) {
415 next if $thisrank{$rdg};
416 next unless exists $groupings->{$rdg};
417 print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
418 delete $groupings->{$rdg};
419 delete $transposed->{$rdg};
420 }
421 }
422 # and put any now-orphaned readings into an 'omitted' reading.
423 foreach my $wit ( keys %seen_wits ) {
424 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
425 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
426 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
427 }
428 }
429 }
430}
431
88a6bac5 432=head2 solve_variants( $graph, @groups )
433
434Sends the set of groups to the external graph solver service and returns
435a cleaned-up answer, adding the rank IDs back where they belong.
436
437The JSON has the form
438 { "graph": [ stemmagraph DOT string without newlines ],
439 "groupings": [ array of arrays of groups, one per rank ] }
440
441The answer has the form
442 { "variants" => [ array of variant location structures ],
443 "variant_count" => total,
444 "conflict_count" => number of conflicts detected,
445 "genealogical_count" => number of solutions found }
446
447=cut
448
449sub solve_variants {
e59b8faa 450 my( $stemma, @groups ) = @_;
335a62ef 451 my $aclabel = $stemma->collation->ac_label;
452
453 # Filter the groups down to distinct groups, and work out what graph
454 # should be used in the calculation of each group. We want to send each
455 # distinct problem to the solver only once.
456 # We need a whole bunch of lookup tables for this.
457 my $index_groupkeys = {}; # Save the order of readings
458 my $group_indices = {}; # Save the indices that have a given grouping
459 my $graph_problems = {}; # Save the groupings for the given graph
460
461 foreach my $idx ( 0..$#groups ) {
462 my $ghash = $groups[$idx];
88a6bac5 463 my @grouping;
335a62ef 464 # Sort the groupings from big to little, and scan for a.c. witnesses
465 # that would need an extended graph.
466 my @acwits; # note which AC witnesses crop up at this rank
467 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
468 keys %$ghash;
469 foreach my $rdg ( @idxkeys ) {
470 my @sg = sort @{$ghash->{$rdg}};
471 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
472 push( @grouping, \@sg );
473 }
474 # Save the reading order
475 $index_groupkeys->{$idx} = \@idxkeys;
476
477 # Now associate the distinct group with this index
478 my $gstr = wit_stringify( \@grouping );
479 push( @{$group_indices->{$gstr}}, $idx );
480
481 # Finally, add the group to the list to be calculated for this graph.
482 map { s/\Q$aclabel\E$// } @acwits;
d120c995 483 my $graph;
484 try {
485 $graph = $stemma->extend_graph( \@acwits );
486 } catch {
487 die "Unable to extend graph with @acwits";
488 }
335a62ef 489 unless( exists $graph_problems->{"$graph"} ) {
490 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
88a6bac5 491 }
335a62ef 492 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
88a6bac5 493 }
335a62ef 494
495 ## For each distinct graph, send its groups to the solver.
88a6bac5 496 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
497 my $ua = LWP::UserAgent->new();
335a62ef 498 ## Witness map is a HACK to get around limitations in node names from IDP
499 my $witness_map = {};
500 ## Variables to store answers as they come back
501 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
88a6bac5 502 my $genealogical = 0;
335a62ef 503 foreach my $graphkey ( keys %$graph_problems ) {
504 my $graph = $graph_problems->{$graphkey}->{'object'};
505 my $groupings = $graph_problems->{$graphkey}->{'groups'};
506 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
507 $groupings, $witness_map ) );
508 # Send it off and get the result
94654e27 509 #print STDERR "Sending request: $json\n";
335a62ef 510 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
511 'Content' => $json );
512 my $answer;
513 my $used_idp;
514 if( $resp->is_success ) {
515 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
516 $used_idp = 1;
517 } else {
518 # Fall back to the old method.
519 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
520 . "; falling back to perl method";
521 $answer = perl_solver( $graph, @$groupings );
522 }
523 ## The answer is the evaluated groupings, plus a boolean for whether
524 ## they were genealogical. Reconstruct our original groups.
525 foreach my $gidx ( 0 .. $#{$groupings} ) {
526 my( $calc_groups, $result ) = @{$answer->[$gidx]};
527 if( $result ) {
528 $genealogical++;
529 # Prune the calculated groups, in case the IDP solver failed to.
530 if( $used_idp ) {
531 my @pruned_groups;
532 foreach my $cg ( @$calc_groups ) {
533 # This is a little wasteful but the path of least
534 # resistance. Send both the stemma, which knows what
535 # its hypotheticals are, and the actual graph used.
536 my @pg = _prune_group( $cg, $stemma, $graph );
537 push( @pruned_groups, \@pg );
538 }
539 $calc_groups = \@pruned_groups;
6d25a3a0 540 }
335a62ef 541 }
542 # Retrieve the key for the original group that went to the solver
543 my $input_group = wit_stringify( $groupings->[$gidx] );
544 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
545 my @readings = @{$index_groupkeys->{$oidx}};
546 my $vstruct = {
547 'genealogical' => $result,
548 'readings' => [],
549 };
550 foreach my $ridx ( 0 .. $#readings ) {
551 push( @{$vstruct->{'readings'}},
552 { 'readingid' => $readings[$ridx],
553 'group' => $calc_groups->[$ridx] } );
554 }
555 $variants->[$oidx] = $vstruct;
6d25a3a0 556 }
557 }
88a6bac5 558 }
559
560 return { 'variants' => $variants,
561 'variant_count' => scalar @$variants,
562 'genealogical_count' => $genealogical };
563}
564
b4cb2d60 565#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
566
567sub _safe_wit_strings {
335a62ef 568 my( $graph, $c, $groupings, $witness_map ) = @_;
569 # Parse the graph we were given into a stemma.
b4cb2d60 570 my $safegraph = Graph->new();
571 # Convert the graph to a safe representation and store the conversion.
335a62ef 572 foreach my $n ( $graph->vertices ) {
b4cb2d60 573 my $sn = _safe_witstr( $n );
335a62ef 574 if( exists $witness_map->{$sn} ) {
575 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
576 if $witness_map->{$sn} ne $n;
577 } else {
578 $witness_map->{$sn} = $n;
579 }
b4cb2d60 580 $safegraph->add_vertex( $sn );
581 $safegraph->set_vertex_attributes( $sn,
335a62ef 582 $graph->get_vertex_attributes( $n ) );
b4cb2d60 583 }
335a62ef 584 foreach my $e ( $graph->edges ) {
b4cb2d60 585 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
586 $safegraph->add_edge( @safe_e );
587 }
588 my $safe_stemma = Text::Tradition::Stemma->new(
335a62ef 589 'collation' => $c, 'graph' => $safegraph );
b4cb2d60 590
591 # Now convert the witness groupings to a safe representation.
592 my $safe_groupings = [];
593 foreach my $grouping ( @$groupings ) {
594 my $safe_grouping = [];
595 foreach my $group ( @$grouping ) {
596 my $safe_group = [];
597 foreach my $n ( @$group ) {
598 my $sn = _safe_witstr( $n );
599 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
600 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
601 $witness_map->{$sn} = $n;
602 push( @$safe_group, $sn );
603 }
604 push( @$safe_grouping, $safe_group );
605 }
606 push( @$safe_groupings, $safe_grouping );
607 }
608
609 # Return it all in the struct we expect. We have stored the reductions
610 # in the $witness_map that we were passed.
335a62ef 611 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
612 'groupings' => $safe_groupings };
b4cb2d60 613}
614
615sub _safe_witstr {
616 my $witstr = shift;
617 $witstr =~ s/\s+/_/g;
618 $witstr =~ s/[^\w\d-]//g;
619 return $witstr;
620}
621
622sub _desanitize_names {
623 my( $jsonstruct, $witness_map ) = @_;
624 my $result = [];
625 foreach my $grouping ( @$jsonstruct ) {
626 my $real_grouping = [];
627 foreach my $element ( @$grouping ) {
628 if( ref( $element ) eq 'ARRAY' ) {
629 # it's the groupset.
630 my $real_groupset = [];
631 foreach my $group ( @$element ) {
632 my $real_group = [];
633 foreach my $n ( @$group ) {
634 my $rn = $witness_map->{$n};
635 push( @$real_group, $rn );
636 }
637 push( @$real_groupset, $real_group );
638 }
639 push( @$real_grouping, $real_groupset );
640 } else {
641 # It is the boolean, not actually a group.
642 push( @$real_grouping, $element );
643 }
644 }
645 push( @$result, $real_grouping );
646 }
647 return $result;
648}
649
650### END HACKERY ###
651
fae07016 652=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 653
fae07016 654Given the tradition, its stemma graph, and the solution from the graph solver,
655work out the rest of the information we want. For each reading we need missing,
656conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 657
658=cut
732152b1 659
fae07016 660sub analyze_location {
638e2a95 661 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
94654e27 662 my $c = $tradition->collation;
fae07016 663
664 # Make a hash of all known node memberships, and make the subgraphs.
665 my $contig = {};
666 my $reading_roots = {};
667 my $subgraph = {};
638e2a95 668 my $acstr = $c->ac_label;
669 my @acwits;
670 $DB::single = 1 if $variant_row->{id} == 87;
94654e27 671 # Note which witnesses positively belong to which group
fae07016 672 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
673 my $rid = $rdghash->{'readingid'};
638e2a95 674 foreach my $wit ( @{$rdghash->{'group'}} ) {
675 $contig->{$wit} = $rid;
676 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
677 push( @acwits, $1 );
678 }
679 }
94654e27 680 }
681
638e2a95 682 # Get the actual graph we should work with
d120c995 683 my $graph;
684 try {
685 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
686 } catch {
687 die "Could not extend graph with a.c. witnesses @acwits";
688 }
638e2a95 689
94654e27 690 # Now, armed with that knowledge, make a subgraph for each reading
691 # and note the root(s) of each subgraph.
692 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
693 my $rid = $rdghash->{'readingid'};
694 my %rdgwits;
fae07016 695 # Make the subgraph.
696 my $part = $graph->copy;
94654e27 697 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
698 keys %$contig;
699 $part->delete_vertices( @todelete );
700 _prune_subtree( $part, $lacunose );
fae07016 701 $subgraph->{$rid} = $part;
94654e27 702 # Record the remaining lacunose nodes as part of this group, if
703 # we are dealing with a non-genealogical reading.
704 unless( $variant_row->{'genealogical'} ) {
705 map { $contig->{$_} = $rid } $part->vertices;
706 }
fae07016 707 # Get the reading roots.
708 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 709 }
710
fae07016 711 # Now that we have all the node group memberships, calculate followed/
bebec0e9 712 # non-followed/unknown values for each reading. Also figure out the
713 # reading's evident parent(s).
714 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 715 my $rid = $rdghash->{'readingid'};
716 # Get the subgraph
717 my $part = $subgraph->{$rid};
718
719 # Start figuring things out.
94654e27 720 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
721 $rdghash->{'independent_occurrence'} = \@roots;
bebec0e9 722 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
723 # Find the parent readings, if any, of this reading.
94654e27 724 my $rdgparents = {};
bebec0e9 725 foreach my $wit ( @roots ) {
f00cefe8 726 # Look in the main stemma to find this witness's extant or known-reading
727 # immediate ancestor(s), and look up the reading that each ancestor olds.
728 my @check = $graph->predecessors( $wit );
729 while( @check ) {
730 my @next;
731 foreach my $wparent( @check ) {
fae07016 732 my $preading = $contig->{$wparent};
733 if( $preading ) {
94654e27 734 $rdgparents->{$preading} = 1;
f00cefe8 735 } else {
736 push( @next, $graph->predecessors( $wparent ) );
737 }
738 }
739 @check = @next;
740 }
bebec0e9 741 }
94654e27 742 foreach my $p ( keys %$rdgparents ) {
743 # Resolve the relationship of the parent to the reading, and
744 # save it in our hash.
745 my $pobj = $c->reading( $p );
746 my $relation;
747 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
748 if( $pobj ) {
749 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
750 if( $rel ) {
751 $relation = { type => $rel->type };
752 if( $rel->has_annotation ) {
753 $relation->{'annotation'} = $rel->annotation;
754 }
755 }
756 }
757 $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
758 }
759
760 $rdghash->{'reading_parents'} = $rdgparents;
bebec0e9 761
762 # Find the number of times this reading was altered, and the number of
763 # times we're not sure.
764 my( %nofollow, %unknownfollow );
765 foreach my $wit ( $part->vertices ) {
766 foreach my $wchild ( $graph->successors( $wit ) ) {
767 next if $part->has_vertex( $wchild );
fae07016 768 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 769 # It definitely changed here.
770 $nofollow{$wchild} = 1;
771 } elsif( !($contig->{$wchild}) ) {
772 # The child is a hypothetical node not definitely in
773 # any group. Answer is unknown.
774 $unknownfollow{$wchild} = 1;
775 } # else it's a non-root node in a known group, and therefore
776 # is presumed to have its reading from its group, not this link.
777 }
778 }
779 $rdghash->{'not_followed'} = keys %nofollow;
780 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 781
782 # Now say whether this reading represents a conflict.
783 unless( $variant_row->{'genealogical'} ) {
784 $rdghash->{'conflict'} = @roots != 1;
785 }
c4a4fb1b 786 }
d71100ed 787}
788
fae07016 789
790=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
791
792** NOTE ** This method should hopefully not be called - it is not guaranteed
793to be correct. Serves as a backup for the real solver.
794
795Runs an analysis of the given tradition, at the location given in $rank,
796against the graph of the stemma specified in $stemma_id. The argument
797@merge_relationship_types is an optional list of relationship types for
798which readings so related should be treated as equivalent.
799
800Returns a nested array data structure as follows:
801
802 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
803
804where the group list is the array of arrays passed in for each element of @groups,
805possibly with the addition of hypothetical readings.
806
807
808=cut
809
810sub perl_solver {
335a62ef 811 my( $graph, @groups ) = @_;
e59b8faa 812 my @answer;
813 foreach my $g ( @groups ) {
814 push( @answer, _solve_variant_location( $graph, $g ) );
815 }
816 return \@answer;
fae07016 817}
818
e59b8faa 819sub _solve_variant_location {
820 my( $graph, $groups ) = @_;
fae07016 821 # Now do the work.
e59b8faa 822 my $contig = {};
823 my $subgraph = {};
824 my $is_conflicted;
825 my $conflict = {};
826
827 # Mark each ms as in its own group, first.
828 foreach my $g ( @$groups ) {
829 my $gst = wit_stringify( $g );
830 map { $contig->{$_} = $gst } @$g;
831 }
832
833 # Now for each unmarked node in the graph, initialize an array
834 # for possible group memberships. We will use this later to
835 # resolve potential conflicts.
836 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
837 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
838 my $gst = wit_stringify( $g ); # This is the group name
839 # Copy the graph, and delete all non-members from the new graph.
840 my $part = $graph->copy;
841 my @group_roots;
842 $part->delete_vertices(
843 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
844
845 # Now look to see if our group is connected.
846 if( @$g > 1 ) {
847 # We have to take directionality into account.
848 # How many root nodes do we have?
849 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
850 $part->predecessorless_vertices;
851 # Assuming that @$g > 1, find the first root node that has at
852 # least one successor belonging to our group. If this reading
853 # is genealogical, there should be only one, but we will check
854 # that implicitly later.
855 foreach my $root ( @roots ) {
856 # Prune the tree to get rid of extraneous hypotheticals.
94654e27 857 $root = _prune_subtree_old( $part, $root, $contig );
e59b8faa 858 next unless $root;
859 # Save this root for our group.
860 push( @group_roots, $root );
861 # Get all the successor nodes of our root.
862 }
863 } else {
864 # Dispense with the trivial case of one reading.
865 my $wit = $g->[0];
866 @group_roots = ( $wit );
867 foreach my $v ( $part->vertices ) {
868 $part->delete_vertex( $v ) unless $v eq $wit;
869 }
870 }
871
872 if( @group_roots > 1 ) {
873 $conflict->{$gst} = 1;
874 $is_conflicted = 1;
875 }
876 # Paint the 'hypotheticals' with our group.
877 foreach my $wit ( $part->vertices ) {
878 if( ref( $contig->{$wit} ) ) {
879 push( @{$contig->{$wit}}, $gst );
880 } elsif( $contig->{$wit} ne $gst ) {
881 warn "How did we get here?";
882 }
883 }
884
885
886 # Save the relevant subgraph.
887 $subgraph->{$gst} = $part;
888 }
889
890 # For each of our hypothetical readings, flatten its 'contig' array if
891 # the array contains zero or one group. If we have any unflattened arrays,
892 # we may need to run the resolution process. If the reading is already known
893 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
894 # it.
895 my @resolve;
896 foreach my $wit ( keys %$contig ) {
897 next unless ref( $contig->{$wit} );
898 if( @{$contig->{$wit}} > 1 ) {
899 if( $is_conflicted ) {
900 $contig->{$wit} = ''; # We aren't going to decide.
901 } else {
902 push( @resolve, $wit );
903 }
904 } else {
905 my $gst = pop @{$contig->{$wit}};
906 $contig->{$wit} = $gst || '';
907 }
908 }
909
910 if( @resolve ) {
911 my $still_contig = {};
912 foreach my $h ( @resolve ) {
913 # For each of the hypothetical readings with more than one possibility,
914 # try deleting it from each of its member subgraphs in turn, and see
915 # if that breaks the contiguous grouping.
916 # TODO This can still break in a corner case where group A can use
917 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
918 # Revisit this if necessary; it could get brute-force nasty.
919 foreach my $gst ( @{$contig->{$h}} ) {
920 my $gpart = $subgraph->{$gst}->copy();
921 # If we have come this far, there is only one root and everything
922 # is reachable from it.
923 my( $root ) = $gpart->predecessorless_vertices;
924 my $reachable = {};
925 map { $reachable->{$_} = 1 } $gpart->vertices;
926
927 # Try deleting the hypothetical node.
928 $gpart->delete_vertex( $h );
929 if( $h eq $root ) {
930 # See if we still have a single root.
931 my @roots = $gpart->predecessorless_vertices;
932 warn "This shouldn't have happened" unless @roots;
933 if( @roots > 1 ) {
934 # $h is needed by this group.
935 if( exists( $still_contig->{$h} ) ) {
936 # Conflict!
937 $conflict->{$gst} = 1;
938 $still_contig->{$h} = '';
939 } else {
940 $still_contig->{$h} = $gst;
941 }
942 }
943 } else {
944 # $h is somewhere in the middle. See if everything
945 # else can still be reached from the root.
946 my %still_reachable = ( $root => 1 );
947 map { $still_reachable{$_} = 1 }
948 $gpart->all_successors( $root );
949 foreach my $v ( keys %$reachable ) {
950 next if $v eq $h;
951 if( !$still_reachable{$v}
952 && ( $contig->{$v} eq $gst
953 || ( exists $still_contig->{$v}
954 && $still_contig->{$v} eq $gst ) ) ) {
955 # We need $h.
956 if( exists $still_contig->{$h} ) {
957 # Conflict!
958 $conflict->{$gst} = 1;
959 $still_contig->{$h} = '';
960 } else {
961 $still_contig->{$h} = $gst;
962 }
963 last;
964 } # else we don't need $h in this group.
965 } # end foreach $v
966 } # endif $h eq $root
967 } # end foreach $gst
968 } # end foreach $h
969
970 # Now we have some hypothetical vertices in $still_contig that are the
971 # "real" group memberships. Replace these in $contig.
972 foreach my $v ( keys %$contig ) {
973 next unless ref $contig->{$v};
974 $contig->{$v} = $still_contig->{$v};
975 }
976 } # end if @resolve
977
978 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
979 my $variant_row = [ [], $is_genealogical ];
980 # Fill in the groupings from $contig.
981 foreach my $g ( @$groups ) {
982 my $gst = wit_stringify( $g );
983 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
984 push( @{$variant_row->[0]}, \@realgroup );
985 }
986 return $variant_row;
987}
fae07016 988
6d25a3a0 989sub _prune_group {
335a62ef 990 my( $group, $stemma, $graph ) = @_;
94654e27 991 my $lacunose = {};
992 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
993 map { $lacunose->{$_} = 0 } @$group;
6d25a3a0 994 # Make our subgraph
335a62ef 995 my $subgraph = $graph->copy;
94654e27 996 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
6d25a3a0 997 $subgraph->vertices;
998 # ...and find the root.
6d25a3a0 999 # Now prune and return the remaining vertices.
94654e27 1000 _prune_subtree( $subgraph, $lacunose );
6d25a3a0 1001 return $subgraph->vertices;
1002}
1003
7f52eac8 1004sub _prune_subtree {
94654e27 1005 my( $tree, $lacunose ) = @_;
1006
1007 # Delete lacunose witnesses that have no successors
1008 my @orphan_hypotheticals;
1009 my $ctr = 0;
1010 do {
1011 die "Infinite loop on leaves" if $ctr > 100;
1012 @orphan_hypotheticals = grep { $lacunose->{$_} }
1013 $tree->successorless_vertices;
1014 $tree->delete_vertices( @orphan_hypotheticals );
1015 $ctr++;
1016 } while( @orphan_hypotheticals );
1017
1018 # Delete lacunose roots that have a single successor
1019 my @redundant_root;
1020 $ctr = 0;
1021 do {
1022 die "Infinite loop on roots" if $ctr > 100;
1023 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1024 $tree->predecessorless_vertices;
1025 $tree->delete_vertices( @redundant_root );
1026 $ctr++;
1027 } while( @redundant_root );
1028}
1029
1030sub _prune_subtree_old {
231d71fc 1031 my( $tree, $root, $contighash ) = @_;
1032 # First, delete hypothetical leaves / orphans until there are none left.
1033 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1034 $tree->successorless_vertices;
1035 while( @orphan_hypotheticals ) {
1036 $tree->delete_vertices( @orphan_hypotheticals );
1037 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1038 $tree->successorless_vertices;
1039 }
1040 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 1041 # root to the first child that has no other predecessors.
231d71fc 1042 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1043 my @nextroot = $tree->successors( $root );
1044 $tree->delete_vertex( $root );
bebec0e9 1045 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 1046 }
1047 # The tree has been modified in place, but we need to know the new root.
bebec0e9 1048 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 1049 return $root;
1050}
d71100ed 1051# Add the variant, subject to a.c. representation logic.
1052# This assumes that we will see the 'main' version before the a.c. version.
1053sub add_variant_wit {
1054 my( $arr, $wit, $acstr ) = @_;
1055 my $skip;
1056 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1057 my $real = $1;
1058 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1059 }
1060 push( @$arr, $wit ) unless $skip;
1061}
1062
5be0cdeb 1063sub _useful_variant {
1064 my( $group_readings, $graph, $acstr ) = @_;
1065
1066 # TODO Decide what to do with AC witnesses
1067
1068 # Sort by group size and return
1069 my $is_useful = 0;
1070 my( @readings, @groups ); # The sorted groups for our answer.
1071 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1072 keys %$group_readings ) {
1073 push( @readings, $rdg );
1074 push( @groups, $group_readings->{$rdg} );
1075 if( @{$group_readings->{$rdg}} > 1 ) {
1076 $is_useful++;
1077 } else {
1078 my( $wit ) = @{$group_readings->{$rdg}};
1079 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1080 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1081 }
1082 }
1083 if( $is_useful > 1 ) {
1084 return( \@readings, \@groups );
1085 } else {
1086 return( [], [] );
1087 }
1088}
1089
7f52eac8 1090=head2 wit_stringify( $groups )
1091
1092Takes an array of witness groupings and produces a string like
1093['A','B'] / ['C','D','E'] / ['F']
d71100ed 1094
7f52eac8 1095=cut
d71100ed 1096
1097sub wit_stringify {
1098 my $groups = shift;
1099 my @gst;
1100 # If we were passed an array of witnesses instead of an array of
1101 # groupings, then "group" the witnesses first.
1102 unless( ref( $groups->[0] ) ) {
1103 my $mkgrp = [ $groups ];
1104 $groups = $mkgrp;
1105 }
1106 foreach my $g ( @$groups ) {
1107 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1108 }
1109 return join( ' / ', @gst );
1110}
7f52eac8 1111
bebec0e9 1112sub _symmdiff {
1113 my( $lista, $listb ) = @_;
7f52eac8 1114 my %union;
1115 my %scalars;
1116 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1117 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 1118 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 1119 return map { $scalars{$_} } @set;
1120}
1121
11221;
1123
1124=head1 LICENSE
1125
1126This package is free software and is provided "as is" without express
1127or implied warranty. You can redistribute it and/or modify it under
1128the same terms as Perl itself.
1129
1130=head1 AUTHOR
1131
1132Tara L Andrews E<lt>aurum@cpan.orgE<gt>