switch to new version of IDP program
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
e4386ba9 6use Benchmark;
88a6bac5 7use Encode qw/ encode_utf8 /;
d1348d38 8use Exporter 'import';
b4cb2d60 9use Graph;
88a6bac5 10use JSON qw/ encode_json decode_json /;
11use LWP::UserAgent;
d71100ed 12use Text::Tradition;
13use Text::Tradition::Stemma;
d120c995 14use TryCatch;
d71100ed 15
d1348d38 16use vars qw/ @EXPORT_OK /;
a2cf85dd 17@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 18
7f52eac8 19=head1 NAME
20
21Text::Tradition::Analysis - functions for stemma analysis of a tradition
22
23=head1 SYNOPSIS
24
25 use Text::Tradition;
26 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
27 my $t = Text::Tradition->new(
28 'name' => 'this is a text',
29 'input' => 'TEI',
30 'file' => '/path/to/tei_parallel_seg_file.xml' );
31 $t->add_stemma( 'dotfile' => $stemmafile );
32
33 my $variant_data = run_analysis( $tradition );
34 # Recalculate rank $n treating all orthographic variants as equivalent
35 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
36
37=head1 DESCRIPTION
38
39Text::Tradition is a library for representation and analysis of collated
40texts, particularly medieval ones. The Collation is the central feature of
41a Tradition, where the text, its sequence of readings, and its relationships
42between readings are actually kept.
43
44=head1 SUBROUTINES
45
88a6bac5 46=head2 run_analysis( $tradition, %opts )
7f52eac8 47
88a6bac5 48Runs the analysis described in analyze_variant_location on every location in the
49collation of the given tradition, with the given options. These include:
7f52eac8 50
88a6bac5 51=over 4
52
53=item * stemma_id - Specify which of the tradition's stemmata to use. Default
54is 0 (i.e. the first).
55
56=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
57
58=item * merge_types - Specify a list of relationship types, where related readings
59should be treated as identical for the purposes of analysis.
60
ffa22d6f 61=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
62
88a6bac5 63=back
7f52eac8 64
65=begin testing
66
67use Text::Tradition;
68use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
69
70my $datafile = 't/data/florilegium_tei_ps.xml';
71my $tradition = Text::Tradition->new( 'input' => 'TEI',
72 'name' => 'test0',
73 'file' => $datafile );
74my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
75is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
76
f00cefe8 77my %expected_genealogical = (
a44aaf2a 78 1 => 0,
f00cefe8 79 2 => 1,
a44aaf2a 80 3 => 0,
81 5 => 0,
82 7 => 0,
83 8 => 0,
84 10 => 0,
f00cefe8 85 13 => 1,
a44aaf2a 86 33 => 0,
87 34 => 0,
88 37 => 0,
89 60 => 0,
f00cefe8 90 81 => 1,
a44aaf2a 91 84 => 0,
92 87 => 0,
93 101 => 0,
94 102 => 0,
f00cefe8 95 122 => 1,
a44aaf2a 96 157 => 0,
f00cefe8 97 166 => 1,
98 169 => 1,
a44aaf2a 99 200 => 0,
f00cefe8 100 216 => 1,
101 217 => 1,
102 219 => 1,
103 241 => 1,
104 242 => 1,
105 243 => 1,
106);
107
7f52eac8 108my $data = run_analysis( $tradition );
7234b01d 109my $c = $tradition->collation;
f00cefe8 110foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 111 # Account for rows that used to be "not useful"
112 unless( exists $expected_genealogical{$row->{'id'}} ) {
113 $expected_genealogical{$row->{'id'}} = 1;
114 }
18f48b82 115 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
116 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 117 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 118 # Check that we have the right row with the right groups
119 my $rank = $row->{'id'};
120 foreach my $rdghash ( @{$row->{'readings'}} ) {
121 # Skip 'readings' that aren't really
122 next unless $c->reading( $rdghash->{'readingid'} );
123 # Check the rank
124 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
125 "Got correct reading rank" );
126 # Check the witnesses
127 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
128 my @sgrp = sort @{$rdghash->{'group'}};
129 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
130 }
f00cefe8 131}
a44aaf2a 132is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 133# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 134
135=end testing
136
137=cut
138
d71100ed 139sub run_analysis {
88a6bac5 140 my( $tradition, %opts ) = @_;
f00cefe8 141 my $c = $tradition->collation;
88a6bac5 142
143 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 144 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
145 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 146
147 # Get the stemma
148 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 149
4ce27d42 150 # Figure out which witnesses we are working with - that is, the ones that
151 # appear both in the stemma and in the tradition. All others are 'lacunose'
152 # for our purposes.
88a6bac5 153 my @lacunose = $stemma->hypotheticals;
fae07016 154 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
fae07016 155 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 156
157 # Find and mark 'common' ranks for exclusion, unless they were
158 # explicitly specified.
159 unless( @ranks ) {
160 my %common_rank;
a44aaf2a 161 foreach my $rdg ( $c->common_readings ) {
88a6bac5 162 $common_rank{$rdg->rank} = 1;
163 }
164 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 165 }
7f52eac8 166
88a6bac5 167 # Group the variants to send to the solver
168 my @groups;
f629cb3b 169 my @use_ranks;
a44aaf2a 170 my %lacunae;
94654e27 171 my $moved = {};
88a6bac5 172 foreach my $rank ( @ranks ) {
a44aaf2a 173 my $missing = [ @lacunose ];
94654e27 174 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
175 # Filter out any empty rankgroups
176 # (e.g. from the later rank for a transposition)
177 next unless keys %$rankgroup;
ffa22d6f 178 if( $opts{'exclude_type1'} ) {
179 # Check to see whether this is a "useful" group.
180 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
181 $stemma->graph, $c->ac_label );
182 next unless @$rdgs;
183 }
f629cb3b 184 push( @use_ranks, $rank );
ffa22d6f 185 push( @groups, $rankgroup );
a44aaf2a 186 $lacunae{$rank} = $missing;
d71100ed 187 }
4ce27d42 188 # Run the solver
e59b8faa 189 my $answer = solve_variants( $stemma, @groups );
fae07016 190
88a6bac5 191 # Do further analysis on the answer
a44aaf2a 192 my $conflict_count = 0;
7234b01d 193 my $aclabel = $c->ac_label;
f629cb3b 194 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 195 my $location = $answer->{'variants'}->[$idx];
196 # Add the rank back in
94654e27 197 my $rank = $use_ranks[$idx];
198 $location->{'id'} = $rank;
7234b01d 199 # Note what our lacunae are
f629cb3b 200 my %lmiss;
7234b01d 201 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 202 $location->{'missing'} = [ keys %lmiss ];
203
88a6bac5 204 # Run the extra analysis we need.
428bcf0b 205 ## TODO We run through all the variants in this call, so
206 ## why not add the reading data there instead of here below?
638e2a95 207 analyze_location( $tradition, $stemma, $location, \%lmiss );
7234b01d 208
638e2a95 209 my @layerwits;
7234b01d 210 # Do the final post-analysis tidying up of the data.
a44aaf2a 211 foreach my $rdghash ( @{$location->{'readings'}} ) {
212 $conflict_count++
213 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
94654e27 214 # Add the reading text back in, setting display value as needed
a44aaf2a 215 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 216 if( $rdg ) {
217 $rdghash->{'text'} = $rdg->text .
218 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 219 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
220 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 221 }
f629cb3b 222 # Remove lacunose witnesses from this reading's list now that the
7234b01d 223 # analysis is done
f629cb3b 224 my @realgroup;
7234b01d 225 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 226 $rdghash->{'group'} = \@realgroup;
638e2a95 227 # Note any layered witnesses that appear in this group
228 foreach( @realgroup ) {
229 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
230 push( @layerwits, $1 );
231 }
232 }
a44aaf2a 233 }
638e2a95 234 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 235 }
a44aaf2a 236 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 237
88a6bac5 238 return $answer;
d71100ed 239}
240
7f52eac8 241=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
242
243Groups the variants at the given $rank of the collation, treating any
244relationships in @merge_relationship_types as equivalent. $lacunose should
245be a reference to an array, to which the sigla of lacunose witnesses at this
94654e27 246rank will be appended; $transposed should be a reference to a hash, wherein
247the identities of transposed readings and their relatives will be stored.
7f52eac8 248
ffa22d6f 249Returns a hash $group_readings where $rdg is attested by the witnesses listed
250in $group_readings->{$rdg}.
7f52eac8 251
252=cut
253
254# Return group_readings, groups, lacunose
d1348d38 255sub group_variants {
94654e27 256 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 257 my $c = $tradition->collation;
335a62ef 258 my $aclabel = $c->ac_label;
d120c995 259 my $table = $c->alignment_table;
7f52eac8 260 # Get the alignment table readings
261 my %readings_at_rank;
d120c995 262 my %is_lacunose; # lookup table for witnesses not in stemma
263 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
94654e27 264 my @check_for_gaps;
265 my %moved_wits;
d120c995 266 my $has_transposition;
267 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 268 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 269 my $wit = $tablewit->{'witness'};
ffa22d6f 270 # Exclude the witness if it is "lacunose" which if we got here
271 # means "not in the stemma".
272 next if $is_lacunose{$wit};
94654e27 273 # Note if the witness is actually in a lacuna
7f52eac8 274 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 275 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 276 # Otherwise the witness either has a positive reading...
7f52eac8 277 } elsif( $rdg ) {
94654e27 278 # If the reading has been counted elsewhere as a transposition, ignore it.
279 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 280 # TODO Does this cope with three-way transpositions?
94654e27 281 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
282 next;
283 }
284 # Otherwise, record it...
285 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
286 # ...and grab any transpositions, and their relations.
287 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
288 foreach my $trdg ( @transp ) {
d120c995 289 next if exists $readings_at_rank{$trdg->id};
290 $has_transposition = 1;
291 my @affected_wits = _table_witnesses(
292 $table, $trdg, \%is_lacunose, $aclabel );
293 next unless @affected_wits;
294 map { $moved_wits{$_} = 1 } @affected_wits;
295 $transposed->{$trdg->id} =
296 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
94654e27 297 $readings_at_rank{$trdg->id} = $trdg;
298 }
299 # ...or it is empty, ergo a gap.
7f52eac8 300 } else {
d120c995 301 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
7f52eac8 302 }
303 }
94654e27 304 my @gap_wits;
305 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
306 unless $moved_wits{$_} } @check_for_gaps;
7f52eac8 307 # Group the readings, collapsing groups by relationship if needed
d120c995 308 my $grouped_readings = {};
4ce27d42 309 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 310 # Skip readings that have been collapsed into others.
d120c995 311 next if exists $grouped_readings->{$rdg->id}
312 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 313 # Get the witness list, including from readings collapsed into this one.
d120c995 314 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
315 if( $collapse && @$collapse ) {
7f52eac8 316 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
317 foreach my $other ( $rdg->related_readings( $filter ) ) {
d120c995 318 my @otherwits = _table_witnesses(
319 $table, $other, \%is_lacunose, $aclabel );
fae07016 320 push( @wits, @otherwits );
d120c995 321 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 322 }
323 }
d120c995 324 $grouped_readings->{$rdg->id} = \@wits;
7f52eac8 325 }
d120c995 326 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
7f52eac8 327 # Get rid of our collapsed readings
d120c995 328 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
329 keys %$grouped_readings
7f52eac8 330 if $collapse;
d120c995 331
332 # If something was transposed, check the groups for doubled-up readings
333 if( $has_transposition ) {
62a39b8f 334 # print STDERR "Group for rank $rank:\n";
335 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
336 # keys %$grouped_readings;
d120c995 337 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
338 }
7f52eac8 339
4ce27d42 340 # Return the result
d120c995 341 return $grouped_readings;
342}
343
344# Helper function to query the alignment table for all witnesses (a.c. included)
345# that have a given reading at its rank.
346sub _table_witnesses {
347 my( $table, $trdg, $lacunose, $aclabel ) = @_;
348 my $tableidx = $trdg->rank - 1;
349 my @has_reading;
350 foreach my $row ( @{$table->{'alignment'}} ) {
351 my $wit = $row->{'witness'};
352 next if $lacunose->{$wit};
353 my $rdg = $row->{'tokens'}->[$tableidx];
354 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
355 _add_to_witlist( $wit, \@has_reading, $aclabel )
356 if $rdg->{'t'}->id eq $trdg->id;
357 }
358 return @has_reading;
d1348d38 359}
360
335a62ef 361# Helper function to ensure that X and X a.c. never appear in the same list.
362sub _add_to_witlist {
363 my( $wit, $list, $acstr ) = @_;
364 my %inlist;
365 my $idx = 0;
366 map { $inlist{$_} = $idx++ } @$list;
367 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
368 my $acwit = $1;
369 unless( exists $inlist{$acwit} ) {
370 push( @$list, $acwit.$acstr );
371 }
372 } else {
373 if( exists( $inlist{$wit.$acstr} ) ) {
374 # Replace the a.c. version with the main witness
375 my $i = $inlist{$wit.$acstr};
376 $list->[$i] = $wit;
377 } else {
378 push( @$list, $wit );
379 }
380 }
381}
382
d120c995 383sub _check_transposed_consistency {
384 my( $c, $rank, $transposed, $groupings ) = @_;
385 my %seen_wits;
386 my %thisrank;
387 # Note which readings are actually at this rank, and which witnesses
388 # belong to which reading.
389 foreach my $rdg ( keys %$groupings ) {
390 my $rdgobj = $c->reading( $rdg );
391 # Count '(omitted)' as a reading at this rank
392 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
393 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
394 }
395 # Our work is done if we have no witness belonging to more than one
396 # reading.
397 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
398 return unless @doubled;
399 # If we have a symmetric related transposition, drop the non-rank readings.
400 if( @doubled == scalar keys %seen_wits ) {
401 foreach my $rdg ( keys %$groupings ) {
402 if( !$thisrank{$rdg} ) {
403 my $groupstr = wit_stringify( $groupings->{$rdg} );
404 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
405 keys %thisrank;
406 delete $groupings->{$rdg};
407 # If we found a group match, assume there is a symmetry happening.
408 # TODO think more about this
62a39b8f 409 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 410 unless( $matched ) {
411 delete $transposed->{$rdg};
412 warn "Found problem in evident symmetry with reading $rdg";
413 }
414 }
415 }
416 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
417 } else {
418 foreach my $dup ( @doubled ) {
419 foreach my $rdg ( @{$seen_wits{$dup}} ) {
420 next if $thisrank{$rdg};
421 next unless exists $groupings->{$rdg};
62a39b8f 422 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 423 delete $groupings->{$rdg};
424 delete $transposed->{$rdg};
425 }
426 }
427 # and put any now-orphaned readings into an 'omitted' reading.
428 foreach my $wit ( keys %seen_wits ) {
429 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
430 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
431 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
432 }
433 }
434 }
435}
436
88a6bac5 437=head2 solve_variants( $graph, @groups )
438
439Sends the set of groups to the external graph solver service and returns
440a cleaned-up answer, adding the rank IDs back where they belong.
441
442The JSON has the form
443 { "graph": [ stemmagraph DOT string without newlines ],
444 "groupings": [ array of arrays of groups, one per rank ] }
445
446The answer has the form
447 { "variants" => [ array of variant location structures ],
448 "variant_count" => total,
449 "conflict_count" => number of conflicts detected,
450 "genealogical_count" => number of solutions found }
451
452=cut
453
454sub solve_variants {
e59b8faa 455 my( $stemma, @groups ) = @_;
335a62ef 456 my $aclabel = $stemma->collation->ac_label;
457
458 # Filter the groups down to distinct groups, and work out what graph
459 # should be used in the calculation of each group. We want to send each
460 # distinct problem to the solver only once.
461 # We need a whole bunch of lookup tables for this.
462 my $index_groupkeys = {}; # Save the order of readings
463 my $group_indices = {}; # Save the indices that have a given grouping
464 my $graph_problems = {}; # Save the groupings for the given graph
465
466 foreach my $idx ( 0..$#groups ) {
467 my $ghash = $groups[$idx];
88a6bac5 468 my @grouping;
335a62ef 469 # Sort the groupings from big to little, and scan for a.c. witnesses
470 # that would need an extended graph.
471 my @acwits; # note which AC witnesses crop up at this rank
472 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
473 keys %$ghash;
474 foreach my $rdg ( @idxkeys ) {
475 my @sg = sort @{$ghash->{$rdg}};
476 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
477 push( @grouping, \@sg );
478 }
479 # Save the reading order
480 $index_groupkeys->{$idx} = \@idxkeys;
481
482 # Now associate the distinct group with this index
483 my $gstr = wit_stringify( \@grouping );
484 push( @{$group_indices->{$gstr}}, $idx );
485
486 # Finally, add the group to the list to be calculated for this graph.
487 map { s/\Q$aclabel\E$// } @acwits;
d120c995 488 my $graph;
489 try {
490 $graph = $stemma->extend_graph( \@acwits );
491 } catch {
492 die "Unable to extend graph with @acwits";
493 }
335a62ef 494 unless( exists $graph_problems->{"$graph"} ) {
495 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
88a6bac5 496 }
335a62ef 497 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
88a6bac5 498 }
335a62ef 499
500 ## For each distinct graph, send its groups to the solver.
88a6bac5 501 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
502 my $ua = LWP::UserAgent->new();
335a62ef 503 ## Witness map is a HACK to get around limitations in node names from IDP
504 my $witness_map = {};
505 ## Variables to store answers as they come back
506 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
88a6bac5 507 my $genealogical = 0;
335a62ef 508 foreach my $graphkey ( keys %$graph_problems ) {
509 my $graph = $graph_problems->{$graphkey}->{'object'};
510 my $groupings = $graph_problems->{$graphkey}->{'groups'};
c30a4fdb 511 my $req = _safe_wit_strings( $graph, $stemma->collation,
512 $groupings, $witness_map );
513 $req->{'command'} = 'findGroupings';
514 my $json = encode_json( $req );
335a62ef 515 # Send it off and get the result
94654e27 516 #print STDERR "Sending request: $json\n";
335a62ef 517 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
518 'Content' => $json );
519 my $answer;
520 my $used_idp;
521 if( $resp->is_success ) {
522 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
523 $used_idp = 1;
524 } else {
525 # Fall back to the old method.
526 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
527 . "; falling back to perl method";
528 $answer = perl_solver( $graph, @$groupings );
529 }
530 ## The answer is the evaluated groupings, plus a boolean for whether
531 ## they were genealogical. Reconstruct our original groups.
532 foreach my $gidx ( 0 .. $#{$groupings} ) {
533 my( $calc_groups, $result ) = @{$answer->[$gidx]};
534 if( $result ) {
535 $genealogical++;
536 # Prune the calculated groups, in case the IDP solver failed to.
537 if( $used_idp ) {
538 my @pruned_groups;
539 foreach my $cg ( @$calc_groups ) {
540 # This is a little wasteful but the path of least
541 # resistance. Send both the stemma, which knows what
542 # its hypotheticals are, and the actual graph used.
543 my @pg = _prune_group( $cg, $stemma, $graph );
544 push( @pruned_groups, \@pg );
545 }
546 $calc_groups = \@pruned_groups;
6d25a3a0 547 }
335a62ef 548 }
549 # Retrieve the key for the original group that went to the solver
550 my $input_group = wit_stringify( $groupings->[$gidx] );
551 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
552 my @readings = @{$index_groupkeys->{$oidx}};
553 my $vstruct = {
554 'genealogical' => $result,
555 'readings' => [],
556 };
557 foreach my $ridx ( 0 .. $#readings ) {
558 push( @{$vstruct->{'readings'}},
559 { 'readingid' => $readings[$ridx],
560 'group' => $calc_groups->[$ridx] } );
561 }
562 $variants->[$oidx] = $vstruct;
6d25a3a0 563 }
564 }
88a6bac5 565 }
566
567 return { 'variants' => $variants,
568 'variant_count' => scalar @$variants,
569 'genealogical_count' => $genealogical };
570}
571
b4cb2d60 572#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
573
574sub _safe_wit_strings {
335a62ef 575 my( $graph, $c, $groupings, $witness_map ) = @_;
576 # Parse the graph we were given into a stemma.
b4cb2d60 577 my $safegraph = Graph->new();
578 # Convert the graph to a safe representation and store the conversion.
335a62ef 579 foreach my $n ( $graph->vertices ) {
b4cb2d60 580 my $sn = _safe_witstr( $n );
335a62ef 581 if( exists $witness_map->{$sn} ) {
582 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
583 if $witness_map->{$sn} ne $n;
584 } else {
585 $witness_map->{$sn} = $n;
586 }
b4cb2d60 587 $safegraph->add_vertex( $sn );
588 $safegraph->set_vertex_attributes( $sn,
335a62ef 589 $graph->get_vertex_attributes( $n ) );
b4cb2d60 590 }
335a62ef 591 foreach my $e ( $graph->edges ) {
b4cb2d60 592 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
593 $safegraph->add_edge( @safe_e );
594 }
595 my $safe_stemma = Text::Tradition::Stemma->new(
335a62ef 596 'collation' => $c, 'graph' => $safegraph );
b4cb2d60 597
598 # Now convert the witness groupings to a safe representation.
599 my $safe_groupings = [];
600 foreach my $grouping ( @$groupings ) {
601 my $safe_grouping = [];
602 foreach my $group ( @$grouping ) {
603 my $safe_group = [];
604 foreach my $n ( @$group ) {
605 my $sn = _safe_witstr( $n );
606 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
607 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
608 $witness_map->{$sn} = $n;
609 push( @$safe_group, $sn );
610 }
611 push( @$safe_grouping, $safe_group );
612 }
613 push( @$safe_groupings, $safe_grouping );
614 }
615
616 # Return it all in the struct we expect. We have stored the reductions
617 # in the $witness_map that we were passed.
335a62ef 618 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
619 'groupings' => $safe_groupings };
b4cb2d60 620}
621
622sub _safe_witstr {
623 my $witstr = shift;
624 $witstr =~ s/\s+/_/g;
625 $witstr =~ s/[^\w\d-]//g;
626 return $witstr;
627}
628
629sub _desanitize_names {
630 my( $jsonstruct, $witness_map ) = @_;
631 my $result = [];
632 foreach my $grouping ( @$jsonstruct ) {
633 my $real_grouping = [];
634 foreach my $element ( @$grouping ) {
635 if( ref( $element ) eq 'ARRAY' ) {
636 # it's the groupset.
637 my $real_groupset = [];
638 foreach my $group ( @$element ) {
639 my $real_group = [];
640 foreach my $n ( @$group ) {
641 my $rn = $witness_map->{$n};
642 push( @$real_group, $rn );
643 }
644 push( @$real_groupset, $real_group );
645 }
646 push( @$real_grouping, $real_groupset );
647 } else {
648 # It is the boolean, not actually a group.
649 push( @$real_grouping, $element );
650 }
651 }
652 push( @$result, $real_grouping );
653 }
654 return $result;
655}
656
657### END HACKERY ###
658
fae07016 659=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 660
fae07016 661Given the tradition, its stemma graph, and the solution from the graph solver,
662work out the rest of the information we want. For each reading we need missing,
663conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 664
665=cut
732152b1 666
fae07016 667sub analyze_location {
638e2a95 668 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
94654e27 669 my $c = $tradition->collation;
fae07016 670
671 # Make a hash of all known node memberships, and make the subgraphs.
672 my $contig = {};
673 my $reading_roots = {};
674 my $subgraph = {};
638e2a95 675 my $acstr = $c->ac_label;
676 my @acwits;
94654e27 677 # Note which witnesses positively belong to which group
fae07016 678 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
679 my $rid = $rdghash->{'readingid'};
638e2a95 680 foreach my $wit ( @{$rdghash->{'group'}} ) {
681 $contig->{$wit} = $rid;
682 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
683 push( @acwits, $1 );
684 }
685 }
94654e27 686 }
638e2a95 687 # Get the actual graph we should work with
d120c995 688 my $graph;
689 try {
690 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
691 } catch {
692 die "Could not extend graph with a.c. witnesses @acwits";
693 }
638e2a95 694
94654e27 695 # Now, armed with that knowledge, make a subgraph for each reading
696 # and note the root(s) of each subgraph.
697 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
698 my $rid = $rdghash->{'readingid'};
699 my %rdgwits;
fae07016 700 # Make the subgraph.
701 my $part = $graph->copy;
94654e27 702 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
703 keys %$contig;
704 $part->delete_vertices( @todelete );
705 _prune_subtree( $part, $lacunose );
fae07016 706 $subgraph->{$rid} = $part;
94654e27 707 # Record the remaining lacunose nodes as part of this group, if
708 # we are dealing with a non-genealogical reading.
709 unless( $variant_row->{'genealogical'} ) {
710 map { $contig->{$_} = $rid } $part->vertices;
711 }
fae07016 712 # Get the reading roots.
713 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 714 }
715
fae07016 716 # Now that we have all the node group memberships, calculate followed/
bebec0e9 717 # non-followed/unknown values for each reading. Also figure out the
718 # reading's evident parent(s).
719 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 720 my $rid = $rdghash->{'readingid'};
428bcf0b 721 my $rdg = $c->reading( $rid );
fae07016 722 # Get the subgraph
723 my $part = $subgraph->{$rid};
724
725 # Start figuring things out.
94654e27 726 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
727 $rdghash->{'independent_occurrence'} = \@roots;
bebec0e9 728 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
729 # Find the parent readings, if any, of this reading.
94654e27 730 my $rdgparents = {};
bebec0e9 731 foreach my $wit ( @roots ) {
f00cefe8 732 # Look in the main stemma to find this witness's extant or known-reading
733 # immediate ancestor(s), and look up the reading that each ancestor olds.
734 my @check = $graph->predecessors( $wit );
735 while( @check ) {
736 my @next;
737 foreach my $wparent( @check ) {
fae07016 738 my $preading = $contig->{$wparent};
428bcf0b 739 # IDP assigns all nodes, hypothetical included, to a reading
740 # in the case of genealogical sets. We prune non-necessary
741 # hypothetical readings, but they are still in $contig, so
742 # we account for that here.
743 if( $preading && $preading ne $rid ) {
94654e27 744 $rdgparents->{$preading} = 1;
f00cefe8 745 } else {
746 push( @next, $graph->predecessors( $wparent ) );
747 }
748 }
749 @check = @next;
750 }
bebec0e9 751 }
94654e27 752 foreach my $p ( keys %$rdgparents ) {
753 # Resolve the relationship of the parent to the reading, and
754 # save it in our hash.
755 my $pobj = $c->reading( $p );
94654e27 756 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
354cc918 757 my $phash = { 'label' => $prep };
94654e27 758 if( $pobj ) {
759 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
760 if( $rel ) {
428bcf0b 761 _add_to_hash( $rel, $phash );
762 } elsif( $rdg ) {
763 # First check for a transposed relationship
764 if( $rdg->rank != $pobj->rank ) {
765 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
766 next unless $ti->text eq $rdg->text;
767 $rel = $c->get_relationship( $ti, $pobj );
768 if( $rel ) {
769 _add_to_hash( $rel, $phash, 1 );
770 last;
771 }
772 }
773 unless( $rel ) {
774 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
775 next unless $ti->text eq $pobj->text;
776 $rel = $c->get_relationship( $ti, $rdg );
777 if( $rel ) {
778 _add_to_hash( $rel, $phash, 1 );
779 last;
780 }
781 }
782 }
354cc918 783 }
428bcf0b 784 unless( $rel ) {
785 # and then check for sheer word similarity.
786 my $rtext = $rdg->text;
787 my $ptext = $pobj->text;
788 if( similar( $rtext, $ptext ) ) {
789 # say STDERR "Words $rtext and $ptext judged similar";
790 $phash->{relation} = { type => 'wordsimilar' };
791 }
94654e27 792 }
428bcf0b 793 } else {
794 $phash->{relation} = { type => 'deletion' };
94654e27 795 }
354cc918 796 # Get the attributes of the parent object while we are here
797 $phash->{'text'} = $pobj->text if $pobj;
798 $phash->{'is_nonsense'} = $pobj->is_nonsense;
799 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
800 } elsif( $p eq '(omitted)' ) {
801 $phash->{relation} = { type => 'addition' };
7b7abf10 802 }
354cc918 803 # Save it
7b7abf10 804 $rdgparents->{$p} = $phash;
94654e27 805 }
806
807 $rdghash->{'reading_parents'} = $rdgparents;
bebec0e9 808
809 # Find the number of times this reading was altered, and the number of
810 # times we're not sure.
811 my( %nofollow, %unknownfollow );
812 foreach my $wit ( $part->vertices ) {
813 foreach my $wchild ( $graph->successors( $wit ) ) {
814 next if $part->has_vertex( $wchild );
fae07016 815 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 816 # It definitely changed here.
817 $nofollow{$wchild} = 1;
818 } elsif( !($contig->{$wchild}) ) {
819 # The child is a hypothetical node not definitely in
820 # any group. Answer is unknown.
821 $unknownfollow{$wchild} = 1;
822 } # else it's a non-root node in a known group, and therefore
823 # is presumed to have its reading from its group, not this link.
824 }
825 }
826 $rdghash->{'not_followed'} = keys %nofollow;
827 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 828
829 # Now say whether this reading represents a conflict.
830 unless( $variant_row->{'genealogical'} ) {
831 $rdghash->{'conflict'} = @roots != 1;
832 }
c4a4fb1b 833 }
d71100ed 834}
835
428bcf0b 836sub _add_to_hash {
837 my( $rel, $phash, $is_transposed ) = @_;
838 $phash->{relation} = { type => $rel->type };
839 $phash->{relation}->{transposed} = 1 if $is_transposed;
840 $phash->{relation}->{annotation} = $rel->annotation
841 if $rel->has_annotation;
842}
843
844=head2 similar( $word1, $word2 )
845
846Use Algorithm::Diff to get a sense of how close the words are to each other.
847This will hopefully handle substitutions a bit more nicely than Levenshtein.
848
849=cut
850
851#!/usr/bin/env perl
852
853sub similar {
854 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
855 my @let1 = split( '', lc( $word1 ) );
856 my @let2 = split( '', lc( $word2 ) );
857 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
858 my $mag = 0;
859 while( $diff->Next ) {
860 if( $diff->Same ) {
861 # Take off points for longer strings
862 my $cs = $diff->Range(1) - 2;
863 $cs = 0 if $cs < 0;
864 $mag -= $cs;
865 } elsif( !$diff->Items(1) ) {
866 $mag += $diff->Range(2);
867 } elsif( !$diff->Items(2) ) {
868 $mag += $diff->Range(1);
869 } else {
870 # Split the difference for substitutions
871 my $c1 = $diff->Range(1) || 1;
872 my $c2 = $diff->Range(2) || 1;
873 my $cd = ( $c1 + $c2 ) / 2;
874 $mag += $cd;
875 }
876 }
877 return ( $mag <= length( $word1 ) / 2 );
878}
879
880
fae07016 881
882=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
883
884** NOTE ** This method should hopefully not be called - it is not guaranteed
885to be correct. Serves as a backup for the real solver.
886
887Runs an analysis of the given tradition, at the location given in $rank,
888against the graph of the stemma specified in $stemma_id. The argument
889@merge_relationship_types is an optional list of relationship types for
890which readings so related should be treated as equivalent.
891
892Returns a nested array data structure as follows:
893
894 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
895
896where the group list is the array of arrays passed in for each element of @groups,
897possibly with the addition of hypothetical readings.
898
899
900=cut
901
902sub perl_solver {
335a62ef 903 my( $graph, @groups ) = @_;
e59b8faa 904 my @answer;
905 foreach my $g ( @groups ) {
906 push( @answer, _solve_variant_location( $graph, $g ) );
907 }
908 return \@answer;
fae07016 909}
910
e59b8faa 911sub _solve_variant_location {
912 my( $graph, $groups ) = @_;
fae07016 913 # Now do the work.
e59b8faa 914 my $contig = {};
915 my $subgraph = {};
916 my $is_conflicted;
917 my $conflict = {};
918
919 # Mark each ms as in its own group, first.
920 foreach my $g ( @$groups ) {
921 my $gst = wit_stringify( $g );
922 map { $contig->{$_} = $gst } @$g;
923 }
924
925 # Now for each unmarked node in the graph, initialize an array
926 # for possible group memberships. We will use this later to
927 # resolve potential conflicts.
928 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
929 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
930 my $gst = wit_stringify( $g ); # This is the group name
931 # Copy the graph, and delete all non-members from the new graph.
932 my $part = $graph->copy;
933 my @group_roots;
934 $part->delete_vertices(
935 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
936
937 # Now look to see if our group is connected.
938 if( @$g > 1 ) {
939 # We have to take directionality into account.
940 # How many root nodes do we have?
941 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
942 $part->predecessorless_vertices;
943 # Assuming that @$g > 1, find the first root node that has at
944 # least one successor belonging to our group. If this reading
945 # is genealogical, there should be only one, but we will check
946 # that implicitly later.
947 foreach my $root ( @roots ) {
948 # Prune the tree to get rid of extraneous hypotheticals.
94654e27 949 $root = _prune_subtree_old( $part, $root, $contig );
e59b8faa 950 next unless $root;
951 # Save this root for our group.
952 push( @group_roots, $root );
953 # Get all the successor nodes of our root.
954 }
955 } else {
956 # Dispense with the trivial case of one reading.
957 my $wit = $g->[0];
958 @group_roots = ( $wit );
959 foreach my $v ( $part->vertices ) {
960 $part->delete_vertex( $v ) unless $v eq $wit;
961 }
962 }
963
964 if( @group_roots > 1 ) {
965 $conflict->{$gst} = 1;
966 $is_conflicted = 1;
967 }
968 # Paint the 'hypotheticals' with our group.
969 foreach my $wit ( $part->vertices ) {
970 if( ref( $contig->{$wit} ) ) {
971 push( @{$contig->{$wit}}, $gst );
972 } elsif( $contig->{$wit} ne $gst ) {
973 warn "How did we get here?";
974 }
975 }
976
977
978 # Save the relevant subgraph.
979 $subgraph->{$gst} = $part;
980 }
981
982 # For each of our hypothetical readings, flatten its 'contig' array if
983 # the array contains zero or one group. If we have any unflattened arrays,
984 # we may need to run the resolution process. If the reading is already known
985 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
986 # it.
987 my @resolve;
988 foreach my $wit ( keys %$contig ) {
989 next unless ref( $contig->{$wit} );
990 if( @{$contig->{$wit}} > 1 ) {
991 if( $is_conflicted ) {
992 $contig->{$wit} = ''; # We aren't going to decide.
993 } else {
994 push( @resolve, $wit );
995 }
996 } else {
997 my $gst = pop @{$contig->{$wit}};
998 $contig->{$wit} = $gst || '';
999 }
1000 }
1001
1002 if( @resolve ) {
1003 my $still_contig = {};
1004 foreach my $h ( @resolve ) {
1005 # For each of the hypothetical readings with more than one possibility,
1006 # try deleting it from each of its member subgraphs in turn, and see
1007 # if that breaks the contiguous grouping.
1008 # TODO This can still break in a corner case where group A can use
1009 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
1010 # Revisit this if necessary; it could get brute-force nasty.
1011 foreach my $gst ( @{$contig->{$h}} ) {
1012 my $gpart = $subgraph->{$gst}->copy();
1013 # If we have come this far, there is only one root and everything
1014 # is reachable from it.
1015 my( $root ) = $gpart->predecessorless_vertices;
1016 my $reachable = {};
1017 map { $reachable->{$_} = 1 } $gpart->vertices;
1018
1019 # Try deleting the hypothetical node.
1020 $gpart->delete_vertex( $h );
1021 if( $h eq $root ) {
1022 # See if we still have a single root.
1023 my @roots = $gpart->predecessorless_vertices;
1024 warn "This shouldn't have happened" unless @roots;
1025 if( @roots > 1 ) {
1026 # $h is needed by this group.
1027 if( exists( $still_contig->{$h} ) ) {
1028 # Conflict!
1029 $conflict->{$gst} = 1;
1030 $still_contig->{$h} = '';
1031 } else {
1032 $still_contig->{$h} = $gst;
1033 }
1034 }
1035 } else {
1036 # $h is somewhere in the middle. See if everything
1037 # else can still be reached from the root.
1038 my %still_reachable = ( $root => 1 );
1039 map { $still_reachable{$_} = 1 }
1040 $gpart->all_successors( $root );
1041 foreach my $v ( keys %$reachable ) {
1042 next if $v eq $h;
1043 if( !$still_reachable{$v}
1044 && ( $contig->{$v} eq $gst
1045 || ( exists $still_contig->{$v}
1046 && $still_contig->{$v} eq $gst ) ) ) {
1047 # We need $h.
1048 if( exists $still_contig->{$h} ) {
1049 # Conflict!
1050 $conflict->{$gst} = 1;
1051 $still_contig->{$h} = '';
1052 } else {
1053 $still_contig->{$h} = $gst;
1054 }
1055 last;
1056 } # else we don't need $h in this group.
1057 } # end foreach $v
1058 } # endif $h eq $root
1059 } # end foreach $gst
1060 } # end foreach $h
1061
1062 # Now we have some hypothetical vertices in $still_contig that are the
1063 # "real" group memberships. Replace these in $contig.
1064 foreach my $v ( keys %$contig ) {
1065 next unless ref $contig->{$v};
1066 $contig->{$v} = $still_contig->{$v};
1067 }
1068 } # end if @resolve
1069
1070 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
1071 my $variant_row = [ [], $is_genealogical ];
1072 # Fill in the groupings from $contig.
1073 foreach my $g ( @$groups ) {
1074 my $gst = wit_stringify( $g );
1075 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
1076 push( @{$variant_row->[0]}, \@realgroup );
1077 }
1078 return $variant_row;
1079}
fae07016 1080
6d25a3a0 1081sub _prune_group {
335a62ef 1082 my( $group, $stemma, $graph ) = @_;
94654e27 1083 my $lacunose = {};
1084 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
1085 map { $lacunose->{$_} = 0 } @$group;
6d25a3a0 1086 # Make our subgraph
335a62ef 1087 my $subgraph = $graph->copy;
94654e27 1088 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
6d25a3a0 1089 $subgraph->vertices;
1090 # ...and find the root.
6d25a3a0 1091 # Now prune and return the remaining vertices.
94654e27 1092 _prune_subtree( $subgraph, $lacunose );
6d25a3a0 1093 return $subgraph->vertices;
1094}
1095
7f52eac8 1096sub _prune_subtree {
94654e27 1097 my( $tree, $lacunose ) = @_;
1098
1099 # Delete lacunose witnesses that have no successors
1100 my @orphan_hypotheticals;
1101 my $ctr = 0;
1102 do {
1103 die "Infinite loop on leaves" if $ctr > 100;
1104 @orphan_hypotheticals = grep { $lacunose->{$_} }
1105 $tree->successorless_vertices;
1106 $tree->delete_vertices( @orphan_hypotheticals );
1107 $ctr++;
1108 } while( @orphan_hypotheticals );
1109
1110 # Delete lacunose roots that have a single successor
1111 my @redundant_root;
1112 $ctr = 0;
1113 do {
1114 die "Infinite loop on roots" if $ctr > 100;
1115 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1116 $tree->predecessorless_vertices;
1117 $tree->delete_vertices( @redundant_root );
1118 $ctr++;
1119 } while( @redundant_root );
1120}
1121
1122sub _prune_subtree_old {
231d71fc 1123 my( $tree, $root, $contighash ) = @_;
1124 # First, delete hypothetical leaves / orphans until there are none left.
1125 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1126 $tree->successorless_vertices;
1127 while( @orphan_hypotheticals ) {
1128 $tree->delete_vertices( @orphan_hypotheticals );
1129 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1130 $tree->successorless_vertices;
1131 }
1132 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 1133 # root to the first child that has no other predecessors.
231d71fc 1134 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1135 my @nextroot = $tree->successors( $root );
1136 $tree->delete_vertex( $root );
bebec0e9 1137 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 1138 }
1139 # The tree has been modified in place, but we need to know the new root.
bebec0e9 1140 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 1141 return $root;
1142}
d71100ed 1143# Add the variant, subject to a.c. representation logic.
1144# This assumes that we will see the 'main' version before the a.c. version.
1145sub add_variant_wit {
1146 my( $arr, $wit, $acstr ) = @_;
1147 my $skip;
1148 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1149 my $real = $1;
1150 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1151 }
1152 push( @$arr, $wit ) unless $skip;
1153}
1154
5be0cdeb 1155sub _useful_variant {
1156 my( $group_readings, $graph, $acstr ) = @_;
1157
1158 # TODO Decide what to do with AC witnesses
1159
1160 # Sort by group size and return
1161 my $is_useful = 0;
1162 my( @readings, @groups ); # The sorted groups for our answer.
1163 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1164 keys %$group_readings ) {
1165 push( @readings, $rdg );
1166 push( @groups, $group_readings->{$rdg} );
1167 if( @{$group_readings->{$rdg}} > 1 ) {
1168 $is_useful++;
1169 } else {
1170 my( $wit ) = @{$group_readings->{$rdg}};
1171 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1172 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1173 }
1174 }
1175 if( $is_useful > 1 ) {
1176 return( \@readings, \@groups );
1177 } else {
1178 return( [], [] );
1179 }
1180}
1181
7f52eac8 1182=head2 wit_stringify( $groups )
1183
1184Takes an array of witness groupings and produces a string like
1185['A','B'] / ['C','D','E'] / ['F']
d71100ed 1186
7f52eac8 1187=cut
d71100ed 1188
1189sub wit_stringify {
1190 my $groups = shift;
1191 my @gst;
1192 # If we were passed an array of witnesses instead of an array of
1193 # groupings, then "group" the witnesses first.
1194 unless( ref( $groups->[0] ) ) {
1195 my $mkgrp = [ $groups ];
1196 $groups = $mkgrp;
1197 }
1198 foreach my $g ( @$groups ) {
1199 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1200 }
1201 return join( ' / ', @gst );
1202}
7f52eac8 1203
bebec0e9 1204sub _symmdiff {
1205 my( $lista, $listb ) = @_;
7f52eac8 1206 my %union;
1207 my %scalars;
1208 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1209 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 1210 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 1211 return map { $scalars{$_} } @set;
1212}
1213
12141;
1215
1216=head1 LICENSE
1217
1218This package is free software and is provided "as is" without express
1219or implied warranty. You can redistribute it and/or modify it under
1220the same terms as Perl itself.
1221
1222=head1 AUTHOR
1223
1224Tara L Andrews E<lt>aurum@cpan.orgE<gt>