retain original order of collation sets
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
ffa22d6f 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
7234b01d 107my $c = $tradition->collation;
f00cefe8 108foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
112 }
6d25a3a0 113 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
f00cefe8 114 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 115 # Check that we have the right row with the right groups
116 my $rank = $row->{'id'};
117 foreach my $rdghash ( @{$row->{'readings'}} ) {
118 # Skip 'readings' that aren't really
119 next unless $c->reading( $rdghash->{'readingid'} );
120 # Check the rank
121 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
122 "Got correct reading rank" );
123 # Check the witnesses
124 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
125 my @sgrp = sort @{$rdghash->{'group'}};
126 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
127 }
f00cefe8 128}
a44aaf2a 129is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 130# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 131
132=end testing
133
134=cut
135
d71100ed 136sub run_analysis {
88a6bac5 137 my( $tradition, %opts ) = @_;
f00cefe8 138 my $c = $tradition->collation;
88a6bac5 139
140 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 141 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
142 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 143
144 # Get the stemma
145 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 146
4ce27d42 147 # Figure out which witnesses we are working with - that is, the ones that
148 # appear both in the stemma and in the tradition. All others are 'lacunose'
149 # for our purposes.
88a6bac5 150 my @lacunose = $stemma->hypotheticals;
fae07016 151 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
fae07016 152 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 153
154 # Find and mark 'common' ranks for exclusion, unless they were
155 # explicitly specified.
156 unless( @ranks ) {
157 my %common_rank;
a44aaf2a 158 foreach my $rdg ( $c->common_readings ) {
88a6bac5 159 $common_rank{$rdg->rank} = 1;
160 }
161 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 162 }
7f52eac8 163
88a6bac5 164 # Group the variants to send to the solver
165 my @groups;
f629cb3b 166 my @use_ranks;
a44aaf2a 167 my %lacunae;
88a6bac5 168 foreach my $rank ( @ranks ) {
a44aaf2a 169 my $missing = [ @lacunose ];
ffa22d6f 170 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
171 if( $opts{'exclude_type1'} ) {
172 # Check to see whether this is a "useful" group.
173 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
174 $stemma->graph, $c->ac_label );
175 next unless @$rdgs;
176 }
f629cb3b 177 push( @use_ranks, $rank );
ffa22d6f 178 push( @groups, $rankgroup );
a44aaf2a 179 $lacunae{$rank} = $missing;
d71100ed 180 }
4ce27d42 181 # Run the solver
e59b8faa 182 my $answer = solve_variants( $stemma, @groups );
fae07016 183
88a6bac5 184 # Do further analysis on the answer
a44aaf2a 185 my $conflict_count = 0;
7234b01d 186 my $aclabel = $c->ac_label;
f629cb3b 187 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 188 my $location = $answer->{'variants'}->[$idx];
189 # Add the rank back in
f629cb3b 190 $location->{'id'} = $use_ranks[$idx];
7234b01d 191 # Note what our lacunae are
f629cb3b 192 my %lmiss;
7234b01d 193 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
194 # Run through the reading groups and add as 'lacunae' any redundant
195 # a.c. witnesses (yes, we have to do this before the analysis, thus
196 # identical loops before and after. Boo.)
197 # TODO Consider making these callbacks to analyze_location
198 foreach my $rdghash ( @{$location->{'readings'}} ) {
199 my %rwits;
200 map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
201 foreach my $rw ( keys %rwits ) {
202 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
203 if( exists $rwits{$1} ) {
204 $lmiss{$rw} = 1;
205 delete $rwits{$rw};
206 }
207 }
208 }
209 $rdghash->{'group'} = [ keys %rwits ];
210 }
211 $location->{'missing'} = [ keys %lmiss ];
212
88a6bac5 213 # Run the extra analysis we need.
88a6bac5 214 analyze_location( $tradition, $stemma->graph, $location );
7234b01d 215
216 # Do the final post-analysis tidying up of the data.
a44aaf2a 217 foreach my $rdghash ( @{$location->{'readings'}} ) {
218 $conflict_count++
219 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
f629cb3b 220 # Add the reading text back in
a44aaf2a 221 my $rdg = $c->reading( $rdghash->{'readingid'} );
222 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
f629cb3b 223 # Remove lacunose witnesses from this reading's list now that the
7234b01d 224 # analysis is done
f629cb3b 225 my @realgroup;
7234b01d 226 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 227 $rdghash->{'group'} = \@realgroup;
7234b01d 228 # TODO Record hypotheticals used to create group, if we end up
229 # needing it
a44aaf2a 230 }
88a6bac5 231 }
a44aaf2a 232 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 233
88a6bac5 234 return $answer;
d71100ed 235}
236
7f52eac8 237=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
238
239Groups the variants at the given $rank of the collation, treating any
240relationships in @merge_relationship_types as equivalent. $lacunose should
241be a reference to an array, to which the sigla of lacunose witnesses at this
242rank will be appended.
243
ffa22d6f 244Returns a hash $group_readings where $rdg is attested by the witnesses listed
245in $group_readings->{$rdg}.
7f52eac8 246
247=cut
248
249# Return group_readings, groups, lacunose
d1348d38 250sub group_variants {
7f52eac8 251 my( $tradition, $rank, $lacunose, $collapse ) = @_;
252 my $c = $tradition->collation;
335a62ef 253 my $aclabel = $c->ac_label;
254
7f52eac8 255 # Get the alignment table readings
256 my %readings_at_rank;
ffa22d6f 257 my %is_lacunose; # lookup table for $lacunose
258 map { $is_lacunose{$_} = 1 } @$lacunose;
7f52eac8 259 my @gap_wits;
1d73ecad 260 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 261 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 262 my $wit = $tablewit->{'witness'};
ffa22d6f 263 # Exclude the witness if it is "lacunose" which if we got here
264 # means "not in the stemma".
265 next if $is_lacunose{$wit};
7f52eac8 266 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 267 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 268 } elsif( $rdg ) {
269 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
270 } else {
335a62ef 271 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 272 }
273 }
d1348d38 274
7f52eac8 275 # Group the readings, collapsing groups by relationship if needed
276 my %grouped_readings;
4ce27d42 277 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 278 # Skip readings that have been collapsed into others.
f00cefe8 279 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
4ce27d42 280 # Get the witness list, including from readings collapsed into this one.
7f52eac8 281 my @wits = $rdg->witnesses;
282 if( $collapse ) {
283 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
284 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 285 my @otherwits = $other->witnesses;
fae07016 286 push( @wits, @otherwits );
f00cefe8 287 $grouped_readings{$other->id} = 0;
d1348d38 288 }
289 }
335a62ef 290 # Filter the group to those witnesses in the stemma
4ce27d42 291 my @use_wits;
292 foreach my $wit ( @wits ) {
293 next if $is_lacunose{$wit};
294 push( @use_wits, $wit );
4ce27d42 295 }
ffa22d6f 296 $grouped_readings{$rdg->id} = \@use_wits;
7f52eac8 297 }
298 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
299 # Get rid of our collapsed readings
300 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
301 keys %grouped_readings
302 if $collapse;
303
4ce27d42 304 # Return the result
5be0cdeb 305 return \%grouped_readings;
d1348d38 306}
307
335a62ef 308# Helper function to ensure that X and X a.c. never appear in the same list.
309sub _add_to_witlist {
310 my( $wit, $list, $acstr ) = @_;
311 my %inlist;
312 my $idx = 0;
313 map { $inlist{$_} = $idx++ } @$list;
314 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
315 my $acwit = $1;
316 unless( exists $inlist{$acwit} ) {
317 push( @$list, $acwit.$acstr );
318 }
319 } else {
320 if( exists( $inlist{$wit.$acstr} ) ) {
321 # Replace the a.c. version with the main witness
322 my $i = $inlist{$wit.$acstr};
323 $list->[$i] = $wit;
324 } else {
325 push( @$list, $wit );
326 }
327 }
328}
329
88a6bac5 330=head2 solve_variants( $graph, @groups )
331
332Sends the set of groups to the external graph solver service and returns
333a cleaned-up answer, adding the rank IDs back where they belong.
334
335The JSON has the form
336 { "graph": [ stemmagraph DOT string without newlines ],
337 "groupings": [ array of arrays of groups, one per rank ] }
338
339The answer has the form
340 { "variants" => [ array of variant location structures ],
341 "variant_count" => total,
342 "conflict_count" => number of conflicts detected,
343 "genealogical_count" => number of solutions found }
344
345=cut
346
347sub solve_variants {
e59b8faa 348 my( $stemma, @groups ) = @_;
335a62ef 349 my $aclabel = $stemma->collation->ac_label;
350
351 # Filter the groups down to distinct groups, and work out what graph
352 # should be used in the calculation of each group. We want to send each
353 # distinct problem to the solver only once.
354 # We need a whole bunch of lookup tables for this.
355 my $index_groupkeys = {}; # Save the order of readings
356 my $group_indices = {}; # Save the indices that have a given grouping
357 my $graph_problems = {}; # Save the groupings for the given graph
358
359 foreach my $idx ( 0..$#groups ) {
360 my $ghash = $groups[$idx];
88a6bac5 361 my @grouping;
335a62ef 362 # Sort the groupings from big to little, and scan for a.c. witnesses
363 # that would need an extended graph.
364 my @acwits; # note which AC witnesses crop up at this rank
365 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
366 keys %$ghash;
367 foreach my $rdg ( @idxkeys ) {
368 my @sg = sort @{$ghash->{$rdg}};
369 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
370 push( @grouping, \@sg );
371 }
372 # Save the reading order
373 $index_groupkeys->{$idx} = \@idxkeys;
374
375 # Now associate the distinct group with this index
376 my $gstr = wit_stringify( \@grouping );
377 push( @{$group_indices->{$gstr}}, $idx );
378
379 # Finally, add the group to the list to be calculated for this graph.
380 map { s/\Q$aclabel\E$// } @acwits;
381 my $graph = $stemma->extend_graph( \@acwits );
382 unless( exists $graph_problems->{"$graph"} ) {
383 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
88a6bac5 384 }
335a62ef 385 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
88a6bac5 386 }
335a62ef 387
388 ## For each distinct graph, send its groups to the solver.
389 $DB::single = 1;
88a6bac5 390 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
391 my $ua = LWP::UserAgent->new();
335a62ef 392 ## Witness map is a HACK to get around limitations in node names from IDP
393 my $witness_map = {};
394 ## Variables to store answers as they come back
395 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
88a6bac5 396 my $genealogical = 0;
335a62ef 397 foreach my $graphkey ( keys %$graph_problems ) {
398 my $graph = $graph_problems->{$graphkey}->{'object'};
399 my $groupings = $graph_problems->{$graphkey}->{'groups'};
400 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
401 $groupings, $witness_map ) );
402 # Send it off and get the result
403 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
404 'Content' => $json );
405 my $answer;
406 my $used_idp;
407 if( $resp->is_success ) {
408 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
409 $used_idp = 1;
410 } else {
411 # Fall back to the old method.
412 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
413 . "; falling back to perl method";
414 $answer = perl_solver( $graph, @$groupings );
415 }
416 ## The answer is the evaluated groupings, plus a boolean for whether
417 ## they were genealogical. Reconstruct our original groups.
418 foreach my $gidx ( 0 .. $#{$groupings} ) {
419 my( $calc_groups, $result ) = @{$answer->[$gidx]};
420 if( $result ) {
421 $genealogical++;
422 # Prune the calculated groups, in case the IDP solver failed to.
423 if( $used_idp ) {
424 my @pruned_groups;
425 foreach my $cg ( @$calc_groups ) {
426 # This is a little wasteful but the path of least
427 # resistance. Send both the stemma, which knows what
428 # its hypotheticals are, and the actual graph used.
429 my @pg = _prune_group( $cg, $stemma, $graph );
430 push( @pruned_groups, \@pg );
431 }
432 $calc_groups = \@pruned_groups;
6d25a3a0 433 }
335a62ef 434 }
435 # Retrieve the key for the original group that went to the solver
436 my $input_group = wit_stringify( $groupings->[$gidx] );
437 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
438 my @readings = @{$index_groupkeys->{$oidx}};
439 my $vstruct = {
440 'genealogical' => $result,
441 'readings' => [],
442 };
443 foreach my $ridx ( 0 .. $#readings ) {
444 push( @{$vstruct->{'readings'}},
445 { 'readingid' => $readings[$ridx],
446 'group' => $calc_groups->[$ridx] } );
447 }
448 $variants->[$oidx] = $vstruct;
6d25a3a0 449 }
450 }
88a6bac5 451 }
452
453 return { 'variants' => $variants,
454 'variant_count' => scalar @$variants,
455 'genealogical_count' => $genealogical };
456}
457
b4cb2d60 458#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
459
460sub _safe_wit_strings {
335a62ef 461 my( $graph, $c, $groupings, $witness_map ) = @_;
462 # Parse the graph we were given into a stemma.
b4cb2d60 463 my $safegraph = Graph->new();
464 # Convert the graph to a safe representation and store the conversion.
335a62ef 465 foreach my $n ( $graph->vertices ) {
b4cb2d60 466 my $sn = _safe_witstr( $n );
335a62ef 467 if( exists $witness_map->{$sn} ) {
468 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
469 if $witness_map->{$sn} ne $n;
470 } else {
471 $witness_map->{$sn} = $n;
472 }
b4cb2d60 473 $safegraph->add_vertex( $sn );
474 $safegraph->set_vertex_attributes( $sn,
335a62ef 475 $graph->get_vertex_attributes( $n ) );
b4cb2d60 476 }
335a62ef 477 foreach my $e ( $graph->edges ) {
b4cb2d60 478 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
479 $safegraph->add_edge( @safe_e );
480 }
481 my $safe_stemma = Text::Tradition::Stemma->new(
335a62ef 482 'collation' => $c, 'graph' => $safegraph );
b4cb2d60 483
484 # Now convert the witness groupings to a safe representation.
485 my $safe_groupings = [];
486 foreach my $grouping ( @$groupings ) {
487 my $safe_grouping = [];
488 foreach my $group ( @$grouping ) {
489 my $safe_group = [];
490 foreach my $n ( @$group ) {
491 my $sn = _safe_witstr( $n );
492 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
493 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
494 $witness_map->{$sn} = $n;
495 push( @$safe_group, $sn );
496 }
497 push( @$safe_grouping, $safe_group );
498 }
499 push( @$safe_groupings, $safe_grouping );
500 }
501
502 # Return it all in the struct we expect. We have stored the reductions
503 # in the $witness_map that we were passed.
335a62ef 504 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
505 'groupings' => $safe_groupings };
b4cb2d60 506}
507
508sub _safe_witstr {
509 my $witstr = shift;
510 $witstr =~ s/\s+/_/g;
511 $witstr =~ s/[^\w\d-]//g;
512 return $witstr;
513}
514
515sub _desanitize_names {
516 my( $jsonstruct, $witness_map ) = @_;
517 my $result = [];
518 foreach my $grouping ( @$jsonstruct ) {
519 my $real_grouping = [];
520 foreach my $element ( @$grouping ) {
521 if( ref( $element ) eq 'ARRAY' ) {
522 # it's the groupset.
523 my $real_groupset = [];
524 foreach my $group ( @$element ) {
525 my $real_group = [];
526 foreach my $n ( @$group ) {
527 my $rn = $witness_map->{$n};
528 push( @$real_group, $rn );
529 }
530 push( @$real_groupset, $real_group );
531 }
532 push( @$real_grouping, $real_groupset );
533 } else {
534 # It is the boolean, not actually a group.
535 push( @$real_grouping, $element );
536 }
537 }
538 push( @$result, $real_grouping );
539 }
540 return $result;
541}
542
543### END HACKERY ###
544
fae07016 545=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 546
fae07016 547Given the tradition, its stemma graph, and the solution from the graph solver,
548work out the rest of the information we want. For each reading we need missing,
549conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 550
551=cut
732152b1 552
fae07016 553sub analyze_location {
554 my ( $tradition, $graph, $variant_row ) = @_;
555
556 # Make a hash of all known node memberships, and make the subgraphs.
557 my $contig = {};
558 my $reading_roots = {};
559 my $subgraph = {};
560 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
561 my $rid = $rdghash->{'readingid'};
562 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 563
fae07016 564 # Make the subgraph.
565 my $part = $graph->copy;
566 my %these_vertices;
567 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
568 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
569 $subgraph->{$rid} = $part;
570 # Get the reading roots.
571 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 572 }
573
fae07016 574 # Now that we have all the node group memberships, calculate followed/
bebec0e9 575 # non-followed/unknown values for each reading. Also figure out the
576 # reading's evident parent(s).
577 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 578 # Group string key - TODO do we need this?
579 my $gst = wit_stringify( $rdghash->{'group'} );
580 my $rid = $rdghash->{'readingid'};
581 # Get the subgraph
582 my $part = $subgraph->{$rid};
583
584 # Start figuring things out.
bebec0e9 585 my @roots = $part->predecessorless_vertices;
586 $rdghash->{'independent_occurrence'} = scalar @roots;
587 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
588 # Find the parent readings, if any, of this reading.
f00cefe8 589 my %rdgparents;
bebec0e9 590 foreach my $wit ( @roots ) {
f00cefe8 591 # Look in the main stemma to find this witness's extant or known-reading
592 # immediate ancestor(s), and look up the reading that each ancestor olds.
593 my @check = $graph->predecessors( $wit );
594 while( @check ) {
595 my @next;
596 foreach my $wparent( @check ) {
fae07016 597 my $preading = $contig->{$wparent};
598 if( $preading ) {
599 $rdgparents{$preading} = 1;
f00cefe8 600 } else {
601 push( @next, $graph->predecessors( $wparent ) );
602 }
603 }
604 @check = @next;
605 }
bebec0e9 606 }
f00cefe8 607 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 608
609 # Find the number of times this reading was altered, and the number of
610 # times we're not sure.
611 my( %nofollow, %unknownfollow );
612 foreach my $wit ( $part->vertices ) {
613 foreach my $wchild ( $graph->successors( $wit ) ) {
614 next if $part->has_vertex( $wchild );
fae07016 615 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 616 # It definitely changed here.
617 $nofollow{$wchild} = 1;
618 } elsif( !($contig->{$wchild}) ) {
619 # The child is a hypothetical node not definitely in
620 # any group. Answer is unknown.
621 $unknownfollow{$wchild} = 1;
622 } # else it's a non-root node in a known group, and therefore
623 # is presumed to have its reading from its group, not this link.
624 }
625 }
626 $rdghash->{'not_followed'} = keys %nofollow;
627 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 628
629 # Now say whether this reading represents a conflict.
630 unless( $variant_row->{'genealogical'} ) {
631 $rdghash->{'conflict'} = @roots != 1;
632 }
c4a4fb1b 633 }
d71100ed 634}
635
fae07016 636
637=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
638
639** NOTE ** This method should hopefully not be called - it is not guaranteed
640to be correct. Serves as a backup for the real solver.
641
642Runs an analysis of the given tradition, at the location given in $rank,
643against the graph of the stemma specified in $stemma_id. The argument
644@merge_relationship_types is an optional list of relationship types for
645which readings so related should be treated as equivalent.
646
647Returns a nested array data structure as follows:
648
649 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
650
651where the group list is the array of arrays passed in for each element of @groups,
652possibly with the addition of hypothetical readings.
653
654
655=cut
656
657sub perl_solver {
335a62ef 658 my( $graph, @groups ) = @_;
e59b8faa 659 my @answer;
660 foreach my $g ( @groups ) {
661 push( @answer, _solve_variant_location( $graph, $g ) );
662 }
663 return \@answer;
fae07016 664}
665
e59b8faa 666sub _solve_variant_location {
667 my( $graph, $groups ) = @_;
fae07016 668 # Now do the work.
e59b8faa 669 my $contig = {};
670 my $subgraph = {};
671 my $is_conflicted;
672 my $conflict = {};
673
674 # Mark each ms as in its own group, first.
675 foreach my $g ( @$groups ) {
676 my $gst = wit_stringify( $g );
677 map { $contig->{$_} = $gst } @$g;
678 }
679
680 # Now for each unmarked node in the graph, initialize an array
681 # for possible group memberships. We will use this later to
682 # resolve potential conflicts.
683 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
684 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
685 my $gst = wit_stringify( $g ); # This is the group name
686 # Copy the graph, and delete all non-members from the new graph.
687 my $part = $graph->copy;
688 my @group_roots;
689 $part->delete_vertices(
690 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
691
692 # Now look to see if our group is connected.
693 if( @$g > 1 ) {
694 # We have to take directionality into account.
695 # How many root nodes do we have?
696 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
697 $part->predecessorless_vertices;
698 # Assuming that @$g > 1, find the first root node that has at
699 # least one successor belonging to our group. If this reading
700 # is genealogical, there should be only one, but we will check
701 # that implicitly later.
702 foreach my $root ( @roots ) {
703 # Prune the tree to get rid of extraneous hypotheticals.
704 $root = _prune_subtree( $part, $root, $contig );
705 next unless $root;
706 # Save this root for our group.
707 push( @group_roots, $root );
708 # Get all the successor nodes of our root.
709 }
710 } else {
711 # Dispense with the trivial case of one reading.
712 my $wit = $g->[0];
713 @group_roots = ( $wit );
714 foreach my $v ( $part->vertices ) {
715 $part->delete_vertex( $v ) unless $v eq $wit;
716 }
717 }
718
719 if( @group_roots > 1 ) {
720 $conflict->{$gst} = 1;
721 $is_conflicted = 1;
722 }
723 # Paint the 'hypotheticals' with our group.
724 foreach my $wit ( $part->vertices ) {
725 if( ref( $contig->{$wit} ) ) {
726 push( @{$contig->{$wit}}, $gst );
727 } elsif( $contig->{$wit} ne $gst ) {
728 warn "How did we get here?";
729 }
730 }
731
732
733 # Save the relevant subgraph.
734 $subgraph->{$gst} = $part;
735 }
736
737 # For each of our hypothetical readings, flatten its 'contig' array if
738 # the array contains zero or one group. If we have any unflattened arrays,
739 # we may need to run the resolution process. If the reading is already known
740 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
741 # it.
742 my @resolve;
743 foreach my $wit ( keys %$contig ) {
744 next unless ref( $contig->{$wit} );
745 if( @{$contig->{$wit}} > 1 ) {
746 if( $is_conflicted ) {
747 $contig->{$wit} = ''; # We aren't going to decide.
748 } else {
749 push( @resolve, $wit );
750 }
751 } else {
752 my $gst = pop @{$contig->{$wit}};
753 $contig->{$wit} = $gst || '';
754 }
755 }
756
757 if( @resolve ) {
758 my $still_contig = {};
759 foreach my $h ( @resolve ) {
760 # For each of the hypothetical readings with more than one possibility,
761 # try deleting it from each of its member subgraphs in turn, and see
762 # if that breaks the contiguous grouping.
763 # TODO This can still break in a corner case where group A can use
764 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
765 # Revisit this if necessary; it could get brute-force nasty.
766 foreach my $gst ( @{$contig->{$h}} ) {
767 my $gpart = $subgraph->{$gst}->copy();
768 # If we have come this far, there is only one root and everything
769 # is reachable from it.
770 my( $root ) = $gpart->predecessorless_vertices;
771 my $reachable = {};
772 map { $reachable->{$_} = 1 } $gpart->vertices;
773
774 # Try deleting the hypothetical node.
775 $gpart->delete_vertex( $h );
776 if( $h eq $root ) {
777 # See if we still have a single root.
778 my @roots = $gpart->predecessorless_vertices;
779 warn "This shouldn't have happened" unless @roots;
780 if( @roots > 1 ) {
781 # $h is needed by this group.
782 if( exists( $still_contig->{$h} ) ) {
783 # Conflict!
784 $conflict->{$gst} = 1;
785 $still_contig->{$h} = '';
786 } else {
787 $still_contig->{$h} = $gst;
788 }
789 }
790 } else {
791 # $h is somewhere in the middle. See if everything
792 # else can still be reached from the root.
793 my %still_reachable = ( $root => 1 );
794 map { $still_reachable{$_} = 1 }
795 $gpart->all_successors( $root );
796 foreach my $v ( keys %$reachable ) {
797 next if $v eq $h;
798 if( !$still_reachable{$v}
799 && ( $contig->{$v} eq $gst
800 || ( exists $still_contig->{$v}
801 && $still_contig->{$v} eq $gst ) ) ) {
802 # We need $h.
803 if( exists $still_contig->{$h} ) {
804 # Conflict!
805 $conflict->{$gst} = 1;
806 $still_contig->{$h} = '';
807 } else {
808 $still_contig->{$h} = $gst;
809 }
810 last;
811 } # else we don't need $h in this group.
812 } # end foreach $v
813 } # endif $h eq $root
814 } # end foreach $gst
815 } # end foreach $h
816
817 # Now we have some hypothetical vertices in $still_contig that are the
818 # "real" group memberships. Replace these in $contig.
819 foreach my $v ( keys %$contig ) {
820 next unless ref $contig->{$v};
821 $contig->{$v} = $still_contig->{$v};
822 }
823 } # end if @resolve
824
825 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
826 my $variant_row = [ [], $is_genealogical ];
827 # Fill in the groupings from $contig.
828 foreach my $g ( @$groups ) {
829 my $gst = wit_stringify( $g );
830 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
831 push( @{$variant_row->[0]}, \@realgroup );
832 }
833 return $variant_row;
834}
fae07016 835
6d25a3a0 836sub _prune_group {
335a62ef 837 my( $group, $stemma, $graph ) = @_;
6d25a3a0 838 # Get these into a form prune_subtree will recognize. Make a "contighash"
839 my $hypohash = {};
840 map { $hypohash->{$_} = 1 } @$group;
841 # ...with reference values for hypotheticals.
842 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
843 # Make our subgraph
335a62ef 844 my $subgraph = $graph->copy;
6d25a3a0 845 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
846 $subgraph->vertices;
847 # ...and find the root.
848 my( $root ) = $subgraph->predecessorless_vertices;
849 # Now prune and return the remaining vertices.
850 _prune_subtree( $subgraph, $root, $hypohash );
851 return $subgraph->vertices;
852}
853
7f52eac8 854sub _prune_subtree {
231d71fc 855 my( $tree, $root, $contighash ) = @_;
856 # First, delete hypothetical leaves / orphans until there are none left.
857 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
858 $tree->successorless_vertices;
859 while( @orphan_hypotheticals ) {
860 $tree->delete_vertices( @orphan_hypotheticals );
861 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
862 $tree->successorless_vertices;
863 }
864 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 865 # root to the first child that has no other predecessors.
231d71fc 866 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
867 my @nextroot = $tree->successors( $root );
868 $tree->delete_vertex( $root );
bebec0e9 869 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 870 }
871 # The tree has been modified in place, but we need to know the new root.
bebec0e9 872 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 873 return $root;
874}
d71100ed 875# Add the variant, subject to a.c. representation logic.
876# This assumes that we will see the 'main' version before the a.c. version.
877sub add_variant_wit {
878 my( $arr, $wit, $acstr ) = @_;
879 my $skip;
880 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
881 my $real = $1;
882 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
883 }
884 push( @$arr, $wit ) unless $skip;
885}
886
5be0cdeb 887sub _useful_variant {
888 my( $group_readings, $graph, $acstr ) = @_;
889
890 # TODO Decide what to do with AC witnesses
891
892 # Sort by group size and return
893 my $is_useful = 0;
894 my( @readings, @groups ); # The sorted groups for our answer.
895 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
896 keys %$group_readings ) {
897 push( @readings, $rdg );
898 push( @groups, $group_readings->{$rdg} );
899 if( @{$group_readings->{$rdg}} > 1 ) {
900 $is_useful++;
901 } else {
902 my( $wit ) = @{$group_readings->{$rdg}};
903 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
904 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
905 }
906 }
907 if( $is_useful > 1 ) {
908 return( \@readings, \@groups );
909 } else {
910 return( [], [] );
911 }
912}
913
7f52eac8 914=head2 wit_stringify( $groups )
915
916Takes an array of witness groupings and produces a string like
917['A','B'] / ['C','D','E'] / ['F']
d71100ed 918
7f52eac8 919=cut
d71100ed 920
921sub wit_stringify {
922 my $groups = shift;
923 my @gst;
924 # If we were passed an array of witnesses instead of an array of
925 # groupings, then "group" the witnesses first.
926 unless( ref( $groups->[0] ) ) {
927 my $mkgrp = [ $groups ];
928 $groups = $mkgrp;
929 }
930 foreach my $g ( @$groups ) {
931 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
932 }
933 return join( ' / ', @gst );
934}
7f52eac8 935
bebec0e9 936sub _symmdiff {
937 my( $lista, $listb ) = @_;
7f52eac8 938 my %union;
939 my %scalars;
940 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
941 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 942 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 943 return map { $scalars{$_} } @set;
944}
945
9461;
947
948=head1 LICENSE
949
950This package is free software and is provided "as is" without express
951or implied warranty. You can redistribute it and/or modify it under
952the same terms as Perl itself.
953
954=head1 AUTHOR
955
956Tara L Andrews E<lt>aurum@cpan.orgE<gt>