script to link readings based on morphological tags
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
ffa22d6f 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
7234b01d 107my $c = $tradition->collation;
f00cefe8 108foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
112 }
18f48b82 113 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
114 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 115 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 116 # Check that we have the right row with the right groups
117 my $rank = $row->{'id'};
118 foreach my $rdghash ( @{$row->{'readings'}} ) {
119 # Skip 'readings' that aren't really
120 next unless $c->reading( $rdghash->{'readingid'} );
121 # Check the rank
122 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
123 "Got correct reading rank" );
124 # Check the witnesses
125 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
126 my @sgrp = sort @{$rdghash->{'group'}};
127 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
128 }
f00cefe8 129}
a44aaf2a 130is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 131# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 132
133=end testing
134
135=cut
136
d71100ed 137sub run_analysis {
88a6bac5 138 my( $tradition, %opts ) = @_;
f00cefe8 139 my $c = $tradition->collation;
88a6bac5 140
141 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 142 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
143 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 144
145 # Get the stemma
146 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 147
4ce27d42 148 # Figure out which witnesses we are working with - that is, the ones that
149 # appear both in the stemma and in the tradition. All others are 'lacunose'
150 # for our purposes.
88a6bac5 151 my @lacunose = $stemma->hypotheticals;
fae07016 152 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
fae07016 153 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 154
155 # Find and mark 'common' ranks for exclusion, unless they were
156 # explicitly specified.
157 unless( @ranks ) {
158 my %common_rank;
a44aaf2a 159 foreach my $rdg ( $c->common_readings ) {
88a6bac5 160 $common_rank{$rdg->rank} = 1;
161 }
162 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 163 }
7f52eac8 164
88a6bac5 165 # Group the variants to send to the solver
166 my @groups;
f629cb3b 167 my @use_ranks;
a44aaf2a 168 my %lacunae;
88a6bac5 169 foreach my $rank ( @ranks ) {
a44aaf2a 170 my $missing = [ @lacunose ];
ffa22d6f 171 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
172 if( $opts{'exclude_type1'} ) {
173 # Check to see whether this is a "useful" group.
174 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
175 $stemma->graph, $c->ac_label );
176 next unless @$rdgs;
177 }
f629cb3b 178 push( @use_ranks, $rank );
ffa22d6f 179 push( @groups, $rankgroup );
a44aaf2a 180 $lacunae{$rank} = $missing;
d71100ed 181 }
4ce27d42 182 # Run the solver
e59b8faa 183 my $answer = solve_variants( $stemma, @groups );
fae07016 184
88a6bac5 185 # Do further analysis on the answer
a44aaf2a 186 my $conflict_count = 0;
7234b01d 187 my $aclabel = $c->ac_label;
f629cb3b 188 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 189 my $location = $answer->{'variants'}->[$idx];
190 # Add the rank back in
f629cb3b 191 $location->{'id'} = $use_ranks[$idx];
7234b01d 192 # Note what our lacunae are
f629cb3b 193 my %lmiss;
7234b01d 194 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
195 # Run through the reading groups and add as 'lacunae' any redundant
196 # a.c. witnesses (yes, we have to do this before the analysis, thus
197 # identical loops before and after. Boo.)
198 # TODO Consider making these callbacks to analyze_location
199 foreach my $rdghash ( @{$location->{'readings'}} ) {
200 my %rwits;
201 map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
202 foreach my $rw ( keys %rwits ) {
203 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
204 if( exists $rwits{$1} ) {
205 $lmiss{$rw} = 1;
206 delete $rwits{$rw};
207 }
208 }
209 }
210 $rdghash->{'group'} = [ keys %rwits ];
211 }
212 $location->{'missing'} = [ keys %lmiss ];
213
88a6bac5 214 # Run the extra analysis we need.
88a6bac5 215 analyze_location( $tradition, $stemma->graph, $location );
7234b01d 216
217 # Do the final post-analysis tidying up of the data.
a44aaf2a 218 foreach my $rdghash ( @{$location->{'readings'}} ) {
219 $conflict_count++
220 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
f629cb3b 221 # Add the reading text back in
a44aaf2a 222 my $rdg = $c->reading( $rdghash->{'readingid'} );
223 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
f629cb3b 224 # Remove lacunose witnesses from this reading's list now that the
7234b01d 225 # analysis is done
f629cb3b 226 my @realgroup;
7234b01d 227 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 228 $rdghash->{'group'} = \@realgroup;
7234b01d 229 # TODO Record hypotheticals used to create group, if we end up
230 # needing it
a44aaf2a 231 }
88a6bac5 232 }
a44aaf2a 233 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 234
88a6bac5 235 return $answer;
d71100ed 236}
237
7f52eac8 238=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
239
240Groups the variants at the given $rank of the collation, treating any
241relationships in @merge_relationship_types as equivalent. $lacunose should
242be a reference to an array, to which the sigla of lacunose witnesses at this
243rank will be appended.
244
ffa22d6f 245Returns a hash $group_readings where $rdg is attested by the witnesses listed
246in $group_readings->{$rdg}.
7f52eac8 247
248=cut
249
250# Return group_readings, groups, lacunose
d1348d38 251sub group_variants {
7f52eac8 252 my( $tradition, $rank, $lacunose, $collapse ) = @_;
253 my $c = $tradition->collation;
335a62ef 254 my $aclabel = $c->ac_label;
255
7f52eac8 256 # Get the alignment table readings
257 my %readings_at_rank;
ffa22d6f 258 my %is_lacunose; # lookup table for $lacunose
259 map { $is_lacunose{$_} = 1 } @$lacunose;
7f52eac8 260 my @gap_wits;
1d73ecad 261 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 262 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 263 my $wit = $tablewit->{'witness'};
ffa22d6f 264 # Exclude the witness if it is "lacunose" which if we got here
265 # means "not in the stemma".
266 next if $is_lacunose{$wit};
7f52eac8 267 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 268 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 269 } elsif( $rdg ) {
270 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
271 } else {
335a62ef 272 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 273 }
274 }
d1348d38 275
7f52eac8 276 # Group the readings, collapsing groups by relationship if needed
277 my %grouped_readings;
4ce27d42 278 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 279 # Skip readings that have been collapsed into others.
f00cefe8 280 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
4ce27d42 281 # Get the witness list, including from readings collapsed into this one.
7f52eac8 282 my @wits = $rdg->witnesses;
283 if( $collapse ) {
284 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
285 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 286 my @otherwits = $other->witnesses;
fae07016 287 push( @wits, @otherwits );
f00cefe8 288 $grouped_readings{$other->id} = 0;
d1348d38 289 }
290 }
335a62ef 291 # Filter the group to those witnesses in the stemma
4ce27d42 292 my @use_wits;
293 foreach my $wit ( @wits ) {
294 next if $is_lacunose{$wit};
295 push( @use_wits, $wit );
4ce27d42 296 }
ffa22d6f 297 $grouped_readings{$rdg->id} = \@use_wits;
7f52eac8 298 }
299 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
300 # Get rid of our collapsed readings
301 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
302 keys %grouped_readings
303 if $collapse;
304
4ce27d42 305 # Return the result
5be0cdeb 306 return \%grouped_readings;
d1348d38 307}
308
335a62ef 309# Helper function to ensure that X and X a.c. never appear in the same list.
310sub _add_to_witlist {
311 my( $wit, $list, $acstr ) = @_;
312 my %inlist;
313 my $idx = 0;
314 map { $inlist{$_} = $idx++ } @$list;
315 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
316 my $acwit = $1;
317 unless( exists $inlist{$acwit} ) {
318 push( @$list, $acwit.$acstr );
319 }
320 } else {
321 if( exists( $inlist{$wit.$acstr} ) ) {
322 # Replace the a.c. version with the main witness
323 my $i = $inlist{$wit.$acstr};
324 $list->[$i] = $wit;
325 } else {
326 push( @$list, $wit );
327 }
328 }
329}
330
88a6bac5 331=head2 solve_variants( $graph, @groups )
332
333Sends the set of groups to the external graph solver service and returns
334a cleaned-up answer, adding the rank IDs back where they belong.
335
336The JSON has the form
337 { "graph": [ stemmagraph DOT string without newlines ],
338 "groupings": [ array of arrays of groups, one per rank ] }
339
340The answer has the form
341 { "variants" => [ array of variant location structures ],
342 "variant_count" => total,
343 "conflict_count" => number of conflicts detected,
344 "genealogical_count" => number of solutions found }
345
346=cut
347
348sub solve_variants {
e59b8faa 349 my( $stemma, @groups ) = @_;
335a62ef 350 my $aclabel = $stemma->collation->ac_label;
351
352 # Filter the groups down to distinct groups, and work out what graph
353 # should be used in the calculation of each group. We want to send each
354 # distinct problem to the solver only once.
355 # We need a whole bunch of lookup tables for this.
356 my $index_groupkeys = {}; # Save the order of readings
357 my $group_indices = {}; # Save the indices that have a given grouping
358 my $graph_problems = {}; # Save the groupings for the given graph
359
360 foreach my $idx ( 0..$#groups ) {
361 my $ghash = $groups[$idx];
88a6bac5 362 my @grouping;
335a62ef 363 # Sort the groupings from big to little, and scan for a.c. witnesses
364 # that would need an extended graph.
365 my @acwits; # note which AC witnesses crop up at this rank
366 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
367 keys %$ghash;
368 foreach my $rdg ( @idxkeys ) {
369 my @sg = sort @{$ghash->{$rdg}};
370 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
371 push( @grouping, \@sg );
372 }
373 # Save the reading order
374 $index_groupkeys->{$idx} = \@idxkeys;
375
376 # Now associate the distinct group with this index
377 my $gstr = wit_stringify( \@grouping );
378 push( @{$group_indices->{$gstr}}, $idx );
379
380 # Finally, add the group to the list to be calculated for this graph.
381 map { s/\Q$aclabel\E$// } @acwits;
382 my $graph = $stemma->extend_graph( \@acwits );
383 unless( exists $graph_problems->{"$graph"} ) {
384 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
88a6bac5 385 }
335a62ef 386 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
88a6bac5 387 }
335a62ef 388
389 ## For each distinct graph, send its groups to the solver.
390 $DB::single = 1;
88a6bac5 391 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
392 my $ua = LWP::UserAgent->new();
335a62ef 393 ## Witness map is a HACK to get around limitations in node names from IDP
394 my $witness_map = {};
395 ## Variables to store answers as they come back
396 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
88a6bac5 397 my $genealogical = 0;
335a62ef 398 foreach my $graphkey ( keys %$graph_problems ) {
399 my $graph = $graph_problems->{$graphkey}->{'object'};
400 my $groupings = $graph_problems->{$graphkey}->{'groups'};
401 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
402 $groupings, $witness_map ) );
403 # Send it off and get the result
404 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
405 'Content' => $json );
406 my $answer;
407 my $used_idp;
408 if( $resp->is_success ) {
409 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
410 $used_idp = 1;
411 } else {
412 # Fall back to the old method.
413 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
414 . "; falling back to perl method";
415 $answer = perl_solver( $graph, @$groupings );
416 }
417 ## The answer is the evaluated groupings, plus a boolean for whether
418 ## they were genealogical. Reconstruct our original groups.
419 foreach my $gidx ( 0 .. $#{$groupings} ) {
420 my( $calc_groups, $result ) = @{$answer->[$gidx]};
421 if( $result ) {
422 $genealogical++;
423 # Prune the calculated groups, in case the IDP solver failed to.
424 if( $used_idp ) {
425 my @pruned_groups;
426 foreach my $cg ( @$calc_groups ) {
427 # This is a little wasteful but the path of least
428 # resistance. Send both the stemma, which knows what
429 # its hypotheticals are, and the actual graph used.
430 my @pg = _prune_group( $cg, $stemma, $graph );
431 push( @pruned_groups, \@pg );
432 }
433 $calc_groups = \@pruned_groups;
6d25a3a0 434 }
335a62ef 435 }
436 # Retrieve the key for the original group that went to the solver
437 my $input_group = wit_stringify( $groupings->[$gidx] );
438 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
439 my @readings = @{$index_groupkeys->{$oidx}};
440 my $vstruct = {
441 'genealogical' => $result,
442 'readings' => [],
443 };
444 foreach my $ridx ( 0 .. $#readings ) {
445 push( @{$vstruct->{'readings'}},
446 { 'readingid' => $readings[$ridx],
447 'group' => $calc_groups->[$ridx] } );
448 }
449 $variants->[$oidx] = $vstruct;
6d25a3a0 450 }
451 }
88a6bac5 452 }
453
454 return { 'variants' => $variants,
455 'variant_count' => scalar @$variants,
456 'genealogical_count' => $genealogical };
457}
458
b4cb2d60 459#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
460
461sub _safe_wit_strings {
335a62ef 462 my( $graph, $c, $groupings, $witness_map ) = @_;
463 # Parse the graph we were given into a stemma.
b4cb2d60 464 my $safegraph = Graph->new();
465 # Convert the graph to a safe representation and store the conversion.
335a62ef 466 foreach my $n ( $graph->vertices ) {
b4cb2d60 467 my $sn = _safe_witstr( $n );
335a62ef 468 if( exists $witness_map->{$sn} ) {
469 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
470 if $witness_map->{$sn} ne $n;
471 } else {
472 $witness_map->{$sn} = $n;
473 }
b4cb2d60 474 $safegraph->add_vertex( $sn );
475 $safegraph->set_vertex_attributes( $sn,
335a62ef 476 $graph->get_vertex_attributes( $n ) );
b4cb2d60 477 }
335a62ef 478 foreach my $e ( $graph->edges ) {
b4cb2d60 479 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
480 $safegraph->add_edge( @safe_e );
481 }
482 my $safe_stemma = Text::Tradition::Stemma->new(
335a62ef 483 'collation' => $c, 'graph' => $safegraph );
b4cb2d60 484
485 # Now convert the witness groupings to a safe representation.
486 my $safe_groupings = [];
487 foreach my $grouping ( @$groupings ) {
488 my $safe_grouping = [];
489 foreach my $group ( @$grouping ) {
490 my $safe_group = [];
491 foreach my $n ( @$group ) {
492 my $sn = _safe_witstr( $n );
493 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
494 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
495 $witness_map->{$sn} = $n;
496 push( @$safe_group, $sn );
497 }
498 push( @$safe_grouping, $safe_group );
499 }
500 push( @$safe_groupings, $safe_grouping );
501 }
502
503 # Return it all in the struct we expect. We have stored the reductions
504 # in the $witness_map that we were passed.
335a62ef 505 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
506 'groupings' => $safe_groupings };
b4cb2d60 507}
508
509sub _safe_witstr {
510 my $witstr = shift;
511 $witstr =~ s/\s+/_/g;
512 $witstr =~ s/[^\w\d-]//g;
513 return $witstr;
514}
515
516sub _desanitize_names {
517 my( $jsonstruct, $witness_map ) = @_;
518 my $result = [];
519 foreach my $grouping ( @$jsonstruct ) {
520 my $real_grouping = [];
521 foreach my $element ( @$grouping ) {
522 if( ref( $element ) eq 'ARRAY' ) {
523 # it's the groupset.
524 my $real_groupset = [];
525 foreach my $group ( @$element ) {
526 my $real_group = [];
527 foreach my $n ( @$group ) {
528 my $rn = $witness_map->{$n};
529 push( @$real_group, $rn );
530 }
531 push( @$real_groupset, $real_group );
532 }
533 push( @$real_grouping, $real_groupset );
534 } else {
535 # It is the boolean, not actually a group.
536 push( @$real_grouping, $element );
537 }
538 }
539 push( @$result, $real_grouping );
540 }
541 return $result;
542}
543
544### END HACKERY ###
545
fae07016 546=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 547
fae07016 548Given the tradition, its stemma graph, and the solution from the graph solver,
549work out the rest of the information we want. For each reading we need missing,
550conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 551
552=cut
732152b1 553
fae07016 554sub analyze_location {
555 my ( $tradition, $graph, $variant_row ) = @_;
556
557 # Make a hash of all known node memberships, and make the subgraphs.
558 my $contig = {};
559 my $reading_roots = {};
560 my $subgraph = {};
561 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
562 my $rid = $rdghash->{'readingid'};
563 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 564
fae07016 565 # Make the subgraph.
566 my $part = $graph->copy;
567 my %these_vertices;
568 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
569 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
570 $subgraph->{$rid} = $part;
571 # Get the reading roots.
572 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 573 }
574
fae07016 575 # Now that we have all the node group memberships, calculate followed/
bebec0e9 576 # non-followed/unknown values for each reading. Also figure out the
577 # reading's evident parent(s).
578 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 579 # Group string key - TODO do we need this?
580 my $gst = wit_stringify( $rdghash->{'group'} );
581 my $rid = $rdghash->{'readingid'};
582 # Get the subgraph
583 my $part = $subgraph->{$rid};
584
585 # Start figuring things out.
bebec0e9 586 my @roots = $part->predecessorless_vertices;
587 $rdghash->{'independent_occurrence'} = scalar @roots;
588 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
589 # Find the parent readings, if any, of this reading.
f00cefe8 590 my %rdgparents;
bebec0e9 591 foreach my $wit ( @roots ) {
f00cefe8 592 # Look in the main stemma to find this witness's extant or known-reading
593 # immediate ancestor(s), and look up the reading that each ancestor olds.
594 my @check = $graph->predecessors( $wit );
595 while( @check ) {
596 my @next;
597 foreach my $wparent( @check ) {
fae07016 598 my $preading = $contig->{$wparent};
599 if( $preading ) {
600 $rdgparents{$preading} = 1;
f00cefe8 601 } else {
602 push( @next, $graph->predecessors( $wparent ) );
603 }
604 }
605 @check = @next;
606 }
bebec0e9 607 }
f00cefe8 608 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 609
610 # Find the number of times this reading was altered, and the number of
611 # times we're not sure.
612 my( %nofollow, %unknownfollow );
613 foreach my $wit ( $part->vertices ) {
614 foreach my $wchild ( $graph->successors( $wit ) ) {
615 next if $part->has_vertex( $wchild );
fae07016 616 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 617 # It definitely changed here.
618 $nofollow{$wchild} = 1;
619 } elsif( !($contig->{$wchild}) ) {
620 # The child is a hypothetical node not definitely in
621 # any group. Answer is unknown.
622 $unknownfollow{$wchild} = 1;
623 } # else it's a non-root node in a known group, and therefore
624 # is presumed to have its reading from its group, not this link.
625 }
626 }
627 $rdghash->{'not_followed'} = keys %nofollow;
628 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 629
630 # Now say whether this reading represents a conflict.
631 unless( $variant_row->{'genealogical'} ) {
632 $rdghash->{'conflict'} = @roots != 1;
633 }
c4a4fb1b 634 }
d71100ed 635}
636
fae07016 637
638=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
639
640** NOTE ** This method should hopefully not be called - it is not guaranteed
641to be correct. Serves as a backup for the real solver.
642
643Runs an analysis of the given tradition, at the location given in $rank,
644against the graph of the stemma specified in $stemma_id. The argument
645@merge_relationship_types is an optional list of relationship types for
646which readings so related should be treated as equivalent.
647
648Returns a nested array data structure as follows:
649
650 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
651
652where the group list is the array of arrays passed in for each element of @groups,
653possibly with the addition of hypothetical readings.
654
655
656=cut
657
658sub perl_solver {
335a62ef 659 my( $graph, @groups ) = @_;
e59b8faa 660 my @answer;
661 foreach my $g ( @groups ) {
662 push( @answer, _solve_variant_location( $graph, $g ) );
663 }
664 return \@answer;
fae07016 665}
666
e59b8faa 667sub _solve_variant_location {
668 my( $graph, $groups ) = @_;
fae07016 669 # Now do the work.
e59b8faa 670 my $contig = {};
671 my $subgraph = {};
672 my $is_conflicted;
673 my $conflict = {};
674
675 # Mark each ms as in its own group, first.
676 foreach my $g ( @$groups ) {
677 my $gst = wit_stringify( $g );
678 map { $contig->{$_} = $gst } @$g;
679 }
680
681 # Now for each unmarked node in the graph, initialize an array
682 # for possible group memberships. We will use this later to
683 # resolve potential conflicts.
684 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
685 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
686 my $gst = wit_stringify( $g ); # This is the group name
687 # Copy the graph, and delete all non-members from the new graph.
688 my $part = $graph->copy;
689 my @group_roots;
690 $part->delete_vertices(
691 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
692
693 # Now look to see if our group is connected.
694 if( @$g > 1 ) {
695 # We have to take directionality into account.
696 # How many root nodes do we have?
697 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
698 $part->predecessorless_vertices;
699 # Assuming that @$g > 1, find the first root node that has at
700 # least one successor belonging to our group. If this reading
701 # is genealogical, there should be only one, but we will check
702 # that implicitly later.
703 foreach my $root ( @roots ) {
704 # Prune the tree to get rid of extraneous hypotheticals.
705 $root = _prune_subtree( $part, $root, $contig );
706 next unless $root;
707 # Save this root for our group.
708 push( @group_roots, $root );
709 # Get all the successor nodes of our root.
710 }
711 } else {
712 # Dispense with the trivial case of one reading.
713 my $wit = $g->[0];
714 @group_roots = ( $wit );
715 foreach my $v ( $part->vertices ) {
716 $part->delete_vertex( $v ) unless $v eq $wit;
717 }
718 }
719
720 if( @group_roots > 1 ) {
721 $conflict->{$gst} = 1;
722 $is_conflicted = 1;
723 }
724 # Paint the 'hypotheticals' with our group.
725 foreach my $wit ( $part->vertices ) {
726 if( ref( $contig->{$wit} ) ) {
727 push( @{$contig->{$wit}}, $gst );
728 } elsif( $contig->{$wit} ne $gst ) {
729 warn "How did we get here?";
730 }
731 }
732
733
734 # Save the relevant subgraph.
735 $subgraph->{$gst} = $part;
736 }
737
738 # For each of our hypothetical readings, flatten its 'contig' array if
739 # the array contains zero or one group. If we have any unflattened arrays,
740 # we may need to run the resolution process. If the reading is already known
741 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
742 # it.
743 my @resolve;
744 foreach my $wit ( keys %$contig ) {
745 next unless ref( $contig->{$wit} );
746 if( @{$contig->{$wit}} > 1 ) {
747 if( $is_conflicted ) {
748 $contig->{$wit} = ''; # We aren't going to decide.
749 } else {
750 push( @resolve, $wit );
751 }
752 } else {
753 my $gst = pop @{$contig->{$wit}};
754 $contig->{$wit} = $gst || '';
755 }
756 }
757
758 if( @resolve ) {
759 my $still_contig = {};
760 foreach my $h ( @resolve ) {
761 # For each of the hypothetical readings with more than one possibility,
762 # try deleting it from each of its member subgraphs in turn, and see
763 # if that breaks the contiguous grouping.
764 # TODO This can still break in a corner case where group A can use
765 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
766 # Revisit this if necessary; it could get brute-force nasty.
767 foreach my $gst ( @{$contig->{$h}} ) {
768 my $gpart = $subgraph->{$gst}->copy();
769 # If we have come this far, there is only one root and everything
770 # is reachable from it.
771 my( $root ) = $gpart->predecessorless_vertices;
772 my $reachable = {};
773 map { $reachable->{$_} = 1 } $gpart->vertices;
774
775 # Try deleting the hypothetical node.
776 $gpart->delete_vertex( $h );
777 if( $h eq $root ) {
778 # See if we still have a single root.
779 my @roots = $gpart->predecessorless_vertices;
780 warn "This shouldn't have happened" unless @roots;
781 if( @roots > 1 ) {
782 # $h is needed by this group.
783 if( exists( $still_contig->{$h} ) ) {
784 # Conflict!
785 $conflict->{$gst} = 1;
786 $still_contig->{$h} = '';
787 } else {
788 $still_contig->{$h} = $gst;
789 }
790 }
791 } else {
792 # $h is somewhere in the middle. See if everything
793 # else can still be reached from the root.
794 my %still_reachable = ( $root => 1 );
795 map { $still_reachable{$_} = 1 }
796 $gpart->all_successors( $root );
797 foreach my $v ( keys %$reachable ) {
798 next if $v eq $h;
799 if( !$still_reachable{$v}
800 && ( $contig->{$v} eq $gst
801 || ( exists $still_contig->{$v}
802 && $still_contig->{$v} eq $gst ) ) ) {
803 # We need $h.
804 if( exists $still_contig->{$h} ) {
805 # Conflict!
806 $conflict->{$gst} = 1;
807 $still_contig->{$h} = '';
808 } else {
809 $still_contig->{$h} = $gst;
810 }
811 last;
812 } # else we don't need $h in this group.
813 } # end foreach $v
814 } # endif $h eq $root
815 } # end foreach $gst
816 } # end foreach $h
817
818 # Now we have some hypothetical vertices in $still_contig that are the
819 # "real" group memberships. Replace these in $contig.
820 foreach my $v ( keys %$contig ) {
821 next unless ref $contig->{$v};
822 $contig->{$v} = $still_contig->{$v};
823 }
824 } # end if @resolve
825
826 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
827 my $variant_row = [ [], $is_genealogical ];
828 # Fill in the groupings from $contig.
829 foreach my $g ( @$groups ) {
830 my $gst = wit_stringify( $g );
831 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
832 push( @{$variant_row->[0]}, \@realgroup );
833 }
834 return $variant_row;
835}
fae07016 836
6d25a3a0 837sub _prune_group {
335a62ef 838 my( $group, $stemma, $graph ) = @_;
6d25a3a0 839 # Get these into a form prune_subtree will recognize. Make a "contighash"
840 my $hypohash = {};
841 map { $hypohash->{$_} = 1 } @$group;
842 # ...with reference values for hypotheticals.
843 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
844 # Make our subgraph
335a62ef 845 my $subgraph = $graph->copy;
6d25a3a0 846 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
847 $subgraph->vertices;
848 # ...and find the root.
849 my( $root ) = $subgraph->predecessorless_vertices;
850 # Now prune and return the remaining vertices.
851 _prune_subtree( $subgraph, $root, $hypohash );
852 return $subgraph->vertices;
853}
854
7f52eac8 855sub _prune_subtree {
231d71fc 856 my( $tree, $root, $contighash ) = @_;
857 # First, delete hypothetical leaves / orphans until there are none left.
858 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
859 $tree->successorless_vertices;
860 while( @orphan_hypotheticals ) {
861 $tree->delete_vertices( @orphan_hypotheticals );
862 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
863 $tree->successorless_vertices;
864 }
865 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 866 # root to the first child that has no other predecessors.
231d71fc 867 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
868 my @nextroot = $tree->successors( $root );
869 $tree->delete_vertex( $root );
bebec0e9 870 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 871 }
872 # The tree has been modified in place, but we need to know the new root.
bebec0e9 873 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 874 return $root;
875}
d71100ed 876# Add the variant, subject to a.c. representation logic.
877# This assumes that we will see the 'main' version before the a.c. version.
878sub add_variant_wit {
879 my( $arr, $wit, $acstr ) = @_;
880 my $skip;
881 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
882 my $real = $1;
883 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
884 }
885 push( @$arr, $wit ) unless $skip;
886}
887
5be0cdeb 888sub _useful_variant {
889 my( $group_readings, $graph, $acstr ) = @_;
890
891 # TODO Decide what to do with AC witnesses
892
893 # Sort by group size and return
894 my $is_useful = 0;
895 my( @readings, @groups ); # The sorted groups for our answer.
896 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
897 keys %$group_readings ) {
898 push( @readings, $rdg );
899 push( @groups, $group_readings->{$rdg} );
900 if( @{$group_readings->{$rdg}} > 1 ) {
901 $is_useful++;
902 } else {
903 my( $wit ) = @{$group_readings->{$rdg}};
904 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
905 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
906 }
907 }
908 if( $is_useful > 1 ) {
909 return( \@readings, \@groups );
910 } else {
911 return( [], [] );
912 }
913}
914
7f52eac8 915=head2 wit_stringify( $groups )
916
917Takes an array of witness groupings and produces a string like
918['A','B'] / ['C','D','E'] / ['F']
d71100ed 919
7f52eac8 920=cut
d71100ed 921
922sub wit_stringify {
923 my $groups = shift;
924 my @gst;
925 # If we were passed an array of witnesses instead of an array of
926 # groupings, then "group" the witnesses first.
927 unless( ref( $groups->[0] ) ) {
928 my $mkgrp = [ $groups ];
929 $groups = $mkgrp;
930 }
931 foreach my $g ( @$groups ) {
932 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
933 }
934 return join( ' / ', @gst );
935}
7f52eac8 936
bebec0e9 937sub _symmdiff {
938 my( $lista, $listb ) = @_;
7f52eac8 939 my %union;
940 my %scalars;
941 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
942 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 943 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 944 return map { $scalars{$_} } @set;
945}
946
9471;
948
949=head1 LICENSE
950
951This package is free software and is provided "as is" without express
952or implied warranty. You can redistribute it and/or modify it under
953the same terms as Perl itself.
954
955=head1 AUTHOR
956
957Tara L Andrews E<lt>aurum@cpan.orgE<gt>