fix a.c. handling on empty lemmata
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
ffa22d6f 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
7234b01d 107my $c = $tradition->collation;
f00cefe8 108foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
112 }
6d25a3a0 113 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
f00cefe8 114 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 115 # Check that we have the right row with the right groups
116 my $rank = $row->{'id'};
117 foreach my $rdghash ( @{$row->{'readings'}} ) {
118 # Skip 'readings' that aren't really
119 next unless $c->reading( $rdghash->{'readingid'} );
120 # Check the rank
121 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
122 "Got correct reading rank" );
123 # Check the witnesses
124 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
125 my @sgrp = sort @{$rdghash->{'group'}};
126 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
127 }
f00cefe8 128}
a44aaf2a 129is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 130# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 131
132=end testing
133
134=cut
135
d71100ed 136sub run_analysis {
88a6bac5 137 my( $tradition, %opts ) = @_;
f00cefe8 138 my $c = $tradition->collation;
88a6bac5 139
140 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 141 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
142 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 143
144 # Get the stemma
145 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 146
4ce27d42 147 # Figure out which witnesses we are working with - that is, the ones that
148 # appear both in the stemma and in the tradition. All others are 'lacunose'
149 # for our purposes.
88a6bac5 150 my @lacunose = $stemma->hypotheticals;
fae07016 151 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
b4cb2d60 152 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
fae07016 153 $tradition->witnesses;
154 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 155
156 # Find and mark 'common' ranks for exclusion, unless they were
157 # explicitly specified.
158 unless( @ranks ) {
159 my %common_rank;
a44aaf2a 160 foreach my $rdg ( $c->common_readings ) {
88a6bac5 161 $common_rank{$rdg->rank} = 1;
162 }
163 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 164 }
7f52eac8 165
88a6bac5 166 # Group the variants to send to the solver
167 my @groups;
f629cb3b 168 my @use_ranks;
a44aaf2a 169 my %lacunae;
88a6bac5 170 foreach my $rank ( @ranks ) {
a44aaf2a 171 my $missing = [ @lacunose ];
ffa22d6f 172 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
173 if( $opts{'exclude_type1'} ) {
174 # Check to see whether this is a "useful" group.
175 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
176 $stemma->graph, $c->ac_label );
177 next unless @$rdgs;
178 }
f629cb3b 179 push( @use_ranks, $rank );
ffa22d6f 180 push( @groups, $rankgroup );
a44aaf2a 181 $lacunae{$rank} = $missing;
d71100ed 182 }
4ce27d42 183 # Run the solver
e59b8faa 184 my $answer = solve_variants( $stemma, @groups );
fae07016 185
88a6bac5 186 # Do further analysis on the answer
a44aaf2a 187 my $conflict_count = 0;
7234b01d 188 my $aclabel = $c->ac_label;
f629cb3b 189 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 190 my $location = $answer->{'variants'}->[$idx];
191 # Add the rank back in
f629cb3b 192 $location->{'id'} = $use_ranks[$idx];
4ce27d42 193 $DB::single = 1 if $use_ranks[$idx] == 87;
7234b01d 194 # Note what our lacunae are
f629cb3b 195 my %lmiss;
7234b01d 196 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
197 # Run through the reading groups and add as 'lacunae' any redundant
198 # a.c. witnesses (yes, we have to do this before the analysis, thus
199 # identical loops before and after. Boo.)
200 # TODO Consider making these callbacks to analyze_location
201 foreach my $rdghash ( @{$location->{'readings'}} ) {
202 my %rwits;
203 map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
204 foreach my $rw ( keys %rwits ) {
205 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
206 if( exists $rwits{$1} ) {
207 $lmiss{$rw} = 1;
208 delete $rwits{$rw};
209 }
210 }
211 }
212 $rdghash->{'group'} = [ keys %rwits ];
213 }
214 $location->{'missing'} = [ keys %lmiss ];
215
88a6bac5 216 # Run the extra analysis we need.
88a6bac5 217 analyze_location( $tradition, $stemma->graph, $location );
7234b01d 218
219 # Do the final post-analysis tidying up of the data.
a44aaf2a 220 foreach my $rdghash ( @{$location->{'readings'}} ) {
221 $conflict_count++
222 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
f629cb3b 223 # Add the reading text back in
a44aaf2a 224 my $rdg = $c->reading( $rdghash->{'readingid'} );
225 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
f629cb3b 226 # Remove lacunose witnesses from this reading's list now that the
7234b01d 227 # analysis is done
f629cb3b 228 my @realgroup;
7234b01d 229 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 230 $rdghash->{'group'} = \@realgroup;
7234b01d 231 # TODO Record hypotheticals used to create group, if we end up
232 # needing it
a44aaf2a 233 }
88a6bac5 234 }
a44aaf2a 235 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 236
88a6bac5 237 return $answer;
d71100ed 238}
239
7f52eac8 240=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
241
242Groups the variants at the given $rank of the collation, treating any
243relationships in @merge_relationship_types as equivalent. $lacunose should
244be a reference to an array, to which the sigla of lacunose witnesses at this
245rank will be appended.
246
ffa22d6f 247Returns a hash $group_readings where $rdg is attested by the witnesses listed
248in $group_readings->{$rdg}.
7f52eac8 249
250=cut
251
252# Return group_readings, groups, lacunose
d1348d38 253sub group_variants {
7f52eac8 254 my( $tradition, $rank, $lacunose, $collapse ) = @_;
255 my $c = $tradition->collation;
b4cb2d60 256 my $aclabel = $c->ac_label;
4ce27d42 257 my %seen_acwits;
258 map { $seen_acwits{$_->sigil.$aclabel} = 0 if $_->is_layered } $tradition->witnesses;
7f52eac8 259 # Get the alignment table readings
260 my %readings_at_rank;
ffa22d6f 261 my %is_lacunose; # lookup table for $lacunose
262 map { $is_lacunose{$_} = 1 } @$lacunose;
7f52eac8 263 my @gap_wits;
1d73ecad 264 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 265 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 266 my $wit = $tablewit->{'witness'};
ffa22d6f 267 # Exclude the witness if it is "lacunose" which if we got here
268 # means "not in the stemma".
269 next if $is_lacunose{$wit};
7f52eac8 270 if( $rdg && $rdg->{'t'}->is_lacuna ) {
4ce27d42 271 push( @$lacunose, $wit );
7f52eac8 272 } elsif( $rdg ) {
273 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
274 } else {
4ce27d42 275 $seen_acwits{$wit} = 1 if exists $seen_acwits{$wit};
276 push( @gap_wits, $wit );
7f52eac8 277 }
278 }
d1348d38 279
7f52eac8 280 # Group the readings, collapsing groups by relationship if needed
281 my %grouped_readings;
4ce27d42 282 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 283 # Skip readings that have been collapsed into others.
f00cefe8 284 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
4ce27d42 285 # Get the witness list, including from readings collapsed into this one.
7f52eac8 286 my @wits = $rdg->witnesses;
287 if( $collapse ) {
288 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
289 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 290 my @otherwits = $other->witnesses;
fae07016 291 push( @wits, @otherwits );
f00cefe8 292 $grouped_readings{$other->id} = 0;
d1348d38 293 }
294 }
4ce27d42 295 # Filter the group to those witnesses in the stemma, and note any
296 # a.c. witnesses explicitly returned.
297 my @use_wits;
298 foreach my $wit ( @wits ) {
299 next if $is_lacunose{$wit};
300 push( @use_wits, $wit );
301 $seen_acwits{$wit} = 1 if exists $seen_acwits{$wit};
302 }
ffa22d6f 303 $grouped_readings{$rdg->id} = \@use_wits;
7f52eac8 304 }
305 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
306 # Get rid of our collapsed readings
307 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
308 keys %grouped_readings
309 if $collapse;
4ce27d42 310 # Any unseen a.c. witnesses should be made lacunose
311 map { push( @$lacunose, $_ ) unless $seen_acwits{$_} } keys %seen_acwits;
7f52eac8 312
4ce27d42 313 # Return the result
5be0cdeb 314 return \%grouped_readings;
d1348d38 315}
316
88a6bac5 317=head2 solve_variants( $graph, @groups )
318
319Sends the set of groups to the external graph solver service and returns
320a cleaned-up answer, adding the rank IDs back where they belong.
321
322The JSON has the form
323 { "graph": [ stemmagraph DOT string without newlines ],
324 "groupings": [ array of arrays of groups, one per rank ] }
325
326The answer has the form
327 { "variants" => [ array of variant location structures ],
328 "variant_count" => total,
329 "conflict_count" => number of conflicts detected,
330 "genealogical_count" => number of solutions found }
331
332=cut
333
334sub solve_variants {
e59b8faa 335 my( $stemma, @groups ) = @_;
88a6bac5 336
337 # Make the json with stemma + groups
b4cb2d60 338 my $groupings = [];
88a6bac5 339 foreach my $ghash ( @groups ) {
340 my @grouping;
7234b01d 341 foreach my $k ( sort keys %$ghash ) {
88a6bac5 342 push( @grouping, $ghash->{$k} );
343 }
b4cb2d60 344 push( @$groupings, \@grouping );
88a6bac5 345 }
b4cb2d60 346 ## Witness map is a HACK to get around limitations in node names from IDP
347 my $witness_map = {};
348 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
88a6bac5 349
350 # Send it off and get the result
351 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
352 my $ua = LWP::UserAgent->new();
353 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
354 'Content' => $json );
355
356 my $answer;
6d25a3a0 357 my $used_idp;
88a6bac5 358 if( $resp->is_success ) {
b4cb2d60 359 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
6d25a3a0 360 $used_idp = 1;
88a6bac5 361 } else {
fae07016 362 # Fall back to the old method.
363 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
364 . "; falling back to perl method";
b4cb2d60 365 $answer = perl_solver( $stemma, @$groupings );
88a6bac5 366 }
367
368 # Fold the result back into what we know about the groups.
369 my $variants = [];
370 my $genealogical = 0;
371 foreach my $idx ( 0 .. $#groups ) {
372 my( $calc_groups, $result ) = @{$answer->[$idx]};
6d25a3a0 373 if( $result ) {
374 $genealogical++;
375 # Prune the calculated groups, in case the IDP solver failed to.
376 if( $used_idp ) {
377 my @pruned_groups;
378 foreach my $cg ( @$calc_groups ) {
379 my @pg = _prune_group( $cg, $stemma );
380 push( @pruned_groups, \@pg );
381 }
382 $calc_groups = \@pruned_groups;
383 }
384 }
88a6bac5 385 my $input_group = $groups[$idx];
7234b01d 386 foreach my $k ( sort keys %$input_group ) {
88a6bac5 387 my $cg = shift @$calc_groups;
388 $input_group->{$k} = $cg;
389 }
390 my $vstruct = {
391 'genealogical' => $result,
392 'readings' => [],
fae07016 393 };
f629cb3b 394 foreach my $k ( sort { @{$input_group->{$b}} <=> @{$input_group->{$a}} }
395 keys %$input_group ) {
88a6bac5 396 push( @{$vstruct->{'readings'}},
fae07016 397 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 398 }
399 push( @$variants, $vstruct );
400 }
401
402 return { 'variants' => $variants,
403 'variant_count' => scalar @$variants,
404 'genealogical_count' => $genealogical };
405}
406
b4cb2d60 407#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
408
409sub _safe_wit_strings {
410 my( $stemma, $groupings, $witness_map ) = @_;
411 my $safegraph = Graph->new();
412 # Convert the graph to a safe representation and store the conversion.
413 foreach my $n ( $stemma->graph->vertices ) {
414 my $sn = _safe_witstr( $n );
415 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
416 if exists $witness_map->{$sn};
417 $witness_map->{$sn} = $n;
418 $safegraph->add_vertex( $sn );
419 $safegraph->set_vertex_attributes( $sn,
420 $stemma->graph->get_vertex_attributes( $n ) );
421 }
422 foreach my $e ( $stemma->graph->edges ) {
423 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
424 $safegraph->add_edge( @safe_e );
425 }
426 my $safe_stemma = Text::Tradition::Stemma->new(
427 'collation' => $stemma->collation, 'graph' => $safegraph );
428
429 # Now convert the witness groupings to a safe representation.
430 my $safe_groupings = [];
431 foreach my $grouping ( @$groupings ) {
432 my $safe_grouping = [];
433 foreach my $group ( @$grouping ) {
434 my $safe_group = [];
435 foreach my $n ( @$group ) {
436 my $sn = _safe_witstr( $n );
437 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
438 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
439 $witness_map->{$sn} = $n;
440 push( @$safe_group, $sn );
441 }
442 push( @$safe_grouping, $safe_group );
443 }
444 push( @$safe_groupings, $safe_grouping );
445 }
446
447 # Return it all in the struct we expect. We have stored the reductions
448 # in the $witness_map that we were passed.
449 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
450}
451
452sub _safe_witstr {
453 my $witstr = shift;
454 $witstr =~ s/\s+/_/g;
455 $witstr =~ s/[^\w\d-]//g;
456 return $witstr;
457}
458
459sub _desanitize_names {
460 my( $jsonstruct, $witness_map ) = @_;
461 my $result = [];
462 foreach my $grouping ( @$jsonstruct ) {
463 my $real_grouping = [];
464 foreach my $element ( @$grouping ) {
465 if( ref( $element ) eq 'ARRAY' ) {
466 # it's the groupset.
467 my $real_groupset = [];
468 foreach my $group ( @$element ) {
469 my $real_group = [];
470 foreach my $n ( @$group ) {
471 my $rn = $witness_map->{$n};
472 push( @$real_group, $rn );
473 }
474 push( @$real_groupset, $real_group );
475 }
476 push( @$real_grouping, $real_groupset );
477 } else {
478 # It is the boolean, not actually a group.
479 push( @$real_grouping, $element );
480 }
481 }
482 push( @$result, $real_grouping );
483 }
484 return $result;
485}
486
487### END HACKERY ###
488
fae07016 489=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 490
fae07016 491Given the tradition, its stemma graph, and the solution from the graph solver,
492work out the rest of the information we want. For each reading we need missing,
493conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 494
495=cut
732152b1 496
fae07016 497sub analyze_location {
498 my ( $tradition, $graph, $variant_row ) = @_;
499
500 # Make a hash of all known node memberships, and make the subgraphs.
501 my $contig = {};
502 my $reading_roots = {};
503 my $subgraph = {};
504 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
505 my $rid = $rdghash->{'readingid'};
506 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 507
fae07016 508 # Make the subgraph.
509 my $part = $graph->copy;
510 my %these_vertices;
511 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
512 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
513 $subgraph->{$rid} = $part;
514 # Get the reading roots.
515 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 516 }
517
fae07016 518 # Now that we have all the node group memberships, calculate followed/
bebec0e9 519 # non-followed/unknown values for each reading. Also figure out the
520 # reading's evident parent(s).
521 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 522 # Group string key - TODO do we need this?
523 my $gst = wit_stringify( $rdghash->{'group'} );
524 my $rid = $rdghash->{'readingid'};
525 # Get the subgraph
526 my $part = $subgraph->{$rid};
527
528 # Start figuring things out.
bebec0e9 529 my @roots = $part->predecessorless_vertices;
530 $rdghash->{'independent_occurrence'} = scalar @roots;
531 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
532 # Find the parent readings, if any, of this reading.
f00cefe8 533 my %rdgparents;
bebec0e9 534 foreach my $wit ( @roots ) {
f00cefe8 535 # Look in the main stemma to find this witness's extant or known-reading
536 # immediate ancestor(s), and look up the reading that each ancestor olds.
537 my @check = $graph->predecessors( $wit );
538 while( @check ) {
539 my @next;
540 foreach my $wparent( @check ) {
fae07016 541 my $preading = $contig->{$wparent};
542 if( $preading ) {
543 $rdgparents{$preading} = 1;
f00cefe8 544 } else {
545 push( @next, $graph->predecessors( $wparent ) );
546 }
547 }
548 @check = @next;
549 }
bebec0e9 550 }
f00cefe8 551 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 552
553 # Find the number of times this reading was altered, and the number of
554 # times we're not sure.
555 my( %nofollow, %unknownfollow );
556 foreach my $wit ( $part->vertices ) {
557 foreach my $wchild ( $graph->successors( $wit ) ) {
558 next if $part->has_vertex( $wchild );
fae07016 559 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 560 # It definitely changed here.
561 $nofollow{$wchild} = 1;
562 } elsif( !($contig->{$wchild}) ) {
563 # The child is a hypothetical node not definitely in
564 # any group. Answer is unknown.
565 $unknownfollow{$wchild} = 1;
566 } # else it's a non-root node in a known group, and therefore
567 # is presumed to have its reading from its group, not this link.
568 }
569 }
570 $rdghash->{'not_followed'} = keys %nofollow;
571 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 572
573 # Now say whether this reading represents a conflict.
574 unless( $variant_row->{'genealogical'} ) {
575 $rdghash->{'conflict'} = @roots != 1;
576 }
c4a4fb1b 577 }
d71100ed 578}
579
fae07016 580
581=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
582
583** NOTE ** This method should hopefully not be called - it is not guaranteed
584to be correct. Serves as a backup for the real solver.
585
586Runs an analysis of the given tradition, at the location given in $rank,
587against the graph of the stemma specified in $stemma_id. The argument
588@merge_relationship_types is an optional list of relationship types for
589which readings so related should be treated as equivalent.
590
591Returns a nested array data structure as follows:
592
593 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
594
595where the group list is the array of arrays passed in for each element of @groups,
596possibly with the addition of hypothetical readings.
597
598
599=cut
600
601sub perl_solver {
e59b8faa 602 my( $stemma, @groups ) = @_;
603 my $graph = $stemma->graph;
604 my @answer;
605 foreach my $g ( @groups ) {
606 push( @answer, _solve_variant_location( $graph, $g ) );
607 }
608 return \@answer;
fae07016 609}
610
e59b8faa 611sub _solve_variant_location {
612 my( $graph, $groups ) = @_;
fae07016 613 # Now do the work.
e59b8faa 614 my $contig = {};
615 my $subgraph = {};
616 my $is_conflicted;
617 my $conflict = {};
618
619 # Mark each ms as in its own group, first.
620 foreach my $g ( @$groups ) {
621 my $gst = wit_stringify( $g );
622 map { $contig->{$_} = $gst } @$g;
623 }
624
625 # Now for each unmarked node in the graph, initialize an array
626 # for possible group memberships. We will use this later to
627 # resolve potential conflicts.
628 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
629 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
630 my $gst = wit_stringify( $g ); # This is the group name
631 # Copy the graph, and delete all non-members from the new graph.
632 my $part = $graph->copy;
633 my @group_roots;
634 $part->delete_vertices(
635 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
636
637 # Now look to see if our group is connected.
638 if( @$g > 1 ) {
639 # We have to take directionality into account.
640 # How many root nodes do we have?
641 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
642 $part->predecessorless_vertices;
643 # Assuming that @$g > 1, find the first root node that has at
644 # least one successor belonging to our group. If this reading
645 # is genealogical, there should be only one, but we will check
646 # that implicitly later.
647 foreach my $root ( @roots ) {
648 # Prune the tree to get rid of extraneous hypotheticals.
649 $root = _prune_subtree( $part, $root, $contig );
650 next unless $root;
651 # Save this root for our group.
652 push( @group_roots, $root );
653 # Get all the successor nodes of our root.
654 }
655 } else {
656 # Dispense with the trivial case of one reading.
657 my $wit = $g->[0];
658 @group_roots = ( $wit );
659 foreach my $v ( $part->vertices ) {
660 $part->delete_vertex( $v ) unless $v eq $wit;
661 }
662 }
663
664 if( @group_roots > 1 ) {
665 $conflict->{$gst} = 1;
666 $is_conflicted = 1;
667 }
668 # Paint the 'hypotheticals' with our group.
669 foreach my $wit ( $part->vertices ) {
670 if( ref( $contig->{$wit} ) ) {
671 push( @{$contig->{$wit}}, $gst );
672 } elsif( $contig->{$wit} ne $gst ) {
673 warn "How did we get here?";
674 }
675 }
676
677
678 # Save the relevant subgraph.
679 $subgraph->{$gst} = $part;
680 }
681
682 # For each of our hypothetical readings, flatten its 'contig' array if
683 # the array contains zero or one group. If we have any unflattened arrays,
684 # we may need to run the resolution process. If the reading is already known
685 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
686 # it.
687 my @resolve;
688 foreach my $wit ( keys %$contig ) {
689 next unless ref( $contig->{$wit} );
690 if( @{$contig->{$wit}} > 1 ) {
691 if( $is_conflicted ) {
692 $contig->{$wit} = ''; # We aren't going to decide.
693 } else {
694 push( @resolve, $wit );
695 }
696 } else {
697 my $gst = pop @{$contig->{$wit}};
698 $contig->{$wit} = $gst || '';
699 }
700 }
701
702 if( @resolve ) {
703 my $still_contig = {};
704 foreach my $h ( @resolve ) {
705 # For each of the hypothetical readings with more than one possibility,
706 # try deleting it from each of its member subgraphs in turn, and see
707 # if that breaks the contiguous grouping.
708 # TODO This can still break in a corner case where group A can use
709 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
710 # Revisit this if necessary; it could get brute-force nasty.
711 foreach my $gst ( @{$contig->{$h}} ) {
712 my $gpart = $subgraph->{$gst}->copy();
713 # If we have come this far, there is only one root and everything
714 # is reachable from it.
715 my( $root ) = $gpart->predecessorless_vertices;
716 my $reachable = {};
717 map { $reachable->{$_} = 1 } $gpart->vertices;
718
719 # Try deleting the hypothetical node.
720 $gpart->delete_vertex( $h );
721 if( $h eq $root ) {
722 # See if we still have a single root.
723 my @roots = $gpart->predecessorless_vertices;
724 warn "This shouldn't have happened" unless @roots;
725 if( @roots > 1 ) {
726 # $h is needed by this group.
727 if( exists( $still_contig->{$h} ) ) {
728 # Conflict!
729 $conflict->{$gst} = 1;
730 $still_contig->{$h} = '';
731 } else {
732 $still_contig->{$h} = $gst;
733 }
734 }
735 } else {
736 # $h is somewhere in the middle. See if everything
737 # else can still be reached from the root.
738 my %still_reachable = ( $root => 1 );
739 map { $still_reachable{$_} = 1 }
740 $gpart->all_successors( $root );
741 foreach my $v ( keys %$reachable ) {
742 next if $v eq $h;
743 if( !$still_reachable{$v}
744 && ( $contig->{$v} eq $gst
745 || ( exists $still_contig->{$v}
746 && $still_contig->{$v} eq $gst ) ) ) {
747 # We need $h.
748 if( exists $still_contig->{$h} ) {
749 # Conflict!
750 $conflict->{$gst} = 1;
751 $still_contig->{$h} = '';
752 } else {
753 $still_contig->{$h} = $gst;
754 }
755 last;
756 } # else we don't need $h in this group.
757 } # end foreach $v
758 } # endif $h eq $root
759 } # end foreach $gst
760 } # end foreach $h
761
762 # Now we have some hypothetical vertices in $still_contig that are the
763 # "real" group memberships. Replace these in $contig.
764 foreach my $v ( keys %$contig ) {
765 next unless ref $contig->{$v};
766 $contig->{$v} = $still_contig->{$v};
767 }
768 } # end if @resolve
769
770 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
771 my $variant_row = [ [], $is_genealogical ];
772 # Fill in the groupings from $contig.
773 foreach my $g ( @$groups ) {
774 my $gst = wit_stringify( $g );
775 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
776 push( @{$variant_row->[0]}, \@realgroup );
777 }
778 return $variant_row;
779}
fae07016 780
6d25a3a0 781sub _prune_group {
782 my( $group, $stemma ) = @_;
783 # Get these into a form prune_subtree will recognize. Make a "contighash"
784 my $hypohash = {};
785 map { $hypohash->{$_} = 1 } @$group;
786 # ...with reference values for hypotheticals.
787 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
788 # Make our subgraph
789 my $subgraph = $stemma->graph->copy;
790 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
791 $subgraph->vertices;
792 # ...and find the root.
793 my( $root ) = $subgraph->predecessorless_vertices;
794 # Now prune and return the remaining vertices.
795 _prune_subtree( $subgraph, $root, $hypohash );
796 return $subgraph->vertices;
797}
798
7f52eac8 799sub _prune_subtree {
231d71fc 800 my( $tree, $root, $contighash ) = @_;
801 # First, delete hypothetical leaves / orphans until there are none left.
802 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
803 $tree->successorless_vertices;
804 while( @orphan_hypotheticals ) {
805 $tree->delete_vertices( @orphan_hypotheticals );
806 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
807 $tree->successorless_vertices;
808 }
809 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 810 # root to the first child that has no other predecessors.
231d71fc 811 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
812 my @nextroot = $tree->successors( $root );
813 $tree->delete_vertex( $root );
bebec0e9 814 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 815 }
816 # The tree has been modified in place, but we need to know the new root.
bebec0e9 817 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 818 return $root;
819}
d71100ed 820# Add the variant, subject to a.c. representation logic.
821# This assumes that we will see the 'main' version before the a.c. version.
822sub add_variant_wit {
823 my( $arr, $wit, $acstr ) = @_;
824 my $skip;
825 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
826 my $real = $1;
827 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
828 }
829 push( @$arr, $wit ) unless $skip;
830}
831
5be0cdeb 832sub _useful_variant {
833 my( $group_readings, $graph, $acstr ) = @_;
834
835 # TODO Decide what to do with AC witnesses
836
837 # Sort by group size and return
838 my $is_useful = 0;
839 my( @readings, @groups ); # The sorted groups for our answer.
840 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
841 keys %$group_readings ) {
842 push( @readings, $rdg );
843 push( @groups, $group_readings->{$rdg} );
844 if( @{$group_readings->{$rdg}} > 1 ) {
845 $is_useful++;
846 } else {
847 my( $wit ) = @{$group_readings->{$rdg}};
848 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
849 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
850 }
851 }
852 if( $is_useful > 1 ) {
853 return( \@readings, \@groups );
854 } else {
855 return( [], [] );
856 }
857}
858
7f52eac8 859=head2 wit_stringify( $groups )
860
861Takes an array of witness groupings and produces a string like
862['A','B'] / ['C','D','E'] / ['F']
d71100ed 863
7f52eac8 864=cut
d71100ed 865
866sub wit_stringify {
867 my $groups = shift;
868 my @gst;
869 # If we were passed an array of witnesses instead of an array of
870 # groupings, then "group" the witnesses first.
871 unless( ref( $groups->[0] ) ) {
872 my $mkgrp = [ $groups ];
873 $groups = $mkgrp;
874 }
875 foreach my $g ( @$groups ) {
876 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
877 }
878 return join( ' / ', @gst );
879}
7f52eac8 880
bebec0e9 881sub _symmdiff {
882 my( $lista, $listb ) = @_;
7f52eac8 883 my %union;
884 my %scalars;
885 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
886 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 887 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 888 return map { $scalars{$_} } @set;
889}
890
8911;
892
893=head1 LICENSE
894
895This package is free software and is provided "as is" without express
896or implied warranty. You can redistribute it and/or modify it under
897the same terms as Perl itself.
898
899=head1 AUTHOR
900
901Tara L Andrews E<lt>aurum@cpan.orgE<gt>