do proper quoting of unusual entity names in the dot
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
ffa22d6f 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
7234b01d 107my $c = $tradition->collation;
f00cefe8 108foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
112 }
6d25a3a0 113 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
f00cefe8 114 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 115 # Check that we have the right row with the right groups
116 my $rank = $row->{'id'};
117 foreach my $rdghash ( @{$row->{'readings'}} ) {
118 # Skip 'readings' that aren't really
119 next unless $c->reading( $rdghash->{'readingid'} );
120 # Check the rank
121 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
122 "Got correct reading rank" );
123 # Check the witnesses
124 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
125 my @sgrp = sort @{$rdghash->{'group'}};
126 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
127 }
f00cefe8 128}
a44aaf2a 129is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 130# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 131
132=end testing
133
134=cut
135
d71100ed 136sub run_analysis {
88a6bac5 137 my( $tradition, %opts ) = @_;
f00cefe8 138 my $c = $tradition->collation;
88a6bac5 139
140 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 141 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
142 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 143
144 # Get the stemma
145 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 146
88a6bac5 147 # Figure out which witnesses we are working with
148 my @lacunose = $stemma->hypotheticals;
fae07016 149 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
b4cb2d60 150 map { push( @tradition_wits, $_->sigil.$c->ac_label ) if $_->is_layered }
fae07016 151 $tradition->witnesses;
152 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 153
154 # Find and mark 'common' ranks for exclusion, unless they were
155 # explicitly specified.
156 unless( @ranks ) {
157 my %common_rank;
a44aaf2a 158 foreach my $rdg ( $c->common_readings ) {
88a6bac5 159 $common_rank{$rdg->rank} = 1;
160 }
161 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 162 }
7f52eac8 163
88a6bac5 164 # Group the variants to send to the solver
165 my @groups;
f629cb3b 166 my @use_ranks;
a44aaf2a 167 my %lacunae;
88a6bac5 168 foreach my $rank ( @ranks ) {
a44aaf2a 169 my $missing = [ @lacunose ];
ffa22d6f 170 my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse );
171 if( $opts{'exclude_type1'} ) {
172 # Check to see whether this is a "useful" group.
173 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
174 $stemma->graph, $c->ac_label );
175 next unless @$rdgs;
176 }
f629cb3b 177 push( @use_ranks, $rank );
ffa22d6f 178 push( @groups, $rankgroup );
a44aaf2a 179 $lacunae{$rank} = $missing;
d71100ed 180 }
88a6bac5 181 # Parse the answer
e59b8faa 182 my $answer = solve_variants( $stemma, @groups );
fae07016 183
88a6bac5 184 # Do further analysis on the answer
a44aaf2a 185 my $conflict_count = 0;
7234b01d 186 my $aclabel = $c->ac_label;
f629cb3b 187 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 188 my $location = $answer->{'variants'}->[$idx];
189 # Add the rank back in
f629cb3b 190 $location->{'id'} = $use_ranks[$idx];
7234b01d 191 # Note what our lacunae are
f629cb3b 192 my %lmiss;
7234b01d 193 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
194 # Run through the reading groups and add as 'lacunae' any redundant
195 # a.c. witnesses (yes, we have to do this before the analysis, thus
196 # identical loops before and after. Boo.)
197 # TODO Consider making these callbacks to analyze_location
198 foreach my $rdghash ( @{$location->{'readings'}} ) {
199 my %rwits;
200 map { $rwits{$_} = 1 } @{$rdghash->{'group'}};
201 foreach my $rw ( keys %rwits ) {
202 if( $rw =~ /^(.*)\Q$aclabel\E$/ ) {
203 if( exists $rwits{$1} ) {
204 $lmiss{$rw} = 1;
205 delete $rwits{$rw};
206 }
207 }
208 }
209 $rdghash->{'group'} = [ keys %rwits ];
210 }
211 $location->{'missing'} = [ keys %lmiss ];
212
88a6bac5 213 # Run the extra analysis we need.
88a6bac5 214 analyze_location( $tradition, $stemma->graph, $location );
7234b01d 215
216 # Do the final post-analysis tidying up of the data.
a44aaf2a 217 foreach my $rdghash ( @{$location->{'readings'}} ) {
218 $conflict_count++
219 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
f629cb3b 220 # Add the reading text back in
a44aaf2a 221 my $rdg = $c->reading( $rdghash->{'readingid'} );
222 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
f629cb3b 223 # Remove lacunose witnesses from this reading's list now that the
7234b01d 224 # analysis is done
f629cb3b 225 my @realgroup;
7234b01d 226 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 227 $rdghash->{'group'} = \@realgroup;
7234b01d 228 # TODO Record hypotheticals used to create group, if we end up
229 # needing it
a44aaf2a 230 }
88a6bac5 231 }
a44aaf2a 232 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 233
88a6bac5 234 return $answer;
d71100ed 235}
236
7f52eac8 237=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
238
239Groups the variants at the given $rank of the collation, treating any
240relationships in @merge_relationship_types as equivalent. $lacunose should
241be a reference to an array, to which the sigla of lacunose witnesses at this
242rank will be appended.
243
ffa22d6f 244Returns a hash $group_readings where $rdg is attested by the witnesses listed
245in $group_readings->{$rdg}.
7f52eac8 246
247=cut
248
249# Return group_readings, groups, lacunose
d1348d38 250sub group_variants {
7f52eac8 251 my( $tradition, $rank, $lacunose, $collapse ) = @_;
252 my $c = $tradition->collation;
b4cb2d60 253 my $aclabel = $c->ac_label;
7f52eac8 254 # Get the alignment table readings
255 my %readings_at_rank;
ffa22d6f 256 my %is_lacunose; # lookup table for $lacunose
257 map { $is_lacunose{$_} = 1 } @$lacunose;
7f52eac8 258 my @gap_wits;
1d73ecad 259 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 260 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 261 my $wit = $tablewit->{'witness'};
ffa22d6f 262 # Exclude the witness if it is "lacunose" which if we got here
263 # means "not in the stemma".
264 next if $is_lacunose{$wit};
7f52eac8 265 if( $rdg && $rdg->{'t'}->is_lacuna ) {
b4cb2d60 266 _add_to_witlist( $wit, $lacunose, $aclabel );
7f52eac8 267 } elsif( $rdg ) {
268 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
269 } else {
b4cb2d60 270 _add_to_witlist( $wit, \@gap_wits, $aclabel );
7f52eac8 271 }
272 }
d1348d38 273
7f52eac8 274 # Group the readings, collapsing groups by relationship if needed
275 my %grouped_readings;
b4cb2d60 276 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses }
277 values %readings_at_rank ) {
7f52eac8 278 # Skip readings that have been collapsed into others.
f00cefe8 279 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
7f52eac8 280 my @wits = $rdg->witnesses;
281 if( $collapse ) {
282 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
283 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 284 my @otherwits = $other->witnesses;
fae07016 285 push( @wits, @otherwits );
f00cefe8 286 $grouped_readings{$other->id} = 0;
d1348d38 287 }
288 }
ffa22d6f 289 my @use_wits = grep { !$is_lacunose{$_} } @wits;
290 $grouped_readings{$rdg->id} = \@use_wits;
7f52eac8 291 }
292 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
293 # Get rid of our collapsed readings
294 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
295 keys %grouped_readings
296 if $collapse;
297
5be0cdeb 298 return \%grouped_readings;
d1348d38 299}
300
88a6bac5 301=head2 solve_variants( $graph, @groups )
302
303Sends the set of groups to the external graph solver service and returns
304a cleaned-up answer, adding the rank IDs back where they belong.
305
306The JSON has the form
307 { "graph": [ stemmagraph DOT string without newlines ],
308 "groupings": [ array of arrays of groups, one per rank ] }
309
310The answer has the form
311 { "variants" => [ array of variant location structures ],
312 "variant_count" => total,
313 "conflict_count" => number of conflicts detected,
314 "genealogical_count" => number of solutions found }
315
316=cut
317
318sub solve_variants {
e59b8faa 319 my( $stemma, @groups ) = @_;
88a6bac5 320
321 # Make the json with stemma + groups
b4cb2d60 322 my $groupings = [];
88a6bac5 323 foreach my $ghash ( @groups ) {
324 my @grouping;
7234b01d 325 foreach my $k ( sort keys %$ghash ) {
88a6bac5 326 push( @grouping, $ghash->{$k} );
327 }
b4cb2d60 328 push( @$groupings, \@grouping );
88a6bac5 329 }
b4cb2d60 330 ## Witness map is a HACK to get around limitations in node names from IDP
331 my $witness_map = {};
332 my $json = encode_json( _safe_wit_strings( $stemma, $groupings, $witness_map ) );
88a6bac5 333
334 # Send it off and get the result
335 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
336 my $ua = LWP::UserAgent->new();
337 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
338 'Content' => $json );
339
340 my $answer;
6d25a3a0 341 my $used_idp;
88a6bac5 342 if( $resp->is_success ) {
b4cb2d60 343 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
6d25a3a0 344 $used_idp = 1;
88a6bac5 345 } else {
fae07016 346 # Fall back to the old method.
347 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
348 . "; falling back to perl method";
b4cb2d60 349 $answer = perl_solver( $stemma, @$groupings );
88a6bac5 350 }
351
352 # Fold the result back into what we know about the groups.
353 my $variants = [];
354 my $genealogical = 0;
355 foreach my $idx ( 0 .. $#groups ) {
356 my( $calc_groups, $result ) = @{$answer->[$idx]};
6d25a3a0 357 if( $result ) {
358 $genealogical++;
359 # Prune the calculated groups, in case the IDP solver failed to.
360 if( $used_idp ) {
361 my @pruned_groups;
362 foreach my $cg ( @$calc_groups ) {
363 my @pg = _prune_group( $cg, $stemma );
364 push( @pruned_groups, \@pg );
365 }
366 $calc_groups = \@pruned_groups;
367 }
368 }
88a6bac5 369 my $input_group = $groups[$idx];
7234b01d 370 foreach my $k ( sort keys %$input_group ) {
88a6bac5 371 my $cg = shift @$calc_groups;
372 $input_group->{$k} = $cg;
373 }
374 my $vstruct = {
375 'genealogical' => $result,
376 'readings' => [],
fae07016 377 };
f629cb3b 378 foreach my $k ( sort { @{$input_group->{$b}} <=> @{$input_group->{$a}} }
379 keys %$input_group ) {
88a6bac5 380 push( @{$vstruct->{'readings'}},
fae07016 381 { 'readingid' => $k, 'group' => $input_group->{$k}} );
88a6bac5 382 }
383 push( @$variants, $vstruct );
384 }
385
386 return { 'variants' => $variants,
387 'variant_count' => scalar @$variants,
388 'genealogical_count' => $genealogical };
389}
390
b4cb2d60 391#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
392
393sub _safe_wit_strings {
394 my( $stemma, $groupings, $witness_map ) = @_;
395 my $safegraph = Graph->new();
396 # Convert the graph to a safe representation and store the conversion.
397 foreach my $n ( $stemma->graph->vertices ) {
398 my $sn = _safe_witstr( $n );
399 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
400 if exists $witness_map->{$sn};
401 $witness_map->{$sn} = $n;
402 $safegraph->add_vertex( $sn );
403 $safegraph->set_vertex_attributes( $sn,
404 $stemma->graph->get_vertex_attributes( $n ) );
405 }
406 foreach my $e ( $stemma->graph->edges ) {
407 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
408 $safegraph->add_edge( @safe_e );
409 }
410 my $safe_stemma = Text::Tradition::Stemma->new(
411 'collation' => $stemma->collation, 'graph' => $safegraph );
412
413 # Now convert the witness groupings to a safe representation.
414 my $safe_groupings = [];
415 foreach my $grouping ( @$groupings ) {
416 my $safe_grouping = [];
417 foreach my $group ( @$grouping ) {
418 my $safe_group = [];
419 foreach my $n ( @$group ) {
420 my $sn = _safe_witstr( $n );
421 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
422 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
423 $witness_map->{$sn} = $n;
424 push( @$safe_group, $sn );
425 }
426 push( @$safe_grouping, $safe_group );
427 }
428 push( @$safe_groupings, $safe_grouping );
429 }
430
431 # Return it all in the struct we expect. We have stored the reductions
432 # in the $witness_map that we were passed.
433 return { 'graph' => $safe_stemma->editable( ' ' ), 'groupings' => $safe_groupings };
434}
435
436sub _safe_witstr {
437 my $witstr = shift;
438 $witstr =~ s/\s+/_/g;
439 $witstr =~ s/[^\w\d-]//g;
440 return $witstr;
441}
442
443sub _desanitize_names {
444 my( $jsonstruct, $witness_map ) = @_;
445 my $result = [];
446 foreach my $grouping ( @$jsonstruct ) {
447 my $real_grouping = [];
448 foreach my $element ( @$grouping ) {
449 if( ref( $element ) eq 'ARRAY' ) {
450 # it's the groupset.
451 my $real_groupset = [];
452 foreach my $group ( @$element ) {
453 my $real_group = [];
454 foreach my $n ( @$group ) {
455 my $rn = $witness_map->{$n};
456 push( @$real_group, $rn );
457 }
458 push( @$real_groupset, $real_group );
459 }
460 push( @$real_grouping, $real_groupset );
461 } else {
462 # It is the boolean, not actually a group.
463 push( @$real_grouping, $element );
464 }
465 }
466 push( @$result, $real_grouping );
467 }
468 return $result;
469}
470
471### END HACKERY ###
472
fae07016 473=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 474
fae07016 475Given the tradition, its stemma graph, and the solution from the graph solver,
476work out the rest of the information we want. For each reading we need missing,
477conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 478
479=cut
732152b1 480
fae07016 481sub analyze_location {
482 my ( $tradition, $graph, $variant_row ) = @_;
483
484 # Make a hash of all known node memberships, and make the subgraphs.
485 my $contig = {};
486 my $reading_roots = {};
487 my $subgraph = {};
488 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
489 my $rid = $rdghash->{'readingid'};
490 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
c4a4fb1b 491
fae07016 492 # Make the subgraph.
493 my $part = $graph->copy;
494 my %these_vertices;
495 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
496 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
497 $subgraph->{$rid} = $part;
498 # Get the reading roots.
499 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 500 }
501
fae07016 502 # Now that we have all the node group memberships, calculate followed/
bebec0e9 503 # non-followed/unknown values for each reading. Also figure out the
504 # reading's evident parent(s).
505 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 506 # Group string key - TODO do we need this?
507 my $gst = wit_stringify( $rdghash->{'group'} );
508 my $rid = $rdghash->{'readingid'};
509 # Get the subgraph
510 my $part = $subgraph->{$rid};
511
512 # Start figuring things out.
bebec0e9 513 my @roots = $part->predecessorless_vertices;
514 $rdghash->{'independent_occurrence'} = scalar @roots;
515 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
516 # Find the parent readings, if any, of this reading.
f00cefe8 517 my %rdgparents;
bebec0e9 518 foreach my $wit ( @roots ) {
f00cefe8 519 # Look in the main stemma to find this witness's extant or known-reading
520 # immediate ancestor(s), and look up the reading that each ancestor olds.
521 my @check = $graph->predecessors( $wit );
522 while( @check ) {
523 my @next;
524 foreach my $wparent( @check ) {
fae07016 525 my $preading = $contig->{$wparent};
526 if( $preading ) {
527 $rdgparents{$preading} = 1;
f00cefe8 528 } else {
529 push( @next, $graph->predecessors( $wparent ) );
530 }
531 }
532 @check = @next;
533 }
bebec0e9 534 }
f00cefe8 535 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
bebec0e9 536
537 # Find the number of times this reading was altered, and the number of
538 # times we're not sure.
539 my( %nofollow, %unknownfollow );
540 foreach my $wit ( $part->vertices ) {
541 foreach my $wchild ( $graph->successors( $wit ) ) {
542 next if $part->has_vertex( $wchild );
fae07016 543 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 544 # It definitely changed here.
545 $nofollow{$wchild} = 1;
546 } elsif( !($contig->{$wchild}) ) {
547 # The child is a hypothetical node not definitely in
548 # any group. Answer is unknown.
549 $unknownfollow{$wchild} = 1;
550 } # else it's a non-root node in a known group, and therefore
551 # is presumed to have its reading from its group, not this link.
552 }
553 }
554 $rdghash->{'not_followed'} = keys %nofollow;
555 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 556
557 # Now say whether this reading represents a conflict.
558 unless( $variant_row->{'genealogical'} ) {
559 $rdghash->{'conflict'} = @roots != 1;
560 }
c4a4fb1b 561 }
d71100ed 562}
563
fae07016 564
565=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
566
567** NOTE ** This method should hopefully not be called - it is not guaranteed
568to be correct. Serves as a backup for the real solver.
569
570Runs an analysis of the given tradition, at the location given in $rank,
571against the graph of the stemma specified in $stemma_id. The argument
572@merge_relationship_types is an optional list of relationship types for
573which readings so related should be treated as equivalent.
574
575Returns a nested array data structure as follows:
576
577 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
578
579where the group list is the array of arrays passed in for each element of @groups,
580possibly with the addition of hypothetical readings.
581
582
583=cut
584
585sub perl_solver {
e59b8faa 586 my( $stemma, @groups ) = @_;
587 my $graph = $stemma->graph;
588 my @answer;
589 foreach my $g ( @groups ) {
590 push( @answer, _solve_variant_location( $graph, $g ) );
591 }
592 return \@answer;
fae07016 593}
594
e59b8faa 595sub _solve_variant_location {
596 my( $graph, $groups ) = @_;
fae07016 597 # Now do the work.
e59b8faa 598 my $contig = {};
599 my $subgraph = {};
600 my $is_conflicted;
601 my $conflict = {};
602
603 # Mark each ms as in its own group, first.
604 foreach my $g ( @$groups ) {
605 my $gst = wit_stringify( $g );
606 map { $contig->{$_} = $gst } @$g;
607 }
608
609 # Now for each unmarked node in the graph, initialize an array
610 # for possible group memberships. We will use this later to
611 # resolve potential conflicts.
612 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
613 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
614 my $gst = wit_stringify( $g ); # This is the group name
615 # Copy the graph, and delete all non-members from the new graph.
616 my $part = $graph->copy;
617 my @group_roots;
618 $part->delete_vertices(
619 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
620
621 # Now look to see if our group is connected.
622 if( @$g > 1 ) {
623 # We have to take directionality into account.
624 # How many root nodes do we have?
625 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
626 $part->predecessorless_vertices;
627 # Assuming that @$g > 1, find the first root node that has at
628 # least one successor belonging to our group. If this reading
629 # is genealogical, there should be only one, but we will check
630 # that implicitly later.
631 foreach my $root ( @roots ) {
632 # Prune the tree to get rid of extraneous hypotheticals.
633 $root = _prune_subtree( $part, $root, $contig );
634 next unless $root;
635 # Save this root for our group.
636 push( @group_roots, $root );
637 # Get all the successor nodes of our root.
638 }
639 } else {
640 # Dispense with the trivial case of one reading.
641 my $wit = $g->[0];
642 @group_roots = ( $wit );
643 foreach my $v ( $part->vertices ) {
644 $part->delete_vertex( $v ) unless $v eq $wit;
645 }
646 }
647
648 if( @group_roots > 1 ) {
649 $conflict->{$gst} = 1;
650 $is_conflicted = 1;
651 }
652 # Paint the 'hypotheticals' with our group.
653 foreach my $wit ( $part->vertices ) {
654 if( ref( $contig->{$wit} ) ) {
655 push( @{$contig->{$wit}}, $gst );
656 } elsif( $contig->{$wit} ne $gst ) {
657 warn "How did we get here?";
658 }
659 }
660
661
662 # Save the relevant subgraph.
663 $subgraph->{$gst} = $part;
664 }
665
666 # For each of our hypothetical readings, flatten its 'contig' array if
667 # the array contains zero or one group. If we have any unflattened arrays,
668 # we may need to run the resolution process. If the reading is already known
669 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
670 # it.
671 my @resolve;
672 foreach my $wit ( keys %$contig ) {
673 next unless ref( $contig->{$wit} );
674 if( @{$contig->{$wit}} > 1 ) {
675 if( $is_conflicted ) {
676 $contig->{$wit} = ''; # We aren't going to decide.
677 } else {
678 push( @resolve, $wit );
679 }
680 } else {
681 my $gst = pop @{$contig->{$wit}};
682 $contig->{$wit} = $gst || '';
683 }
684 }
685
686 if( @resolve ) {
687 my $still_contig = {};
688 foreach my $h ( @resolve ) {
689 # For each of the hypothetical readings with more than one possibility,
690 # try deleting it from each of its member subgraphs in turn, and see
691 # if that breaks the contiguous grouping.
692 # TODO This can still break in a corner case where group A can use
693 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
694 # Revisit this if necessary; it could get brute-force nasty.
695 foreach my $gst ( @{$contig->{$h}} ) {
696 my $gpart = $subgraph->{$gst}->copy();
697 # If we have come this far, there is only one root and everything
698 # is reachable from it.
699 my( $root ) = $gpart->predecessorless_vertices;
700 my $reachable = {};
701 map { $reachable->{$_} = 1 } $gpart->vertices;
702
703 # Try deleting the hypothetical node.
704 $gpart->delete_vertex( $h );
705 if( $h eq $root ) {
706 # See if we still have a single root.
707 my @roots = $gpart->predecessorless_vertices;
708 warn "This shouldn't have happened" unless @roots;
709 if( @roots > 1 ) {
710 # $h is needed by this group.
711 if( exists( $still_contig->{$h} ) ) {
712 # Conflict!
713 $conflict->{$gst} = 1;
714 $still_contig->{$h} = '';
715 } else {
716 $still_contig->{$h} = $gst;
717 }
718 }
719 } else {
720 # $h is somewhere in the middle. See if everything
721 # else can still be reached from the root.
722 my %still_reachable = ( $root => 1 );
723 map { $still_reachable{$_} = 1 }
724 $gpart->all_successors( $root );
725 foreach my $v ( keys %$reachable ) {
726 next if $v eq $h;
727 if( !$still_reachable{$v}
728 && ( $contig->{$v} eq $gst
729 || ( exists $still_contig->{$v}
730 && $still_contig->{$v} eq $gst ) ) ) {
731 # We need $h.
732 if( exists $still_contig->{$h} ) {
733 # Conflict!
734 $conflict->{$gst} = 1;
735 $still_contig->{$h} = '';
736 } else {
737 $still_contig->{$h} = $gst;
738 }
739 last;
740 } # else we don't need $h in this group.
741 } # end foreach $v
742 } # endif $h eq $root
743 } # end foreach $gst
744 } # end foreach $h
745
746 # Now we have some hypothetical vertices in $still_contig that are the
747 # "real" group memberships. Replace these in $contig.
748 foreach my $v ( keys %$contig ) {
749 next unless ref $contig->{$v};
750 $contig->{$v} = $still_contig->{$v};
751 }
752 } # end if @resolve
753
754 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
755 my $variant_row = [ [], $is_genealogical ];
756 # Fill in the groupings from $contig.
757 foreach my $g ( @$groups ) {
758 my $gst = wit_stringify( $g );
759 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
760 push( @{$variant_row->[0]}, \@realgroup );
761 }
762 return $variant_row;
763}
fae07016 764
6d25a3a0 765sub _prune_group {
766 my( $group, $stemma ) = @_;
767 # Get these into a form prune_subtree will recognize. Make a "contighash"
768 my $hypohash = {};
769 map { $hypohash->{$_} = 1 } @$group;
770 # ...with reference values for hypotheticals.
771 map { $hypohash->{$_} = [] } $stemma->hypotheticals;
772 # Make our subgraph
773 my $subgraph = $stemma->graph->copy;
774 map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} }
775 $subgraph->vertices;
776 # ...and find the root.
777 my( $root ) = $subgraph->predecessorless_vertices;
778 # Now prune and return the remaining vertices.
779 _prune_subtree( $subgraph, $root, $hypohash );
780 return $subgraph->vertices;
781}
782
7f52eac8 783sub _prune_subtree {
231d71fc 784 my( $tree, $root, $contighash ) = @_;
785 # First, delete hypothetical leaves / orphans until there are none left.
786 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
787 $tree->successorless_vertices;
788 while( @orphan_hypotheticals ) {
789 $tree->delete_vertices( @orphan_hypotheticals );
790 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
791 $tree->successorless_vertices;
792 }
793 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 794 # root to the first child that has no other predecessors.
231d71fc 795 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
796 my @nextroot = $tree->successors( $root );
797 $tree->delete_vertex( $root );
bebec0e9 798 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 799 }
800 # The tree has been modified in place, but we need to know the new root.
bebec0e9 801 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 802 return $root;
803}
d71100ed 804# Add the variant, subject to a.c. representation logic.
805# This assumes that we will see the 'main' version before the a.c. version.
806sub add_variant_wit {
807 my( $arr, $wit, $acstr ) = @_;
808 my $skip;
809 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
810 my $real = $1;
811 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
812 }
813 push( @$arr, $wit ) unless $skip;
814}
815
5be0cdeb 816sub _useful_variant {
817 my( $group_readings, $graph, $acstr ) = @_;
818
819 # TODO Decide what to do with AC witnesses
820
821 # Sort by group size and return
822 my $is_useful = 0;
823 my( @readings, @groups ); # The sorted groups for our answer.
824 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
825 keys %$group_readings ) {
826 push( @readings, $rdg );
827 push( @groups, $group_readings->{$rdg} );
828 if( @{$group_readings->{$rdg}} > 1 ) {
829 $is_useful++;
830 } else {
831 my( $wit ) = @{$group_readings->{$rdg}};
832 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
833 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
834 }
835 }
836 if( $is_useful > 1 ) {
837 return( \@readings, \@groups );
838 } else {
839 return( [], [] );
840 }
841}
842
7f52eac8 843=head2 wit_stringify( $groups )
844
845Takes an array of witness groupings and produces a string like
846['A','B'] / ['C','D','E'] / ['F']
d71100ed 847
7f52eac8 848=cut
d71100ed 849
850sub wit_stringify {
851 my $groups = shift;
852 my @gst;
853 # If we were passed an array of witnesses instead of an array of
854 # groupings, then "group" the witnesses first.
855 unless( ref( $groups->[0] ) ) {
856 my $mkgrp = [ $groups ];
857 $groups = $mkgrp;
858 }
859 foreach my $g ( @$groups ) {
860 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
861 }
862 return join( ' / ', @gst );
863}
7f52eac8 864
5be0cdeb 865# Helper function to ensure that X and X a.c. never appear in the same list.
866sub _add_to_witlist {
867 my( $wit, $list, $acstr ) = @_;
868 my %inlist;
869 my $idx = 0;
870 map { $inlist{$_} = $idx++ } @$list;
871 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
872 my $acwit = $1;
873 unless( exists $inlist{$acwit} ) {
874 push( @$list, $acwit.$acstr );
875 }
876 } else {
877 if( exists( $inlist{$wit.$acstr} ) ) {
878 # Replace the a.c. version with the main witness
879 my $i = $inlist{$wit.$acstr};
880 $list->[$i] = $wit;
881 } else {
882 push( @$list, $wit );
883 }
884 }
885}
886
bebec0e9 887sub _symmdiff {
888 my( $lista, $listb ) = @_;
7f52eac8 889 my %union;
890 my %scalars;
891 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
892 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 893 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 894 return map { $scalars{$_} } @set;
895}
896
8971;
898
899=head1 LICENSE
900
901This package is free software and is provided "as is" without express
902or implied warranty. You can redistribute it and/or modify it under
903the same terms as Perl itself.
904
905=head1 AUTHOR
906
907Tara L Andrews E<lt>aurum@cpan.orgE<gt>