allow size restriction of stemma graphs
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
ffa22d6f 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
7234b01d 107my $c = $tradition->collation;
f00cefe8 108foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
112 }
18f48b82 113 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
114 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 115 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 116 # Check that we have the right row with the right groups
117 my $rank = $row->{'id'};
118 foreach my $rdghash ( @{$row->{'readings'}} ) {
119 # Skip 'readings' that aren't really
120 next unless $c->reading( $rdghash->{'readingid'} );
121 # Check the rank
122 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
123 "Got correct reading rank" );
124 # Check the witnesses
125 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
126 my @sgrp = sort @{$rdghash->{'group'}};
127 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
128 }
f00cefe8 129}
a44aaf2a 130is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 131# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 132
133=end testing
134
135=cut
136
d71100ed 137sub run_analysis {
88a6bac5 138 my( $tradition, %opts ) = @_;
f00cefe8 139 my $c = $tradition->collation;
88a6bac5 140
141 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 142 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
143 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 144
145 # Get the stemma
146 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 147
4ce27d42 148 # Figure out which witnesses we are working with - that is, the ones that
149 # appear both in the stemma and in the tradition. All others are 'lacunose'
150 # for our purposes.
88a6bac5 151 my @lacunose = $stemma->hypotheticals;
fae07016 152 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
fae07016 153 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 154
155 # Find and mark 'common' ranks for exclusion, unless they were
156 # explicitly specified.
157 unless( @ranks ) {
158 my %common_rank;
a44aaf2a 159 foreach my $rdg ( $c->common_readings ) {
88a6bac5 160 $common_rank{$rdg->rank} = 1;
161 }
162 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 163 }
7f52eac8 164
88a6bac5 165 # Group the variants to send to the solver
166 my @groups;
f629cb3b 167 my @use_ranks;
a44aaf2a 168 my %lacunae;
94654e27 169 my $moved = {};
88a6bac5 170 foreach my $rank ( @ranks ) {
a44aaf2a 171 my $missing = [ @lacunose ];
94654e27 172 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
173 # Filter out any empty rankgroups
174 # (e.g. from the later rank for a transposition)
175 next unless keys %$rankgroup;
ffa22d6f 176 if( $opts{'exclude_type1'} ) {
177 # Check to see whether this is a "useful" group.
178 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
179 $stemma->graph, $c->ac_label );
180 next unless @$rdgs;
181 }
f629cb3b 182 push( @use_ranks, $rank );
ffa22d6f 183 push( @groups, $rankgroup );
a44aaf2a 184 $lacunae{$rank} = $missing;
d71100ed 185 }
4ce27d42 186 # Run the solver
e59b8faa 187 my $answer = solve_variants( $stemma, @groups );
fae07016 188
88a6bac5 189 # Do further analysis on the answer
a44aaf2a 190 my $conflict_count = 0;
7234b01d 191 my $aclabel = $c->ac_label;
f629cb3b 192 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 193 my $location = $answer->{'variants'}->[$idx];
194 # Add the rank back in
94654e27 195 my $rank = $use_ranks[$idx];
196 $location->{'id'} = $rank;
7234b01d 197 # Note what our lacunae are
f629cb3b 198 my %lmiss;
7234b01d 199 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 200 $location->{'missing'} = [ keys %lmiss ];
201
88a6bac5 202 # Run the extra analysis we need.
94654e27 203 analyze_location( $tradition, $stemma->graph, $location, \%lmiss );
7234b01d 204
205 # Do the final post-analysis tidying up of the data.
a44aaf2a 206 foreach my $rdghash ( @{$location->{'readings'}} ) {
207 $conflict_count++
208 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
94654e27 209 # Add the reading text back in, setting display value as needed
a44aaf2a 210 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 211 if( $rdg ) {
212 $rdghash->{'text'} = $rdg->text .
213 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
214 }
f629cb3b 215 # Remove lacunose witnesses from this reading's list now that the
7234b01d 216 # analysis is done
f629cb3b 217 my @realgroup;
7234b01d 218 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 219 $rdghash->{'group'} = \@realgroup;
7234b01d 220 # TODO Record hypotheticals used to create group, if we end up
221 # needing it
a44aaf2a 222 }
88a6bac5 223 }
a44aaf2a 224 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 225
88a6bac5 226 return $answer;
d71100ed 227}
228
7f52eac8 229=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
230
231Groups the variants at the given $rank of the collation, treating any
232relationships in @merge_relationship_types as equivalent. $lacunose should
233be a reference to an array, to which the sigla of lacunose witnesses at this
94654e27 234rank will be appended; $transposed should be a reference to a hash, wherein
235the identities of transposed readings and their relatives will be stored.
7f52eac8 236
ffa22d6f 237Returns a hash $group_readings where $rdg is attested by the witnesses listed
238in $group_readings->{$rdg}.
7f52eac8 239
240=cut
241
242# Return group_readings, groups, lacunose
d1348d38 243sub group_variants {
94654e27 244 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 245 my $c = $tradition->collation;
335a62ef 246 my $aclabel = $c->ac_label;
7f52eac8 247 # Get the alignment table readings
248 my %readings_at_rank;
ffa22d6f 249 my %is_lacunose; # lookup table for $lacunose
250 map { $is_lacunose{$_} = 1 } @$lacunose;
94654e27 251 my @check_for_gaps;
252 my %moved_wits;
1d73ecad 253 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 254 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 255 my $wit = $tablewit->{'witness'};
ffa22d6f 256 # Exclude the witness if it is "lacunose" which if we got here
257 # means "not in the stemma".
258 next if $is_lacunose{$wit};
94654e27 259 # Note if the witness is actually in a lacuna
7f52eac8 260 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 261 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 262 # Otherwise the witness either has a positive reading...
7f52eac8 263 } elsif( $rdg ) {
94654e27 264 # If the reading has been counted elsewhere as a transposition, ignore it.
265 if( $transposed->{$rdg->{'t'}->id} ) {
266 # TODO This doesn't cope with three-way transpositions
267 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
268 next;
269 }
270 # Otherwise, record it...
271 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
272 # ...and grab any transpositions, and their relations.
273 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
274 foreach my $trdg ( @transp ) {
275 map { $moved_wits{$_} = 1 } $trdg->witnesses;
276 $transposed->{$trdg->id} = [ $rdg->{'t'}->witnesses ];
277 $readings_at_rank{$trdg->id} = $trdg;
278 }
279 # ...or it is empty, ergo a gap.
7f52eac8 280 } else {
94654e27 281 push( @check_for_gaps, $wit );
7f52eac8 282 }
283 }
94654e27 284 my @gap_wits;
285 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
286 unless $moved_wits{$_} } @check_for_gaps;
287 # TODO check for, and break into a new row, any doubled-up witness readings
288 # after transposition...
7f52eac8 289 # Group the readings, collapsing groups by relationship if needed
290 my %grouped_readings;
4ce27d42 291 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 292 # Skip readings that have been collapsed into others.
f00cefe8 293 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
4ce27d42 294 # Get the witness list, including from readings collapsed into this one.
7f52eac8 295 my @wits = $rdg->witnesses;
296 if( $collapse ) {
297 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
298 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 299 my @otherwits = $other->witnesses;
fae07016 300 push( @wits, @otherwits );
f00cefe8 301 $grouped_readings{$other->id} = 0;
d1348d38 302 }
303 }
335a62ef 304 # Filter the group to those witnesses in the stemma
4ce27d42 305 my @use_wits;
306 foreach my $wit ( @wits ) {
307 next if $is_lacunose{$wit};
308 push( @use_wits, $wit );
4ce27d42 309 }
ffa22d6f 310 $grouped_readings{$rdg->id} = \@use_wits;
7f52eac8 311 }
312 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
313 # Get rid of our collapsed readings
314 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
315 keys %grouped_readings
316 if $collapse;
317
4ce27d42 318 # Return the result
5be0cdeb 319 return \%grouped_readings;
d1348d38 320}
321
335a62ef 322# Helper function to ensure that X and X a.c. never appear in the same list.
323sub _add_to_witlist {
324 my( $wit, $list, $acstr ) = @_;
325 my %inlist;
326 my $idx = 0;
327 map { $inlist{$_} = $idx++ } @$list;
328 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
329 my $acwit = $1;
330 unless( exists $inlist{$acwit} ) {
331 push( @$list, $acwit.$acstr );
332 }
333 } else {
334 if( exists( $inlist{$wit.$acstr} ) ) {
335 # Replace the a.c. version with the main witness
336 my $i = $inlist{$wit.$acstr};
337 $list->[$i] = $wit;
338 } else {
339 push( @$list, $wit );
340 }
341 }
342}
343
88a6bac5 344=head2 solve_variants( $graph, @groups )
345
346Sends the set of groups to the external graph solver service and returns
347a cleaned-up answer, adding the rank IDs back where they belong.
348
349The JSON has the form
350 { "graph": [ stemmagraph DOT string without newlines ],
351 "groupings": [ array of arrays of groups, one per rank ] }
352
353The answer has the form
354 { "variants" => [ array of variant location structures ],
355 "variant_count" => total,
356 "conflict_count" => number of conflicts detected,
357 "genealogical_count" => number of solutions found }
358
359=cut
360
361sub solve_variants {
e59b8faa 362 my( $stemma, @groups ) = @_;
335a62ef 363 my $aclabel = $stemma->collation->ac_label;
364
365 # Filter the groups down to distinct groups, and work out what graph
366 # should be used in the calculation of each group. We want to send each
367 # distinct problem to the solver only once.
368 # We need a whole bunch of lookup tables for this.
369 my $index_groupkeys = {}; # Save the order of readings
370 my $group_indices = {}; # Save the indices that have a given grouping
371 my $graph_problems = {}; # Save the groupings for the given graph
372
373 foreach my $idx ( 0..$#groups ) {
374 my $ghash = $groups[$idx];
88a6bac5 375 my @grouping;
335a62ef 376 # Sort the groupings from big to little, and scan for a.c. witnesses
377 # that would need an extended graph.
378 my @acwits; # note which AC witnesses crop up at this rank
379 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
380 keys %$ghash;
381 foreach my $rdg ( @idxkeys ) {
382 my @sg = sort @{$ghash->{$rdg}};
383 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
384 push( @grouping, \@sg );
385 }
386 # Save the reading order
387 $index_groupkeys->{$idx} = \@idxkeys;
388
389 # Now associate the distinct group with this index
390 my $gstr = wit_stringify( \@grouping );
391 push( @{$group_indices->{$gstr}}, $idx );
392
393 # Finally, add the group to the list to be calculated for this graph.
394 map { s/\Q$aclabel\E$// } @acwits;
395 my $graph = $stemma->extend_graph( \@acwits );
396 unless( exists $graph_problems->{"$graph"} ) {
397 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
88a6bac5 398 }
335a62ef 399 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
88a6bac5 400 }
335a62ef 401
402 ## For each distinct graph, send its groups to the solver.
88a6bac5 403 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
404 my $ua = LWP::UserAgent->new();
335a62ef 405 ## Witness map is a HACK to get around limitations in node names from IDP
406 my $witness_map = {};
407 ## Variables to store answers as they come back
408 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
88a6bac5 409 my $genealogical = 0;
335a62ef 410 foreach my $graphkey ( keys %$graph_problems ) {
411 my $graph = $graph_problems->{$graphkey}->{'object'};
412 my $groupings = $graph_problems->{$graphkey}->{'groups'};
413 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
414 $groupings, $witness_map ) );
415 # Send it off and get the result
94654e27 416 #print STDERR "Sending request: $json\n";
335a62ef 417 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
418 'Content' => $json );
419 my $answer;
420 my $used_idp;
421 if( $resp->is_success ) {
422 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
423 $used_idp = 1;
424 } else {
425 # Fall back to the old method.
426 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
427 . "; falling back to perl method";
428 $answer = perl_solver( $graph, @$groupings );
429 }
430 ## The answer is the evaluated groupings, plus a boolean for whether
431 ## they were genealogical. Reconstruct our original groups.
432 foreach my $gidx ( 0 .. $#{$groupings} ) {
433 my( $calc_groups, $result ) = @{$answer->[$gidx]};
434 if( $result ) {
435 $genealogical++;
436 # Prune the calculated groups, in case the IDP solver failed to.
437 if( $used_idp ) {
438 my @pruned_groups;
439 foreach my $cg ( @$calc_groups ) {
440 # This is a little wasteful but the path of least
441 # resistance. Send both the stemma, which knows what
442 # its hypotheticals are, and the actual graph used.
443 my @pg = _prune_group( $cg, $stemma, $graph );
444 push( @pruned_groups, \@pg );
445 }
446 $calc_groups = \@pruned_groups;
6d25a3a0 447 }
335a62ef 448 }
449 # Retrieve the key for the original group that went to the solver
450 my $input_group = wit_stringify( $groupings->[$gidx] );
451 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
452 my @readings = @{$index_groupkeys->{$oidx}};
453 my $vstruct = {
454 'genealogical' => $result,
455 'readings' => [],
456 };
457 foreach my $ridx ( 0 .. $#readings ) {
458 push( @{$vstruct->{'readings'}},
459 { 'readingid' => $readings[$ridx],
460 'group' => $calc_groups->[$ridx] } );
461 }
462 $variants->[$oidx] = $vstruct;
6d25a3a0 463 }
464 }
88a6bac5 465 }
466
467 return { 'variants' => $variants,
468 'variant_count' => scalar @$variants,
469 'genealogical_count' => $genealogical };
470}
471
b4cb2d60 472#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
473
474sub _safe_wit_strings {
335a62ef 475 my( $graph, $c, $groupings, $witness_map ) = @_;
476 # Parse the graph we were given into a stemma.
b4cb2d60 477 my $safegraph = Graph->new();
478 # Convert the graph to a safe representation and store the conversion.
335a62ef 479 foreach my $n ( $graph->vertices ) {
b4cb2d60 480 my $sn = _safe_witstr( $n );
335a62ef 481 if( exists $witness_map->{$sn} ) {
482 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
483 if $witness_map->{$sn} ne $n;
484 } else {
485 $witness_map->{$sn} = $n;
486 }
b4cb2d60 487 $safegraph->add_vertex( $sn );
488 $safegraph->set_vertex_attributes( $sn,
335a62ef 489 $graph->get_vertex_attributes( $n ) );
b4cb2d60 490 }
335a62ef 491 foreach my $e ( $graph->edges ) {
b4cb2d60 492 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
493 $safegraph->add_edge( @safe_e );
494 }
495 my $safe_stemma = Text::Tradition::Stemma->new(
335a62ef 496 'collation' => $c, 'graph' => $safegraph );
b4cb2d60 497
498 # Now convert the witness groupings to a safe representation.
499 my $safe_groupings = [];
500 foreach my $grouping ( @$groupings ) {
501 my $safe_grouping = [];
502 foreach my $group ( @$grouping ) {
503 my $safe_group = [];
504 foreach my $n ( @$group ) {
505 my $sn = _safe_witstr( $n );
506 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
507 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
508 $witness_map->{$sn} = $n;
509 push( @$safe_group, $sn );
510 }
511 push( @$safe_grouping, $safe_group );
512 }
513 push( @$safe_groupings, $safe_grouping );
514 }
515
516 # Return it all in the struct we expect. We have stored the reductions
517 # in the $witness_map that we were passed.
335a62ef 518 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
519 'groupings' => $safe_groupings };
b4cb2d60 520}
521
522sub _safe_witstr {
523 my $witstr = shift;
524 $witstr =~ s/\s+/_/g;
525 $witstr =~ s/[^\w\d-]//g;
526 return $witstr;
527}
528
529sub _desanitize_names {
530 my( $jsonstruct, $witness_map ) = @_;
531 my $result = [];
532 foreach my $grouping ( @$jsonstruct ) {
533 my $real_grouping = [];
534 foreach my $element ( @$grouping ) {
535 if( ref( $element ) eq 'ARRAY' ) {
536 # it's the groupset.
537 my $real_groupset = [];
538 foreach my $group ( @$element ) {
539 my $real_group = [];
540 foreach my $n ( @$group ) {
541 my $rn = $witness_map->{$n};
542 push( @$real_group, $rn );
543 }
544 push( @$real_groupset, $real_group );
545 }
546 push( @$real_grouping, $real_groupset );
547 } else {
548 # It is the boolean, not actually a group.
549 push( @$real_grouping, $element );
550 }
551 }
552 push( @$result, $real_grouping );
553 }
554 return $result;
555}
556
557### END HACKERY ###
558
fae07016 559=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 560
fae07016 561Given the tradition, its stemma graph, and the solution from the graph solver,
562work out the rest of the information we want. For each reading we need missing,
563conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 564
565=cut
732152b1 566
fae07016 567sub analyze_location {
94654e27 568 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
569 my $c = $tradition->collation;
fae07016 570
571 # Make a hash of all known node memberships, and make the subgraphs.
572 my $contig = {};
573 my $reading_roots = {};
574 my $subgraph = {};
94654e27 575 $DB::single = 1 if $variant_row->{id} == 6;
576 # Note which witnesses positively belong to which group
fae07016 577 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
578 my $rid = $rdghash->{'readingid'};
579 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
94654e27 580 }
581
582 # Now, armed with that knowledge, make a subgraph for each reading
583 # and note the root(s) of each subgraph.
584 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
585 my $rid = $rdghash->{'readingid'};
586 my %rdgwits;
fae07016 587 # Make the subgraph.
588 my $part = $graph->copy;
94654e27 589 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
590 keys %$contig;
591 $part->delete_vertices( @todelete );
592 _prune_subtree( $part, $lacunose );
fae07016 593 $subgraph->{$rid} = $part;
94654e27 594 # Record the remaining lacunose nodes as part of this group, if
595 # we are dealing with a non-genealogical reading.
596 unless( $variant_row->{'genealogical'} ) {
597 map { $contig->{$_} = $rid } $part->vertices;
598 }
fae07016 599 # Get the reading roots.
600 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 601 }
602
fae07016 603 # Now that we have all the node group memberships, calculate followed/
bebec0e9 604 # non-followed/unknown values for each reading. Also figure out the
605 # reading's evident parent(s).
606 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 607 my $rid = $rdghash->{'readingid'};
608 # Get the subgraph
609 my $part = $subgraph->{$rid};
610
611 # Start figuring things out.
94654e27 612 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
613 $rdghash->{'independent_occurrence'} = \@roots;
bebec0e9 614 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
615 # Find the parent readings, if any, of this reading.
94654e27 616 my $rdgparents = {};
bebec0e9 617 foreach my $wit ( @roots ) {
f00cefe8 618 # Look in the main stemma to find this witness's extant or known-reading
619 # immediate ancestor(s), and look up the reading that each ancestor olds.
620 my @check = $graph->predecessors( $wit );
621 while( @check ) {
622 my @next;
623 foreach my $wparent( @check ) {
fae07016 624 my $preading = $contig->{$wparent};
625 if( $preading ) {
94654e27 626 $rdgparents->{$preading} = 1;
f00cefe8 627 } else {
628 push( @next, $graph->predecessors( $wparent ) );
629 }
630 }
631 @check = @next;
632 }
bebec0e9 633 }
94654e27 634 foreach my $p ( keys %$rdgparents ) {
635 # Resolve the relationship of the parent to the reading, and
636 # save it in our hash.
637 my $pobj = $c->reading( $p );
638 my $relation;
639 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
640 if( $pobj ) {
641 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
642 if( $rel ) {
643 $relation = { type => $rel->type };
644 if( $rel->has_annotation ) {
645 $relation->{'annotation'} = $rel->annotation;
646 }
647 }
648 }
649 $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
650 }
651
652 $rdghash->{'reading_parents'} = $rdgparents;
bebec0e9 653
654 # Find the number of times this reading was altered, and the number of
655 # times we're not sure.
656 my( %nofollow, %unknownfollow );
657 foreach my $wit ( $part->vertices ) {
658 foreach my $wchild ( $graph->successors( $wit ) ) {
659 next if $part->has_vertex( $wchild );
fae07016 660 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 661 # It definitely changed here.
662 $nofollow{$wchild} = 1;
663 } elsif( !($contig->{$wchild}) ) {
664 # The child is a hypothetical node not definitely in
665 # any group. Answer is unknown.
666 $unknownfollow{$wchild} = 1;
667 } # else it's a non-root node in a known group, and therefore
668 # is presumed to have its reading from its group, not this link.
669 }
670 }
671 $rdghash->{'not_followed'} = keys %nofollow;
672 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 673
674 # Now say whether this reading represents a conflict.
675 unless( $variant_row->{'genealogical'} ) {
676 $rdghash->{'conflict'} = @roots != 1;
677 }
c4a4fb1b 678 }
d71100ed 679}
680
fae07016 681
682=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
683
684** NOTE ** This method should hopefully not be called - it is not guaranteed
685to be correct. Serves as a backup for the real solver.
686
687Runs an analysis of the given tradition, at the location given in $rank,
688against the graph of the stemma specified in $stemma_id. The argument
689@merge_relationship_types is an optional list of relationship types for
690which readings so related should be treated as equivalent.
691
692Returns a nested array data structure as follows:
693
694 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
695
696where the group list is the array of arrays passed in for each element of @groups,
697possibly with the addition of hypothetical readings.
698
699
700=cut
701
702sub perl_solver {
335a62ef 703 my( $graph, @groups ) = @_;
e59b8faa 704 my @answer;
705 foreach my $g ( @groups ) {
706 push( @answer, _solve_variant_location( $graph, $g ) );
707 }
708 return \@answer;
fae07016 709}
710
e59b8faa 711sub _solve_variant_location {
712 my( $graph, $groups ) = @_;
fae07016 713 # Now do the work.
e59b8faa 714 my $contig = {};
715 my $subgraph = {};
716 my $is_conflicted;
717 my $conflict = {};
718
719 # Mark each ms as in its own group, first.
720 foreach my $g ( @$groups ) {
721 my $gst = wit_stringify( $g );
722 map { $contig->{$_} = $gst } @$g;
723 }
724
725 # Now for each unmarked node in the graph, initialize an array
726 # for possible group memberships. We will use this later to
727 # resolve potential conflicts.
728 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
729 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
730 my $gst = wit_stringify( $g ); # This is the group name
731 # Copy the graph, and delete all non-members from the new graph.
732 my $part = $graph->copy;
733 my @group_roots;
734 $part->delete_vertices(
735 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
736
737 # Now look to see if our group is connected.
738 if( @$g > 1 ) {
739 # We have to take directionality into account.
740 # How many root nodes do we have?
741 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
742 $part->predecessorless_vertices;
743 # Assuming that @$g > 1, find the first root node that has at
744 # least one successor belonging to our group. If this reading
745 # is genealogical, there should be only one, but we will check
746 # that implicitly later.
747 foreach my $root ( @roots ) {
748 # Prune the tree to get rid of extraneous hypotheticals.
94654e27 749 $root = _prune_subtree_old( $part, $root, $contig );
e59b8faa 750 next unless $root;
751 # Save this root for our group.
752 push( @group_roots, $root );
753 # Get all the successor nodes of our root.
754 }
755 } else {
756 # Dispense with the trivial case of one reading.
757 my $wit = $g->[0];
758 @group_roots = ( $wit );
759 foreach my $v ( $part->vertices ) {
760 $part->delete_vertex( $v ) unless $v eq $wit;
761 }
762 }
763
764 if( @group_roots > 1 ) {
765 $conflict->{$gst} = 1;
766 $is_conflicted = 1;
767 }
768 # Paint the 'hypotheticals' with our group.
769 foreach my $wit ( $part->vertices ) {
770 if( ref( $contig->{$wit} ) ) {
771 push( @{$contig->{$wit}}, $gst );
772 } elsif( $contig->{$wit} ne $gst ) {
773 warn "How did we get here?";
774 }
775 }
776
777
778 # Save the relevant subgraph.
779 $subgraph->{$gst} = $part;
780 }
781
782 # For each of our hypothetical readings, flatten its 'contig' array if
783 # the array contains zero or one group. If we have any unflattened arrays,
784 # we may need to run the resolution process. If the reading is already known
785 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
786 # it.
787 my @resolve;
788 foreach my $wit ( keys %$contig ) {
789 next unless ref( $contig->{$wit} );
790 if( @{$contig->{$wit}} > 1 ) {
791 if( $is_conflicted ) {
792 $contig->{$wit} = ''; # We aren't going to decide.
793 } else {
794 push( @resolve, $wit );
795 }
796 } else {
797 my $gst = pop @{$contig->{$wit}};
798 $contig->{$wit} = $gst || '';
799 }
800 }
801
802 if( @resolve ) {
803 my $still_contig = {};
804 foreach my $h ( @resolve ) {
805 # For each of the hypothetical readings with more than one possibility,
806 # try deleting it from each of its member subgraphs in turn, and see
807 # if that breaks the contiguous grouping.
808 # TODO This can still break in a corner case where group A can use
809 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
810 # Revisit this if necessary; it could get brute-force nasty.
811 foreach my $gst ( @{$contig->{$h}} ) {
812 my $gpart = $subgraph->{$gst}->copy();
813 # If we have come this far, there is only one root and everything
814 # is reachable from it.
815 my( $root ) = $gpart->predecessorless_vertices;
816 my $reachable = {};
817 map { $reachable->{$_} = 1 } $gpart->vertices;
818
819 # Try deleting the hypothetical node.
820 $gpart->delete_vertex( $h );
821 if( $h eq $root ) {
822 # See if we still have a single root.
823 my @roots = $gpart->predecessorless_vertices;
824 warn "This shouldn't have happened" unless @roots;
825 if( @roots > 1 ) {
826 # $h is needed by this group.
827 if( exists( $still_contig->{$h} ) ) {
828 # Conflict!
829 $conflict->{$gst} = 1;
830 $still_contig->{$h} = '';
831 } else {
832 $still_contig->{$h} = $gst;
833 }
834 }
835 } else {
836 # $h is somewhere in the middle. See if everything
837 # else can still be reached from the root.
838 my %still_reachable = ( $root => 1 );
839 map { $still_reachable{$_} = 1 }
840 $gpart->all_successors( $root );
841 foreach my $v ( keys %$reachable ) {
842 next if $v eq $h;
843 if( !$still_reachable{$v}
844 && ( $contig->{$v} eq $gst
845 || ( exists $still_contig->{$v}
846 && $still_contig->{$v} eq $gst ) ) ) {
847 # We need $h.
848 if( exists $still_contig->{$h} ) {
849 # Conflict!
850 $conflict->{$gst} = 1;
851 $still_contig->{$h} = '';
852 } else {
853 $still_contig->{$h} = $gst;
854 }
855 last;
856 } # else we don't need $h in this group.
857 } # end foreach $v
858 } # endif $h eq $root
859 } # end foreach $gst
860 } # end foreach $h
861
862 # Now we have some hypothetical vertices in $still_contig that are the
863 # "real" group memberships. Replace these in $contig.
864 foreach my $v ( keys %$contig ) {
865 next unless ref $contig->{$v};
866 $contig->{$v} = $still_contig->{$v};
867 }
868 } # end if @resolve
869
870 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
871 my $variant_row = [ [], $is_genealogical ];
872 # Fill in the groupings from $contig.
873 foreach my $g ( @$groups ) {
874 my $gst = wit_stringify( $g );
875 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
876 push( @{$variant_row->[0]}, \@realgroup );
877 }
878 return $variant_row;
879}
fae07016 880
6d25a3a0 881sub _prune_group {
335a62ef 882 my( $group, $stemma, $graph ) = @_;
94654e27 883 my $lacunose = {};
884 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
885 map { $lacunose->{$_} = 0 } @$group;
6d25a3a0 886 # Make our subgraph
335a62ef 887 my $subgraph = $graph->copy;
94654e27 888 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
6d25a3a0 889 $subgraph->vertices;
890 # ...and find the root.
6d25a3a0 891 # Now prune and return the remaining vertices.
94654e27 892 _prune_subtree( $subgraph, $lacunose );
6d25a3a0 893 return $subgraph->vertices;
894}
895
7f52eac8 896sub _prune_subtree {
94654e27 897 my( $tree, $lacunose ) = @_;
898
899 # Delete lacunose witnesses that have no successors
900 my @orphan_hypotheticals;
901 my $ctr = 0;
902 do {
903 die "Infinite loop on leaves" if $ctr > 100;
904 @orphan_hypotheticals = grep { $lacunose->{$_} }
905 $tree->successorless_vertices;
906 $tree->delete_vertices( @orphan_hypotheticals );
907 $ctr++;
908 } while( @orphan_hypotheticals );
909
910 # Delete lacunose roots that have a single successor
911 my @redundant_root;
912 $ctr = 0;
913 do {
914 die "Infinite loop on roots" if $ctr > 100;
915 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
916 $tree->predecessorless_vertices;
917 $tree->delete_vertices( @redundant_root );
918 $ctr++;
919 } while( @redundant_root );
920}
921
922sub _prune_subtree_old {
231d71fc 923 my( $tree, $root, $contighash ) = @_;
924 # First, delete hypothetical leaves / orphans until there are none left.
925 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
926 $tree->successorless_vertices;
927 while( @orphan_hypotheticals ) {
928 $tree->delete_vertices( @orphan_hypotheticals );
929 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
930 $tree->successorless_vertices;
931 }
932 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 933 # root to the first child that has no other predecessors.
231d71fc 934 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
935 my @nextroot = $tree->successors( $root );
936 $tree->delete_vertex( $root );
bebec0e9 937 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 938 }
939 # The tree has been modified in place, but we need to know the new root.
bebec0e9 940 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 941 return $root;
942}
d71100ed 943# Add the variant, subject to a.c. representation logic.
944# This assumes that we will see the 'main' version before the a.c. version.
945sub add_variant_wit {
946 my( $arr, $wit, $acstr ) = @_;
947 my $skip;
948 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
949 my $real = $1;
950 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
951 }
952 push( @$arr, $wit ) unless $skip;
953}
954
5be0cdeb 955sub _useful_variant {
956 my( $group_readings, $graph, $acstr ) = @_;
957
958 # TODO Decide what to do with AC witnesses
959
960 # Sort by group size and return
961 my $is_useful = 0;
962 my( @readings, @groups ); # The sorted groups for our answer.
963 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
964 keys %$group_readings ) {
965 push( @readings, $rdg );
966 push( @groups, $group_readings->{$rdg} );
967 if( @{$group_readings->{$rdg}} > 1 ) {
968 $is_useful++;
969 } else {
970 my( $wit ) = @{$group_readings->{$rdg}};
971 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
972 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
973 }
974 }
975 if( $is_useful > 1 ) {
976 return( \@readings, \@groups );
977 } else {
978 return( [], [] );
979 }
980}
981
7f52eac8 982=head2 wit_stringify( $groups )
983
984Takes an array of witness groupings and produces a string like
985['A','B'] / ['C','D','E'] / ['F']
d71100ed 986
7f52eac8 987=cut
d71100ed 988
989sub wit_stringify {
990 my $groups = shift;
991 my @gst;
992 # If we were passed an array of witnesses instead of an array of
993 # groupings, then "group" the witnesses first.
994 unless( ref( $groups->[0] ) ) {
995 my $mkgrp = [ $groups ];
996 $groups = $mkgrp;
997 }
998 foreach my $g ( @$groups ) {
999 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1000 }
1001 return join( ' / ', @gst );
1002}
7f52eac8 1003
bebec0e9 1004sub _symmdiff {
1005 my( $lista, $listb ) = @_;
7f52eac8 1006 my %union;
1007 my %scalars;
1008 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1009 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 1010 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 1011 return map { $scalars{$_} } @set;
1012}
1013
10141;
1015
1016=head1 LICENSE
1017
1018This package is free software and is provided "as is" without express
1019or implied warranty. You can redistribute it and/or modify it under
1020the same terms as Perl itself.
1021
1022=head1 AUTHOR
1023
1024Tara L Andrews E<lt>aurum@cpan.orgE<gt>