allow creation of nonlinear traditions
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
88a6bac5 6use Encode qw/ encode_utf8 /;
d1348d38 7use Exporter 'import';
b4cb2d60 8use Graph;
88a6bac5 9use JSON qw/ encode_json decode_json /;
10use LWP::UserAgent;
d71100ed 11use Text::Tradition;
12use Text::Tradition::Stemma;
13
d1348d38 14use vars qw/ @EXPORT_OK /;
a2cf85dd 15@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 16
7f52eac8 17=head1 NAME
18
19Text::Tradition::Analysis - functions for stemma analysis of a tradition
20
21=head1 SYNOPSIS
22
23 use Text::Tradition;
24 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
25 my $t = Text::Tradition->new(
26 'name' => 'this is a text',
27 'input' => 'TEI',
28 'file' => '/path/to/tei_parallel_seg_file.xml' );
29 $t->add_stemma( 'dotfile' => $stemmafile );
30
31 my $variant_data = run_analysis( $tradition );
32 # Recalculate rank $n treating all orthographic variants as equivalent
33 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
34
35=head1 DESCRIPTION
36
37Text::Tradition is a library for representation and analysis of collated
38texts, particularly medieval ones. The Collation is the central feature of
39a Tradition, where the text, its sequence of readings, and its relationships
40between readings are actually kept.
41
42=head1 SUBROUTINES
43
88a6bac5 44=head2 run_analysis( $tradition, %opts )
7f52eac8 45
88a6bac5 46Runs the analysis described in analyze_variant_location on every location in the
47collation of the given tradition, with the given options. These include:
7f52eac8 48
88a6bac5 49=over 4
50
51=item * stemma_id - Specify which of the tradition's stemmata to use. Default
52is 0 (i.e. the first).
53
54=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55
56=item * merge_types - Specify a list of relationship types, where related readings
57should be treated as identical for the purposes of analysis.
58
ffa22d6f 59=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
60
88a6bac5 61=back
7f52eac8 62
63=begin testing
64
65use Text::Tradition;
66use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
67
68my $datafile = 't/data/florilegium_tei_ps.xml';
69my $tradition = Text::Tradition->new( 'input' => 'TEI',
70 'name' => 'test0',
71 'file' => $datafile );
72my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
73is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
74
f00cefe8 75my %expected_genealogical = (
a44aaf2a 76 1 => 0,
f00cefe8 77 2 => 1,
a44aaf2a 78 3 => 0,
79 5 => 0,
80 7 => 0,
81 8 => 0,
82 10 => 0,
f00cefe8 83 13 => 1,
a44aaf2a 84 33 => 0,
85 34 => 0,
86 37 => 0,
87 60 => 0,
f00cefe8 88 81 => 1,
a44aaf2a 89 84 => 0,
90 87 => 0,
91 101 => 0,
92 102 => 0,
f00cefe8 93 122 => 1,
a44aaf2a 94 157 => 0,
f00cefe8 95 166 => 1,
96 169 => 1,
a44aaf2a 97 200 => 0,
f00cefe8 98 216 => 1,
99 217 => 1,
100 219 => 1,
101 241 => 1,
102 242 => 1,
103 243 => 1,
104);
105
7f52eac8 106my $data = run_analysis( $tradition );
7234b01d 107my $c = $tradition->collation;
f00cefe8 108foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 109 # Account for rows that used to be "not useful"
110 unless( exists $expected_genealogical{$row->{'id'}} ) {
111 $expected_genealogical{$row->{'id'}} = 1;
112 }
18f48b82 113 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
114 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 115 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 116 # Check that we have the right row with the right groups
117 my $rank = $row->{'id'};
118 foreach my $rdghash ( @{$row->{'readings'}} ) {
119 # Skip 'readings' that aren't really
120 next unless $c->reading( $rdghash->{'readingid'} );
121 # Check the rank
122 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
123 "Got correct reading rank" );
124 # Check the witnesses
125 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
126 my @sgrp = sort @{$rdghash->{'group'}};
127 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
128 }
f00cefe8 129}
a44aaf2a 130is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 131# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 132
133=end testing
134
135=cut
136
d71100ed 137sub run_analysis {
88a6bac5 138 my( $tradition, %opts ) = @_;
f00cefe8 139 my $c = $tradition->collation;
88a6bac5 140
141 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 142 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
143 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 144
145 # Get the stemma
146 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 147
4ce27d42 148 # Figure out which witnesses we are working with - that is, the ones that
149 # appear both in the stemma and in the tradition. All others are 'lacunose'
150 # for our purposes.
88a6bac5 151 my @lacunose = $stemma->hypotheticals;
fae07016 152 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
fae07016 153 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 154
155 # Find and mark 'common' ranks for exclusion, unless they were
156 # explicitly specified.
157 unless( @ranks ) {
158 my %common_rank;
a44aaf2a 159 foreach my $rdg ( $c->common_readings ) {
88a6bac5 160 $common_rank{$rdg->rank} = 1;
161 }
162 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 163 }
7f52eac8 164
88a6bac5 165 # Group the variants to send to the solver
166 my @groups;
f629cb3b 167 my @use_ranks;
a44aaf2a 168 my %lacunae;
94654e27 169 my $moved = {};
88a6bac5 170 foreach my $rank ( @ranks ) {
a44aaf2a 171 my $missing = [ @lacunose ];
94654e27 172 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
173 # Filter out any empty rankgroups
174 # (e.g. from the later rank for a transposition)
175 next unless keys %$rankgroup;
ffa22d6f 176 if( $opts{'exclude_type1'} ) {
177 # Check to see whether this is a "useful" group.
178 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
179 $stemma->graph, $c->ac_label );
180 next unless @$rdgs;
181 }
f629cb3b 182 push( @use_ranks, $rank );
ffa22d6f 183 push( @groups, $rankgroup );
a44aaf2a 184 $lacunae{$rank} = $missing;
d71100ed 185 }
4ce27d42 186 # Run the solver
e59b8faa 187 my $answer = solve_variants( $stemma, @groups );
fae07016 188
88a6bac5 189 # Do further analysis on the answer
a44aaf2a 190 my $conflict_count = 0;
7234b01d 191 my $aclabel = $c->ac_label;
f629cb3b 192 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 193 my $location = $answer->{'variants'}->[$idx];
194 # Add the rank back in
94654e27 195 my $rank = $use_ranks[$idx];
196 $location->{'id'} = $rank;
7234b01d 197 # Note what our lacunae are
f629cb3b 198 my %lmiss;
7234b01d 199 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 200 $location->{'missing'} = [ keys %lmiss ];
201
88a6bac5 202 # Run the extra analysis we need.
638e2a95 203 analyze_location( $tradition, $stemma, $location, \%lmiss );
7234b01d 204
638e2a95 205 my @layerwits;
7234b01d 206 # Do the final post-analysis tidying up of the data.
a44aaf2a 207 foreach my $rdghash ( @{$location->{'readings'}} ) {
208 $conflict_count++
209 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
94654e27 210 # Add the reading text back in, setting display value as needed
a44aaf2a 211 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 212 if( $rdg ) {
213 $rdghash->{'text'} = $rdg->text .
214 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
215 }
f629cb3b 216 # Remove lacunose witnesses from this reading's list now that the
7234b01d 217 # analysis is done
f629cb3b 218 my @realgroup;
7234b01d 219 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 220 $rdghash->{'group'} = \@realgroup;
638e2a95 221 # Note any layered witnesses that appear in this group
222 foreach( @realgroup ) {
223 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
224 push( @layerwits, $1 );
225 }
226 }
a44aaf2a 227 }
638e2a95 228 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 229 }
a44aaf2a 230 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 231
88a6bac5 232 return $answer;
d71100ed 233}
234
7f52eac8 235=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
236
237Groups the variants at the given $rank of the collation, treating any
238relationships in @merge_relationship_types as equivalent. $lacunose should
239be a reference to an array, to which the sigla of lacunose witnesses at this
94654e27 240rank will be appended; $transposed should be a reference to a hash, wherein
241the identities of transposed readings and their relatives will be stored.
7f52eac8 242
ffa22d6f 243Returns a hash $group_readings where $rdg is attested by the witnesses listed
244in $group_readings->{$rdg}.
7f52eac8 245
246=cut
247
248# Return group_readings, groups, lacunose
d1348d38 249sub group_variants {
94654e27 250 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 251 my $c = $tradition->collation;
335a62ef 252 my $aclabel = $c->ac_label;
7f52eac8 253 # Get the alignment table readings
254 my %readings_at_rank;
ffa22d6f 255 my %is_lacunose; # lookup table for $lacunose
256 map { $is_lacunose{$_} = 1 } @$lacunose;
94654e27 257 my @check_for_gaps;
258 my %moved_wits;
1d73ecad 259 foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
7f52eac8 260 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 261 my $wit = $tablewit->{'witness'};
ffa22d6f 262 # Exclude the witness if it is "lacunose" which if we got here
263 # means "not in the stemma".
264 next if $is_lacunose{$wit};
94654e27 265 # Note if the witness is actually in a lacuna
7f52eac8 266 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 267 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 268 # Otherwise the witness either has a positive reading...
7f52eac8 269 } elsif( $rdg ) {
94654e27 270 # If the reading has been counted elsewhere as a transposition, ignore it.
271 if( $transposed->{$rdg->{'t'}->id} ) {
272 # TODO This doesn't cope with three-way transpositions
273 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
274 next;
275 }
276 # Otherwise, record it...
277 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
278 # ...and grab any transpositions, and their relations.
279 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
280 foreach my $trdg ( @transp ) {
281 map { $moved_wits{$_} = 1 } $trdg->witnesses;
282 $transposed->{$trdg->id} = [ $rdg->{'t'}->witnesses ];
283 $readings_at_rank{$trdg->id} = $trdg;
284 }
285 # ...or it is empty, ergo a gap.
7f52eac8 286 } else {
94654e27 287 push( @check_for_gaps, $wit );
7f52eac8 288 }
289 }
94654e27 290 my @gap_wits;
291 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
292 unless $moved_wits{$_} } @check_for_gaps;
293 # TODO check for, and break into a new row, any doubled-up witness readings
294 # after transposition...
7f52eac8 295 # Group the readings, collapsing groups by relationship if needed
296 my %grouped_readings;
4ce27d42 297 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 298 # Skip readings that have been collapsed into others.
f00cefe8 299 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
4ce27d42 300 # Get the witness list, including from readings collapsed into this one.
7f52eac8 301 my @wits = $rdg->witnesses;
302 if( $collapse ) {
303 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
304 foreach my $other ( $rdg->related_readings( $filter ) ) {
fae07016 305 my @otherwits = $other->witnesses;
fae07016 306 push( @wits, @otherwits );
f00cefe8 307 $grouped_readings{$other->id} = 0;
d1348d38 308 }
309 }
335a62ef 310 # Filter the group to those witnesses in the stemma
4ce27d42 311 my @use_wits;
312 foreach my $wit ( @wits ) {
313 next if $is_lacunose{$wit};
314 push( @use_wits, $wit );
4ce27d42 315 }
ffa22d6f 316 $grouped_readings{$rdg->id} = \@use_wits;
7f52eac8 317 }
318 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
319 # Get rid of our collapsed readings
320 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
321 keys %grouped_readings
322 if $collapse;
323
4ce27d42 324 # Return the result
5be0cdeb 325 return \%grouped_readings;
d1348d38 326}
327
335a62ef 328# Helper function to ensure that X and X a.c. never appear in the same list.
329sub _add_to_witlist {
330 my( $wit, $list, $acstr ) = @_;
331 my %inlist;
332 my $idx = 0;
333 map { $inlist{$_} = $idx++ } @$list;
334 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
335 my $acwit = $1;
336 unless( exists $inlist{$acwit} ) {
337 push( @$list, $acwit.$acstr );
338 }
339 } else {
340 if( exists( $inlist{$wit.$acstr} ) ) {
341 # Replace the a.c. version with the main witness
342 my $i = $inlist{$wit.$acstr};
343 $list->[$i] = $wit;
344 } else {
345 push( @$list, $wit );
346 }
347 }
348}
349
88a6bac5 350=head2 solve_variants( $graph, @groups )
351
352Sends the set of groups to the external graph solver service and returns
353a cleaned-up answer, adding the rank IDs back where they belong.
354
355The JSON has the form
356 { "graph": [ stemmagraph DOT string without newlines ],
357 "groupings": [ array of arrays of groups, one per rank ] }
358
359The answer has the form
360 { "variants" => [ array of variant location structures ],
361 "variant_count" => total,
362 "conflict_count" => number of conflicts detected,
363 "genealogical_count" => number of solutions found }
364
365=cut
366
367sub solve_variants {
e59b8faa 368 my( $stemma, @groups ) = @_;
335a62ef 369 my $aclabel = $stemma->collation->ac_label;
370
371 # Filter the groups down to distinct groups, and work out what graph
372 # should be used in the calculation of each group. We want to send each
373 # distinct problem to the solver only once.
374 # We need a whole bunch of lookup tables for this.
375 my $index_groupkeys = {}; # Save the order of readings
376 my $group_indices = {}; # Save the indices that have a given grouping
377 my $graph_problems = {}; # Save the groupings for the given graph
378
379 foreach my $idx ( 0..$#groups ) {
380 my $ghash = $groups[$idx];
88a6bac5 381 my @grouping;
335a62ef 382 # Sort the groupings from big to little, and scan for a.c. witnesses
383 # that would need an extended graph.
384 my @acwits; # note which AC witnesses crop up at this rank
385 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
386 keys %$ghash;
387 foreach my $rdg ( @idxkeys ) {
388 my @sg = sort @{$ghash->{$rdg}};
389 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
390 push( @grouping, \@sg );
391 }
392 # Save the reading order
393 $index_groupkeys->{$idx} = \@idxkeys;
394
395 # Now associate the distinct group with this index
396 my $gstr = wit_stringify( \@grouping );
397 push( @{$group_indices->{$gstr}}, $idx );
398
399 # Finally, add the group to the list to be calculated for this graph.
400 map { s/\Q$aclabel\E$// } @acwits;
401 my $graph = $stemma->extend_graph( \@acwits );
402 unless( exists $graph_problems->{"$graph"} ) {
403 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
88a6bac5 404 }
335a62ef 405 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
88a6bac5 406 }
335a62ef 407
408 ## For each distinct graph, send its groups to the solver.
88a6bac5 409 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
410 my $ua = LWP::UserAgent->new();
335a62ef 411 ## Witness map is a HACK to get around limitations in node names from IDP
412 my $witness_map = {};
413 ## Variables to store answers as they come back
414 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
88a6bac5 415 my $genealogical = 0;
335a62ef 416 foreach my $graphkey ( keys %$graph_problems ) {
417 my $graph = $graph_problems->{$graphkey}->{'object'};
418 my $groupings = $graph_problems->{$graphkey}->{'groups'};
419 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
420 $groupings, $witness_map ) );
421 # Send it off and get the result
94654e27 422 #print STDERR "Sending request: $json\n";
335a62ef 423 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
424 'Content' => $json );
425 my $answer;
426 my $used_idp;
427 if( $resp->is_success ) {
428 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
429 $used_idp = 1;
430 } else {
431 # Fall back to the old method.
432 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
433 . "; falling back to perl method";
434 $answer = perl_solver( $graph, @$groupings );
435 }
436 ## The answer is the evaluated groupings, plus a boolean for whether
437 ## they were genealogical. Reconstruct our original groups.
438 foreach my $gidx ( 0 .. $#{$groupings} ) {
439 my( $calc_groups, $result ) = @{$answer->[$gidx]};
440 if( $result ) {
441 $genealogical++;
442 # Prune the calculated groups, in case the IDP solver failed to.
443 if( $used_idp ) {
444 my @pruned_groups;
445 foreach my $cg ( @$calc_groups ) {
446 # This is a little wasteful but the path of least
447 # resistance. Send both the stemma, which knows what
448 # its hypotheticals are, and the actual graph used.
449 my @pg = _prune_group( $cg, $stemma, $graph );
450 push( @pruned_groups, \@pg );
451 }
452 $calc_groups = \@pruned_groups;
6d25a3a0 453 }
335a62ef 454 }
455 # Retrieve the key for the original group that went to the solver
456 my $input_group = wit_stringify( $groupings->[$gidx] );
457 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
458 my @readings = @{$index_groupkeys->{$oidx}};
459 my $vstruct = {
460 'genealogical' => $result,
461 'readings' => [],
462 };
463 foreach my $ridx ( 0 .. $#readings ) {
464 push( @{$vstruct->{'readings'}},
465 { 'readingid' => $readings[$ridx],
466 'group' => $calc_groups->[$ridx] } );
467 }
468 $variants->[$oidx] = $vstruct;
6d25a3a0 469 }
470 }
88a6bac5 471 }
472
473 return { 'variants' => $variants,
474 'variant_count' => scalar @$variants,
475 'genealogical_count' => $genealogical };
476}
477
b4cb2d60 478#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
479
480sub _safe_wit_strings {
335a62ef 481 my( $graph, $c, $groupings, $witness_map ) = @_;
482 # Parse the graph we were given into a stemma.
b4cb2d60 483 my $safegraph = Graph->new();
484 # Convert the graph to a safe representation and store the conversion.
335a62ef 485 foreach my $n ( $graph->vertices ) {
b4cb2d60 486 my $sn = _safe_witstr( $n );
335a62ef 487 if( exists $witness_map->{$sn} ) {
488 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
489 if $witness_map->{$sn} ne $n;
490 } else {
491 $witness_map->{$sn} = $n;
492 }
b4cb2d60 493 $safegraph->add_vertex( $sn );
494 $safegraph->set_vertex_attributes( $sn,
335a62ef 495 $graph->get_vertex_attributes( $n ) );
b4cb2d60 496 }
335a62ef 497 foreach my $e ( $graph->edges ) {
b4cb2d60 498 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
499 $safegraph->add_edge( @safe_e );
500 }
501 my $safe_stemma = Text::Tradition::Stemma->new(
335a62ef 502 'collation' => $c, 'graph' => $safegraph );
b4cb2d60 503
504 # Now convert the witness groupings to a safe representation.
505 my $safe_groupings = [];
506 foreach my $grouping ( @$groupings ) {
507 my $safe_grouping = [];
508 foreach my $group ( @$grouping ) {
509 my $safe_group = [];
510 foreach my $n ( @$group ) {
511 my $sn = _safe_witstr( $n );
512 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
513 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
514 $witness_map->{$sn} = $n;
515 push( @$safe_group, $sn );
516 }
517 push( @$safe_grouping, $safe_group );
518 }
519 push( @$safe_groupings, $safe_grouping );
520 }
521
522 # Return it all in the struct we expect. We have stored the reductions
523 # in the $witness_map that we were passed.
335a62ef 524 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
525 'groupings' => $safe_groupings };
b4cb2d60 526}
527
528sub _safe_witstr {
529 my $witstr = shift;
530 $witstr =~ s/\s+/_/g;
531 $witstr =~ s/[^\w\d-]//g;
532 return $witstr;
533}
534
535sub _desanitize_names {
536 my( $jsonstruct, $witness_map ) = @_;
537 my $result = [];
538 foreach my $grouping ( @$jsonstruct ) {
539 my $real_grouping = [];
540 foreach my $element ( @$grouping ) {
541 if( ref( $element ) eq 'ARRAY' ) {
542 # it's the groupset.
543 my $real_groupset = [];
544 foreach my $group ( @$element ) {
545 my $real_group = [];
546 foreach my $n ( @$group ) {
547 my $rn = $witness_map->{$n};
548 push( @$real_group, $rn );
549 }
550 push( @$real_groupset, $real_group );
551 }
552 push( @$real_grouping, $real_groupset );
553 } else {
554 # It is the boolean, not actually a group.
555 push( @$real_grouping, $element );
556 }
557 }
558 push( @$result, $real_grouping );
559 }
560 return $result;
561}
562
563### END HACKERY ###
564
fae07016 565=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 566
fae07016 567Given the tradition, its stemma graph, and the solution from the graph solver,
568work out the rest of the information we want. For each reading we need missing,
569conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
7f52eac8 570
571=cut
732152b1 572
fae07016 573sub analyze_location {
638e2a95 574 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
94654e27 575 my $c = $tradition->collation;
fae07016 576
577 # Make a hash of all known node memberships, and make the subgraphs.
578 my $contig = {};
579 my $reading_roots = {};
580 my $subgraph = {};
638e2a95 581 my $acstr = $c->ac_label;
582 my @acwits;
583 $DB::single = 1 if $variant_row->{id} == 87;
94654e27 584 # Note which witnesses positively belong to which group
fae07016 585 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
586 my $rid = $rdghash->{'readingid'};
638e2a95 587 foreach my $wit ( @{$rdghash->{'group'}} ) {
588 $contig->{$wit} = $rid;
589 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
590 push( @acwits, $1 );
591 }
592 }
94654e27 593 }
594
638e2a95 595 # Get the actual graph we should work with
596 my $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
597
94654e27 598 # Now, armed with that knowledge, make a subgraph for each reading
599 # and note the root(s) of each subgraph.
600 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
601 my $rid = $rdghash->{'readingid'};
602 my %rdgwits;
fae07016 603 # Make the subgraph.
604 my $part = $graph->copy;
94654e27 605 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
606 keys %$contig;
607 $part->delete_vertices( @todelete );
608 _prune_subtree( $part, $lacunose );
fae07016 609 $subgraph->{$rid} = $part;
94654e27 610 # Record the remaining lacunose nodes as part of this group, if
611 # we are dealing with a non-genealogical reading.
612 unless( $variant_row->{'genealogical'} ) {
613 map { $contig->{$_} = $rid } $part->vertices;
614 }
fae07016 615 # Get the reading roots.
616 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
bebec0e9 617 }
618
fae07016 619 # Now that we have all the node group memberships, calculate followed/
bebec0e9 620 # non-followed/unknown values for each reading. Also figure out the
621 # reading's evident parent(s).
622 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 623 my $rid = $rdghash->{'readingid'};
624 # Get the subgraph
625 my $part = $subgraph->{$rid};
626
627 # Start figuring things out.
94654e27 628 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
629 $rdghash->{'independent_occurrence'} = \@roots;
bebec0e9 630 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
631 # Find the parent readings, if any, of this reading.
94654e27 632 my $rdgparents = {};
bebec0e9 633 foreach my $wit ( @roots ) {
f00cefe8 634 # Look in the main stemma to find this witness's extant or known-reading
635 # immediate ancestor(s), and look up the reading that each ancestor olds.
636 my @check = $graph->predecessors( $wit );
637 while( @check ) {
638 my @next;
639 foreach my $wparent( @check ) {
fae07016 640 my $preading = $contig->{$wparent};
641 if( $preading ) {
94654e27 642 $rdgparents->{$preading} = 1;
f00cefe8 643 } else {
644 push( @next, $graph->predecessors( $wparent ) );
645 }
646 }
647 @check = @next;
648 }
bebec0e9 649 }
94654e27 650 foreach my $p ( keys %$rdgparents ) {
651 # Resolve the relationship of the parent to the reading, and
652 # save it in our hash.
653 my $pobj = $c->reading( $p );
654 my $relation;
655 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
656 if( $pobj ) {
657 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
658 if( $rel ) {
659 $relation = { type => $rel->type };
660 if( $rel->has_annotation ) {
661 $relation->{'annotation'} = $rel->annotation;
662 }
663 }
664 }
665 $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
666 }
667
668 $rdghash->{'reading_parents'} = $rdgparents;
bebec0e9 669
670 # Find the number of times this reading was altered, and the number of
671 # times we're not sure.
672 my( %nofollow, %unknownfollow );
673 foreach my $wit ( $part->vertices ) {
674 foreach my $wchild ( $graph->successors( $wit ) ) {
675 next if $part->has_vertex( $wchild );
fae07016 676 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
bebec0e9 677 # It definitely changed here.
678 $nofollow{$wchild} = 1;
679 } elsif( !($contig->{$wchild}) ) {
680 # The child is a hypothetical node not definitely in
681 # any group. Answer is unknown.
682 $unknownfollow{$wchild} = 1;
683 } # else it's a non-root node in a known group, and therefore
684 # is presumed to have its reading from its group, not this link.
685 }
686 }
687 $rdghash->{'not_followed'} = keys %nofollow;
688 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 689
690 # Now say whether this reading represents a conflict.
691 unless( $variant_row->{'genealogical'} ) {
692 $rdghash->{'conflict'} = @roots != 1;
693 }
c4a4fb1b 694 }
d71100ed 695}
696
fae07016 697
698=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
699
700** NOTE ** This method should hopefully not be called - it is not guaranteed
701to be correct. Serves as a backup for the real solver.
702
703Runs an analysis of the given tradition, at the location given in $rank,
704against the graph of the stemma specified in $stemma_id. The argument
705@merge_relationship_types is an optional list of relationship types for
706which readings so related should be treated as equivalent.
707
708Returns a nested array data structure as follows:
709
710 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
711
712where the group list is the array of arrays passed in for each element of @groups,
713possibly with the addition of hypothetical readings.
714
715
716=cut
717
718sub perl_solver {
335a62ef 719 my( $graph, @groups ) = @_;
e59b8faa 720 my @answer;
721 foreach my $g ( @groups ) {
722 push( @answer, _solve_variant_location( $graph, $g ) );
723 }
724 return \@answer;
fae07016 725}
726
e59b8faa 727sub _solve_variant_location {
728 my( $graph, $groups ) = @_;
fae07016 729 # Now do the work.
e59b8faa 730 my $contig = {};
731 my $subgraph = {};
732 my $is_conflicted;
733 my $conflict = {};
734
735 # Mark each ms as in its own group, first.
736 foreach my $g ( @$groups ) {
737 my $gst = wit_stringify( $g );
738 map { $contig->{$_} = $gst } @$g;
739 }
740
741 # Now for each unmarked node in the graph, initialize an array
742 # for possible group memberships. We will use this later to
743 # resolve potential conflicts.
744 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
745 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
746 my $gst = wit_stringify( $g ); # This is the group name
747 # Copy the graph, and delete all non-members from the new graph.
748 my $part = $graph->copy;
749 my @group_roots;
750 $part->delete_vertices(
751 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
752
753 # Now look to see if our group is connected.
754 if( @$g > 1 ) {
755 # We have to take directionality into account.
756 # How many root nodes do we have?
757 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
758 $part->predecessorless_vertices;
759 # Assuming that @$g > 1, find the first root node that has at
760 # least one successor belonging to our group. If this reading
761 # is genealogical, there should be only one, but we will check
762 # that implicitly later.
763 foreach my $root ( @roots ) {
764 # Prune the tree to get rid of extraneous hypotheticals.
94654e27 765 $root = _prune_subtree_old( $part, $root, $contig );
e59b8faa 766 next unless $root;
767 # Save this root for our group.
768 push( @group_roots, $root );
769 # Get all the successor nodes of our root.
770 }
771 } else {
772 # Dispense with the trivial case of one reading.
773 my $wit = $g->[0];
774 @group_roots = ( $wit );
775 foreach my $v ( $part->vertices ) {
776 $part->delete_vertex( $v ) unless $v eq $wit;
777 }
778 }
779
780 if( @group_roots > 1 ) {
781 $conflict->{$gst} = 1;
782 $is_conflicted = 1;
783 }
784 # Paint the 'hypotheticals' with our group.
785 foreach my $wit ( $part->vertices ) {
786 if( ref( $contig->{$wit} ) ) {
787 push( @{$contig->{$wit}}, $gst );
788 } elsif( $contig->{$wit} ne $gst ) {
789 warn "How did we get here?";
790 }
791 }
792
793
794 # Save the relevant subgraph.
795 $subgraph->{$gst} = $part;
796 }
797
798 # For each of our hypothetical readings, flatten its 'contig' array if
799 # the array contains zero or one group. If we have any unflattened arrays,
800 # we may need to run the resolution process. If the reading is already known
801 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
802 # it.
803 my @resolve;
804 foreach my $wit ( keys %$contig ) {
805 next unless ref( $contig->{$wit} );
806 if( @{$contig->{$wit}} > 1 ) {
807 if( $is_conflicted ) {
808 $contig->{$wit} = ''; # We aren't going to decide.
809 } else {
810 push( @resolve, $wit );
811 }
812 } else {
813 my $gst = pop @{$contig->{$wit}};
814 $contig->{$wit} = $gst || '';
815 }
816 }
817
818 if( @resolve ) {
819 my $still_contig = {};
820 foreach my $h ( @resolve ) {
821 # For each of the hypothetical readings with more than one possibility,
822 # try deleting it from each of its member subgraphs in turn, and see
823 # if that breaks the contiguous grouping.
824 # TODO This can still break in a corner case where group A can use
825 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
826 # Revisit this if necessary; it could get brute-force nasty.
827 foreach my $gst ( @{$contig->{$h}} ) {
828 my $gpart = $subgraph->{$gst}->copy();
829 # If we have come this far, there is only one root and everything
830 # is reachable from it.
831 my( $root ) = $gpart->predecessorless_vertices;
832 my $reachable = {};
833 map { $reachable->{$_} = 1 } $gpart->vertices;
834
835 # Try deleting the hypothetical node.
836 $gpart->delete_vertex( $h );
837 if( $h eq $root ) {
838 # See if we still have a single root.
839 my @roots = $gpart->predecessorless_vertices;
840 warn "This shouldn't have happened" unless @roots;
841 if( @roots > 1 ) {
842 # $h is needed by this group.
843 if( exists( $still_contig->{$h} ) ) {
844 # Conflict!
845 $conflict->{$gst} = 1;
846 $still_contig->{$h} = '';
847 } else {
848 $still_contig->{$h} = $gst;
849 }
850 }
851 } else {
852 # $h is somewhere in the middle. See if everything
853 # else can still be reached from the root.
854 my %still_reachable = ( $root => 1 );
855 map { $still_reachable{$_} = 1 }
856 $gpart->all_successors( $root );
857 foreach my $v ( keys %$reachable ) {
858 next if $v eq $h;
859 if( !$still_reachable{$v}
860 && ( $contig->{$v} eq $gst
861 || ( exists $still_contig->{$v}
862 && $still_contig->{$v} eq $gst ) ) ) {
863 # We need $h.
864 if( exists $still_contig->{$h} ) {
865 # Conflict!
866 $conflict->{$gst} = 1;
867 $still_contig->{$h} = '';
868 } else {
869 $still_contig->{$h} = $gst;
870 }
871 last;
872 } # else we don't need $h in this group.
873 } # end foreach $v
874 } # endif $h eq $root
875 } # end foreach $gst
876 } # end foreach $h
877
878 # Now we have some hypothetical vertices in $still_contig that are the
879 # "real" group memberships. Replace these in $contig.
880 foreach my $v ( keys %$contig ) {
881 next unless ref $contig->{$v};
882 $contig->{$v} = $still_contig->{$v};
883 }
884 } # end if @resolve
885
886 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
887 my $variant_row = [ [], $is_genealogical ];
888 # Fill in the groupings from $contig.
889 foreach my $g ( @$groups ) {
890 my $gst = wit_stringify( $g );
891 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
892 push( @{$variant_row->[0]}, \@realgroup );
893 }
894 return $variant_row;
895}
fae07016 896
6d25a3a0 897sub _prune_group {
335a62ef 898 my( $group, $stemma, $graph ) = @_;
94654e27 899 my $lacunose = {};
900 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
901 map { $lacunose->{$_} = 0 } @$group;
6d25a3a0 902 # Make our subgraph
335a62ef 903 my $subgraph = $graph->copy;
94654e27 904 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
6d25a3a0 905 $subgraph->vertices;
906 # ...and find the root.
6d25a3a0 907 # Now prune and return the remaining vertices.
94654e27 908 _prune_subtree( $subgraph, $lacunose );
6d25a3a0 909 return $subgraph->vertices;
910}
911
7f52eac8 912sub _prune_subtree {
94654e27 913 my( $tree, $lacunose ) = @_;
914
915 # Delete lacunose witnesses that have no successors
916 my @orphan_hypotheticals;
917 my $ctr = 0;
918 do {
919 die "Infinite loop on leaves" if $ctr > 100;
920 @orphan_hypotheticals = grep { $lacunose->{$_} }
921 $tree->successorless_vertices;
922 $tree->delete_vertices( @orphan_hypotheticals );
923 $ctr++;
924 } while( @orphan_hypotheticals );
925
926 # Delete lacunose roots that have a single successor
927 my @redundant_root;
928 $ctr = 0;
929 do {
930 die "Infinite loop on roots" if $ctr > 100;
931 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
932 $tree->predecessorless_vertices;
933 $tree->delete_vertices( @redundant_root );
934 $ctr++;
935 } while( @redundant_root );
936}
937
938sub _prune_subtree_old {
231d71fc 939 my( $tree, $root, $contighash ) = @_;
940 # First, delete hypothetical leaves / orphans until there are none left.
941 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
942 $tree->successorless_vertices;
943 while( @orphan_hypotheticals ) {
944 $tree->delete_vertices( @orphan_hypotheticals );
945 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
946 $tree->successorless_vertices;
947 }
948 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 949 # root to the first child that has no other predecessors.
231d71fc 950 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
951 my @nextroot = $tree->successors( $root );
952 $tree->delete_vertex( $root );
bebec0e9 953 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 954 }
955 # The tree has been modified in place, but we need to know the new root.
bebec0e9 956 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 957 return $root;
958}
d71100ed 959# Add the variant, subject to a.c. representation logic.
960# This assumes that we will see the 'main' version before the a.c. version.
961sub add_variant_wit {
962 my( $arr, $wit, $acstr ) = @_;
963 my $skip;
964 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
965 my $real = $1;
966 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
967 }
968 push( @$arr, $wit ) unless $skip;
969}
970
5be0cdeb 971sub _useful_variant {
972 my( $group_readings, $graph, $acstr ) = @_;
973
974 # TODO Decide what to do with AC witnesses
975
976 # Sort by group size and return
977 my $is_useful = 0;
978 my( @readings, @groups ); # The sorted groups for our answer.
979 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
980 keys %$group_readings ) {
981 push( @readings, $rdg );
982 push( @groups, $group_readings->{$rdg} );
983 if( @{$group_readings->{$rdg}} > 1 ) {
984 $is_useful++;
985 } else {
986 my( $wit ) = @{$group_readings->{$rdg}};
987 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
988 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
989 }
990 }
991 if( $is_useful > 1 ) {
992 return( \@readings, \@groups );
993 } else {
994 return( [], [] );
995 }
996}
997
7f52eac8 998=head2 wit_stringify( $groups )
999
1000Takes an array of witness groupings and produces a string like
1001['A','B'] / ['C','D','E'] / ['F']
d71100ed 1002
7f52eac8 1003=cut
d71100ed 1004
1005sub wit_stringify {
1006 my $groups = shift;
1007 my @gst;
1008 # If we were passed an array of witnesses instead of an array of
1009 # groupings, then "group" the witnesses first.
1010 unless( ref( $groups->[0] ) ) {
1011 my $mkgrp = [ $groups ];
1012 $groups = $mkgrp;
1013 }
1014 foreach my $g ( @$groups ) {
1015 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1016 }
1017 return join( ' / ', @gst );
1018}
7f52eac8 1019
bebec0e9 1020sub _symmdiff {
1021 my( $lista, $listb ) = @_;
7f52eac8 1022 my %union;
1023 my %scalars;
1024 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1025 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 1026 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 1027 return map { $scalars{$_} } @set;
1028}
1029
10301;
1031
1032=head1 LICENSE
1033
1034This package is free software and is provided "as is" without express
1035or implied warranty. You can redistribute it and/or modify it under
1036the same terms as Perl itself.
1037
1038=head1 AUTHOR
1039
1040Tara L Andrews E<lt>aurum@cpan.orgE<gt>