remove redundant a.c. witnesses from list
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
d1348d38 6use Exporter 'import';
d71100ed 7use Text::Tradition;
8use Text::Tradition::Stemma;
9
d1348d38 10use vars qw/ @EXPORT_OK /;
a2cf85dd 11@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 12
7f52eac8 13=head1 NAME
14
15Text::Tradition::Analysis - functions for stemma analysis of a tradition
16
17=head1 SYNOPSIS
18
19 use Text::Tradition;
20 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
21 my $t = Text::Tradition->new(
22 'name' => 'this is a text',
23 'input' => 'TEI',
24 'file' => '/path/to/tei_parallel_seg_file.xml' );
25 $t->add_stemma( 'dotfile' => $stemmafile );
26
27 my $variant_data = run_analysis( $tradition );
28 # Recalculate rank $n treating all orthographic variants as equivalent
29 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
30
31=head1 DESCRIPTION
32
33Text::Tradition is a library for representation and analysis of collated
34texts, particularly medieval ones. The Collation is the central feature of
35a Tradition, where the text, its sequence of readings, and its relationships
36between readings are actually kept.
37
38=head1 SUBROUTINES
39
40=head2 run_analysis( $tradition, $stemma_id, @merge_relationship_types )
41
42Runs the analysis described in analyze_variant_location on every location
43in the collation of the given tradition, against the stemma specified in
44$stemma_id. If $stemma_id is not specified, it defaults to 0 (referencing
45the first stemma saved for the tradition.)
46
47The optional @merge_relationship_types contains a list of relationship types
48to treat as equivalent for the analysis.
49
50=begin testing
51
52use Text::Tradition;
53use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
54
55my $datafile = 't/data/florilegium_tei_ps.xml';
56my $tradition = Text::Tradition->new( 'input' => 'TEI',
57 'name' => 'test0',
58 'file' => $datafile );
59my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
60is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
61
62my $data = run_analysis( $tradition );
63# TODO should be 21!
64is( $data->{'genealogical_count'}, 42, "Got right genealogical count" );
65is( $data->{'conflict_count'}, 17, "Got right conflict count" );
66is( $data->{'variant_count'}, 58, "Got right total variant number" );
67
68=end testing
69
70=cut
71
d71100ed 72sub run_analysis {
7f52eac8 73 my( $tradition, $stemma_id, @collapse ) = @_;
74 $stemma_id = 0 unless $stemma_id;
56cf65bd 75
7f52eac8 76 # Run the variant analysis on every rank in the graph that doesn't
77 # have a common reading. Return the results.
78 my @variants; # holds results from analyze_variant_location
79 my $genealogical; # counter of 'genealogical' variants
80 my $conflicts; # counter of conflicting readings
d71100ed 81
7f52eac8 82 # Find and mark 'common' ranks for exclusion.
83 my %common_rank;
84 foreach my $rdg ( $tradition->collation->common_readings ) {
85 $common_rank{$rdg->rank} = 1;
d71100ed 86 }
7f52eac8 87
88 foreach my $rank ( 1 .. $tradition->collation->end->rank-1 ) {
89 next if $common_rank{$rank};
90 my $variant_row = analyze_variant_location(
91 $tradition, $rank, $stemma_id, @collapse );
92 push( @variants, $variant_row );
93 $genealogical++ if $variant_row->{'genealogical'};
94 $conflicts += grep { $_->{'conflict'} } @{$variant_row->{'readings'}};
d71100ed 95 }
96
7f52eac8 97 return {
98 'variants' => \@variants,
99 'variant_count' => scalar @variants, # TODO redundant
100 'conflict_count' => $conflicts,
101 'genealogical_count' => $genealogical,
102 };
d71100ed 103}
104
7f52eac8 105=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
106
107Groups the variants at the given $rank of the collation, treating any
108relationships in @merge_relationship_types as equivalent. $lacunose should
109be a reference to an array, to which the sigla of lacunose witnesses at this
110rank will be appended.
111
112Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
113by the witnesses listed in $groups->[$n].
114
115=cut
116
117# Return group_readings, groups, lacunose
d1348d38 118sub group_variants {
7f52eac8 119 my( $tradition, $rank, $lacunose, $collapse ) = @_;
120 my $c = $tradition->collation;
121 # Get the alignment table readings
122 my %readings_at_rank;
123 my @gap_wits;
124 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
125 my $rdg = $tablewit->{'tokens'}->[$rank-1];
126 if( $rdg && $rdg->{'t'}->is_lacuna ) {
127 push( @$lacunose, $tablewit->{'witness'} );
128 } elsif( $rdg ) {
129 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
130 } else {
131 push( @gap_wits, $tablewit->{'witness'} );
132 }
133 }
d1348d38 134
7f52eac8 135 # Group the readings, collapsing groups by relationship if needed
136 my %grouped_readings;
137 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
138 # Skip readings that have been collapsed into others.
139 next if exists $grouped_readings{$rdg->text} && !$grouped_readings{$rdg->text};
140 my @wits = $rdg->witnesses;
141 if( $collapse ) {
142 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
143 foreach my $other ( $rdg->related_readings( $filter ) ) {
144 push( @wits, $other->witnesses );
145 $grouped_readings{$other->text} = 0;
d1348d38 146 }
147 }
7f52eac8 148 $grouped_readings{$rdg->text} = \@wits;
149 }
150 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
151 # Get rid of our collapsed readings
152 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
153 keys %grouped_readings
154 if $collapse;
155
156 # Return the readings and groups, sorted by size
157 my( @readings, @groups );
158 foreach my $r ( sort { @{$grouped_readings{$b}} <=> @{$grouped_readings{$a}} }
159 keys %grouped_readings ) {
160 push( @readings, $r );
161 push( @groups, $grouped_readings{$r} );
d1348d38 162 }
7f52eac8 163 return( \@readings, \@groups );
d1348d38 164}
165
7f52eac8 166=head2 analyze_variant_location( $tradition, $rank, $stemma_id, @merge_relationship_types )
f6c8ea08 167
7f52eac8 168Runs an analysis of the given tradition, at the location given in $rank,
169against the graph of the stemma specified in $stemma_id. The argument
170@merge_relationship_types is an optional list of relationship types for
171which readings so related should be treated as equivalent.
f6c8ea08 172
7f52eac8 173Returns a data structure as follows:
174
175 { 'id' => $rank,
176 'genealogical' => boolean,
177 'readings => [ { text => $reading_text,
178 group => [ witnesses ],
179 conflict => [ conflicting ],
180 missing => [ excluded ] }, ... ]
181 }
182where 'conflicting' is the list of witnesses whose readings conflict with
183this group, and 'excluded' is the list of witnesses either not present in
184the stemma or lacunose at this location.
185
186=cut
732152b1 187
d71100ed 188sub analyze_variant_location {
7f52eac8 189 my( $tradition, $rank, $sid, @collapse ) = @_;
190 $DB::single = 1 if @collapse;
191 # Get the readings in this tradition at this rank
192 my @rank_rdgs = grep { $_->rank == $rank } $tradition->collation->readings;
193 # Get the applicable stemma
194 my $undirected; # TODO Allow undirected distance tree analysis too
195 my $stemma = $tradition->stemma( $sid );
196 my $graph = $stemma->graph;
197 # Figure out which witnesses we are working with
198 my @lacunose = _set( 'symmdiff', [ $stemma->witnesses ],
199 [ map { $_->sigil } $tradition->witnesses ] );
200
201 # Now group the readings
202 my( $readings, $groups ) =
203 group_variants( $tradition, $rank, \@lacunose, \@collapse );
204 my $group_readings = {};
205 # Lookup table group string -> readings
206 foreach my $x ( 0 .. $#$groups ) {
207 $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
208 }
209
210 # Now do the work.
231d71fc 211 my $contig = {};
212 my $subgraph = {};
c4a4fb1b 213 my $is_conflicted;
d71100ed 214 my $conflict = {};
7f52eac8 215 my $variant_row = { 'id' => $rank, 'readings' => [] };
94a077d6 216 # Mark each ms as in its own group, first.
217 foreach my $g ( @$groups ) {
218 my $gst = wit_stringify( $g );
231d71fc 219 map { $contig->{$_} = $gst } @$g;
94a077d6 220 }
c4a4fb1b 221 # Now for each unmarked node in the graph, initialize an array
222 # for possible group memberships. We will use this later to
223 # resolve potential conflicts.
231d71fc 224 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
d71100ed 225 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
c4a4fb1b 226 my $gst = wit_stringify( $g ); # This is the group name
227 my $reachable = { $g->[0] => 1 };
08e0fb85 228 # Copy the graph, and delete all non-members from the new graph.
c4a4fb1b 229 my $part = $graph->copy;
230 my $group_root;
231 $part->delete_vertices(
231d71fc 232 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
c4a4fb1b 233
234 # Now look to see if our group is connected.
235 if( $undirected ) { # For use with distance trees etc.
236 # Find all vertices reachable from the first (arbitrary) group
237 # member. If we are genealogical this should include them all.
238 map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
239 # TODO This is a terrible way to do distance trees, since all
240 # non-leaf nodes are included in every graph part now. We may
241 # have to go back to SPDP.
242 } else {
243 if( @$g > 1 ) {
244 # Dispense with the trivial case of one reading.
245 # We have to take directionality into account.
246 # How many root nodes do we have?
231d71fc 247 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
c4a4fb1b 248 $part->source_vertices;
249 # Assuming that @$g > 1, find the first root node that has at
250 # least one successor belonging to our group. If this reading
251 # is genealogical, there should be only one, but we will check
252 # that implicitly later.
253 my $nodes_in_subtree = 0;
254 foreach my $root ( @roots ) {
231d71fc 255 # Prune the tree to get rid of extraneous hypotheticals.
7f52eac8 256 $root = _prune_subtree( $part, $root, $contig );
c4a4fb1b 257 # Get all the successor nodes of our root.
258 my $tmp_reach = { $root => 1 };
259 map { $tmp_reach->{$_} = 1 } $part->all_successors( $root );
260 # Skip this root if none of our successors are in our group
261 # (e.g. isolated 'hypothetical' witnesses with no group)
231d71fc 262 next unless grep { $contig->{$_} } keys %$tmp_reach;
c4a4fb1b 263 if( keys %$tmp_reach > $nodes_in_subtree ) {
264 $nodes_in_subtree = keys %$tmp_reach;
265 $reachable = $tmp_reach;
266 $group_root = $root;
267 }
268 }
269 } # else it is a single-node group, nothing to calculate.
270 }
271
272 # None of the 'reachable' nodes should be marked as being in another
273 # group. Paint the 'hypotheticals' with our group while we are at it,
274 # unless there is a conflict present.
275 foreach ( keys %$reachable ) {
231d71fc 276 if( ref $contig->{$_} ) {
277 push( @{$contig->{$_}}, $gst );
278 } elsif( $contig->{$_} ne $gst ) {
279 $conflict->{$group_readings->{$gst}} = $group_readings->{$contig->{$_}};
c4a4fb1b 280 } # else it is an 'extant' node marked with our group already.
d71100ed 281 }
08e0fb85 282 # None of the unreachable nodes should be in our group either.
283 foreach ( $part->vertices ) {
c4a4fb1b 284 next if $reachable->{$_};
231d71fc 285 if( $contig->{$_} eq $gst ) {
c4a4fb1b 286 $conflict->{$group_readings->{$gst}} = $group_readings->{$gst};
287 last;
288 }
08e0fb85 289 }
290
c4a4fb1b 291 # Now, if we have a conflict, we can write the reading in full. If not,
292 # we have to save the subgraph so that we can resolve possible conflicts
293 # on hypothetical nodes.
294 $is_conflicted = 1 if exists $conflict->{$group_readings->{$gst}};
295
732152b1 296 # Write the reading.
297 my $reading = { 'text' => $group_readings->{$gst},
7f52eac8 298 'missing' => wit_stringify( \@lacunose ),
c4a4fb1b 299 'group' => $gst }; # This will change if we find no conflict
300 if( $is_conflicted ) {
301 $reading->{'conflict'} = $conflict->{$group_readings->{$gst}}
732152b1 302 } else {
c4a4fb1b 303 # Save the relevant subgraph.
231d71fc 304 $subgraph->{$gst} = { 'graph' => $part,
c4a4fb1b 305 'root' => $group_root,
306 'reachable' => $reachable };
732152b1 307 }
308 push( @{$variant_row->{'readings'}}, $reading );
d71100ed 309 }
c4a4fb1b 310
311 # Now that we have gone through all the rows, check the hypothetical
312 # readings for conflict if we haven't found one yet.
231d71fc 313 if( keys %$subgraph && !keys %$conflict ) {
c4a4fb1b 314 my @resolve;
231d71fc 315 foreach ( keys %$contig ) {
316 next unless ref $contig->{$_};
317 if( scalar @{$contig->{$_}} > 1 ) {
c4a4fb1b 318 push( @resolve, $_ );
319 } else {
231d71fc 320 $contig->{$_} = scalar @{$contig->{$_}} ? $contig->{$_}->[0] : '';
c4a4fb1b 321 }
322 }
323 # Do we still have a possible conflict?
231d71fc 324 my $still_contig = {};
c4a4fb1b 325 foreach my $h ( @resolve ) {
326 # For each of the hypothetical readings with more than one possibility,
327 # try deleting it from each of its member subgraphs in turn, and see
328 # if that breaks the contiguous grouping.
329 # TODO This can still break in a corner case where group A can use
330 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
331 # Revisit this if necessary; it could get brute-force nasty.
231d71fc 332 foreach my $gst ( @{$contig->{$h}} ) {
333 my $gpart = $subgraph->{$gst}->{'graph'}->copy;
334 my $reachable = $subgraph->{$gst}->{'reachable'};
c4a4fb1b 335 $gpart->delete_vertex( $h );
336 # Is everything else still reachable from the root?
337 # TODO If $h was the root, see if we still have a single root.
231d71fc 338 my %still_reachable = ( $subgraph->{$gst}->{'root'} => 1 );
c4a4fb1b 339 map { $still_reachable{$_} = 1 }
231d71fc 340 $gpart->all_successors( $subgraph->{$gst}->{'root'} );
c4a4fb1b 341 foreach my $v ( keys %$reachable ) {
342 next if $v eq $h;
343 if( !$still_reachable{$v}
231d71fc 344 && ( $contig->{$v} eq $gst
345 || ( exists $still_contig->{$v}
346 && $still_contig->{$v} eq $gst ) ) ) {
c4a4fb1b 347 # We need $h.
231d71fc 348 if( exists $still_contig->{$h} ) {
c4a4fb1b 349 # Conflict!
350 $conflict->{$group_readings->{$gst}} =
231d71fc 351 $group_readings->{$still_contig->{$h}};
c4a4fb1b 352 } else {
231d71fc 353 $still_contig->{$h} = $gst;
c4a4fb1b 354 }
355 last;
356 } # else we don't need $h in this group.
357 }
358 }
359 }
360
361 # Now, assuming no conflict, we have some hypothetical vertices in
362 # $still_contig that are the "real" group memberships. Replace these
363 # in $contig.
364 unless ( keys %$conflict ) {
231d71fc 365 foreach my $v ( keys %$contig ) {
366 next unless ref $contig->{$v};
367 $contig->{$v} = $still_contig->{$v};
c4a4fb1b 368 }
369 }
370 }
371
372 # Now write the group and conflict information into the respective rows.
7f52eac8 373 my %missing;
374 map { $missing{$_} = 1 } @lacunose; # quick lookup table
c4a4fb1b 375 foreach my $rdg ( @{$variant_row->{'readings'}} ) {
376 $rdg->{'conflict'} = $conflict->{$rdg->{'text'}};
377 next if $rdg->{'conflict'};
7f52eac8 378 my @members = grep { $contig->{$_} eq $rdg->{'group'} && !$missing{$_} }
231d71fc 379 keys %$contig;
c4a4fb1b 380 $rdg->{'group'} = wit_stringify( \@members );
381 }
382
08e0fb85 383 $variant_row->{'genealogical'} = !( keys %$conflict );
732152b1 384 return $variant_row;
d71100ed 385}
386
7f52eac8 387sub _prune_subtree {
231d71fc 388 my( $tree, $root, $contighash ) = @_;
389 # First, delete hypothetical leaves / orphans until there are none left.
390 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
391 $tree->successorless_vertices;
392 while( @orphan_hypotheticals ) {
393 $tree->delete_vertices( @orphan_hypotheticals );
394 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
395 $tree->successorless_vertices;
396 }
397 # Then delete a hypothetical root with only one successor, moving the
398 # root to the child.
399 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
400 my @nextroot = $tree->successors( $root );
401 $tree->delete_vertex( $root );
402 $root = $nextroot[0];
403 }
404 # The tree has been modified in place, but we need to know the new root.
405 return $root;
406}
d71100ed 407# Add the variant, subject to a.c. representation logic.
408# This assumes that we will see the 'main' version before the a.c. version.
409sub add_variant_wit {
410 my( $arr, $wit, $acstr ) = @_;
411 my $skip;
412 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
413 my $real = $1;
414 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
415 }
416 push( @$arr, $wit ) unless $skip;
417}
418
7f52eac8 419=head2 wit_stringify( $groups )
420
421Takes an array of witness groupings and produces a string like
422['A','B'] / ['C','D','E'] / ['F']
d71100ed 423
7f52eac8 424=cut
d71100ed 425
426sub wit_stringify {
427 my $groups = shift;
428 my @gst;
429 # If we were passed an array of witnesses instead of an array of
430 # groupings, then "group" the witnesses first.
431 unless( ref( $groups->[0] ) ) {
432 my $mkgrp = [ $groups ];
433 $groups = $mkgrp;
434 }
435 foreach my $g ( @$groups ) {
436 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
437 }
438 return join( ' / ', @gst );
439}
7f52eac8 440
441sub _set {
442 my( $op, $lista, $listb ) = @_;
443 my %union;
444 my %scalars;
445 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
446 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
447 my @set;
448 if( $op eq 'intersection' ) {
449 @set = grep { $union{$_} == 2 } keys %union;
450 } elsif( $op eq 'symmdiff' ) {
451 @set = grep { $union{$_} == 1 } keys %union;
452 } elsif( $op eq 'union' ) {
453 @set = keys %union;
454 }
455 return map { $scalars{$_} } @set;
456}
457
4581;
459
460=head1 LICENSE
461
462This package is free software and is provided "as is" without express
463or implied warranty. You can redistribute it and/or modify it under
464the same terms as Perl itself.
465
466=head1 AUTHOR
467
468Tara L Andrews E<lt>aurum@cpan.orgE<gt>