deal with global relationships
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
d1348d38 6use Exporter 'import';
d71100ed 7use Text::Tradition;
8use Text::Tradition::Stemma;
9
d1348d38 10use vars qw/ @EXPORT_OK /;
a2cf85dd 11@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 12
7f52eac8 13=head1 NAME
14
15Text::Tradition::Analysis - functions for stemma analysis of a tradition
16
17=head1 SYNOPSIS
18
19 use Text::Tradition;
20 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
21 my $t = Text::Tradition->new(
22 'name' => 'this is a text',
23 'input' => 'TEI',
24 'file' => '/path/to/tei_parallel_seg_file.xml' );
25 $t->add_stemma( 'dotfile' => $stemmafile );
26
27 my $variant_data = run_analysis( $tradition );
28 # Recalculate rank $n treating all orthographic variants as equivalent
29 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
30
31=head1 DESCRIPTION
32
33Text::Tradition is a library for representation and analysis of collated
34texts, particularly medieval ones. The Collation is the central feature of
35a Tradition, where the text, its sequence of readings, and its relationships
36between readings are actually kept.
37
38=head1 SUBROUTINES
39
40=head2 run_analysis( $tradition, $stemma_id, @merge_relationship_types )
41
42Runs the analysis described in analyze_variant_location on every location
43in the collation of the given tradition, against the stemma specified in
44$stemma_id. If $stemma_id is not specified, it defaults to 0 (referencing
45the first stemma saved for the tradition.)
46
47The optional @merge_relationship_types contains a list of relationship types
48to treat as equivalent for the analysis.
49
50=begin testing
51
52use Text::Tradition;
53use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
54
55my $datafile = 't/data/florilegium_tei_ps.xml';
56my $tradition = Text::Tradition->new( 'input' => 'TEI',
57 'name' => 'test0',
58 'file' => $datafile );
59my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
60is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
61
62my $data = run_analysis( $tradition );
5be0cdeb 63# TODO Check genealogical count
64is( $data->{'genealogical_count'}, 13, "Got right genealogical count" );
65is( $data->{'conflict_count'}, 16, "Got right conflict count" );
66is( $data->{'variant_count'}, 28, "Got right total variant number" );
7f52eac8 67
68=end testing
69
70=cut
71
d71100ed 72sub run_analysis {
7f52eac8 73 my( $tradition, $stemma_id, @collapse ) = @_;
74 $stemma_id = 0 unless $stemma_id;
56cf65bd 75
7f52eac8 76 # Run the variant analysis on every rank in the graph that doesn't
77 # have a common reading. Return the results.
78 my @variants; # holds results from analyze_variant_location
79 my $genealogical; # counter of 'genealogical' variants
80 my $conflicts; # counter of conflicting readings
d71100ed 81
7f52eac8 82 # Find and mark 'common' ranks for exclusion.
83 my %common_rank;
84 foreach my $rdg ( $tradition->collation->common_readings ) {
85 $common_rank{$rdg->rank} = 1;
d71100ed 86 }
7f52eac8 87
88 foreach my $rank ( 1 .. $tradition->collation->end->rank-1 ) {
89 next if $common_rank{$rank};
90 my $variant_row = analyze_variant_location(
91 $tradition, $rank, $stemma_id, @collapse );
5be0cdeb 92 next unless $variant_row;
7f52eac8 93 push( @variants, $variant_row );
94 $genealogical++ if $variant_row->{'genealogical'};
95 $conflicts += grep { $_->{'conflict'} } @{$variant_row->{'readings'}};
d71100ed 96 }
97
7f52eac8 98 return {
99 'variants' => \@variants,
100 'variant_count' => scalar @variants, # TODO redundant
101 'conflict_count' => $conflicts,
102 'genealogical_count' => $genealogical,
103 };
d71100ed 104}
105
7f52eac8 106=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
107
108Groups the variants at the given $rank of the collation, treating any
109relationships in @merge_relationship_types as equivalent. $lacunose should
110be a reference to an array, to which the sigla of lacunose witnesses at this
111rank will be appended.
112
113Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
114by the witnesses listed in $groups->[$n].
115
116=cut
117
118# Return group_readings, groups, lacunose
d1348d38 119sub group_variants {
7f52eac8 120 my( $tradition, $rank, $lacunose, $collapse ) = @_;
121 my $c = $tradition->collation;
122 # Get the alignment table readings
123 my %readings_at_rank;
124 my @gap_wits;
125 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
126 my $rdg = $tablewit->{'tokens'}->[$rank-1];
127 if( $rdg && $rdg->{'t'}->is_lacuna ) {
5be0cdeb 128 _add_to_witlist( $tablewit->{'witness'}, $lacunose,
129 $tradition->collation->ac_label );
7f52eac8 130 } elsif( $rdg ) {
131 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
132 } else {
5be0cdeb 133 _add_to_witlist( $tablewit->{'witness'}, \@gap_wits,
134 $tradition->collation->ac_label );
7f52eac8 135 }
136 }
d1348d38 137
7f52eac8 138 # Group the readings, collapsing groups by relationship if needed
139 my %grouped_readings;
140 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
141 # Skip readings that have been collapsed into others.
142 next if exists $grouped_readings{$rdg->text} && !$grouped_readings{$rdg->text};
143 my @wits = $rdg->witnesses;
144 if( $collapse ) {
145 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
146 foreach my $other ( $rdg->related_readings( $filter ) ) {
147 push( @wits, $other->witnesses );
148 $grouped_readings{$other->text} = 0;
d1348d38 149 }
150 }
7f52eac8 151 $grouped_readings{$rdg->text} = \@wits;
152 }
153 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
154 # Get rid of our collapsed readings
155 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
156 keys %grouped_readings
157 if $collapse;
158
5be0cdeb 159 return \%grouped_readings;
d1348d38 160}
161
7f52eac8 162=head2 analyze_variant_location( $tradition, $rank, $stemma_id, @merge_relationship_types )
f6c8ea08 163
7f52eac8 164Runs an analysis of the given tradition, at the location given in $rank,
165against the graph of the stemma specified in $stemma_id. The argument
166@merge_relationship_types is an optional list of relationship types for
167which readings so related should be treated as equivalent.
f6c8ea08 168
7f52eac8 169Returns a data structure as follows:
170
171 { 'id' => $rank,
172 'genealogical' => boolean,
173 'readings => [ { text => $reading_text,
174 group => [ witnesses ],
175 conflict => [ conflicting ],
176 missing => [ excluded ] }, ... ]
177 }
178where 'conflicting' is the list of witnesses whose readings conflict with
179this group, and 'excluded' is the list of witnesses either not present in
180the stemma or lacunose at this location.
181
182=cut
732152b1 183
d71100ed 184sub analyze_variant_location {
7f52eac8 185 my( $tradition, $rank, $sid, @collapse ) = @_;
7f52eac8 186 # Get the readings in this tradition at this rank
187 my @rank_rdgs = grep { $_->rank == $rank } $tradition->collation->readings;
188 # Get the applicable stemma
189 my $undirected; # TODO Allow undirected distance tree analysis too
190 my $stemma = $tradition->stemma( $sid );
191 my $graph = $stemma->graph;
192 # Figure out which witnesses we are working with
193 my @lacunose = _set( 'symmdiff', [ $stemma->witnesses ],
194 [ map { $_->sigil } $tradition->witnesses ] );
195
196 # Now group the readings
5be0cdeb 197 my( $readings, $groups ) = _useful_variant(
198 group_variants( $tradition, $rank, \@lacunose, \@collapse ),
199 $graph, $tradition->collation->ac_label );
200 return unless scalar @$readings;
7f52eac8 201 my $group_readings = {};
202 # Lookup table group string -> readings
203 foreach my $x ( 0 .. $#$groups ) {
204 $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
205 }
206
207 # Now do the work.
231d71fc 208 my $contig = {};
209 my $subgraph = {};
c4a4fb1b 210 my $is_conflicted;
d71100ed 211 my $conflict = {};
7f52eac8 212 my $variant_row = { 'id' => $rank, 'readings' => [] };
94a077d6 213 # Mark each ms as in its own group, first.
214 foreach my $g ( @$groups ) {
215 my $gst = wit_stringify( $g );
231d71fc 216 map { $contig->{$_} = $gst } @$g;
94a077d6 217 }
c4a4fb1b 218 # Now for each unmarked node in the graph, initialize an array
219 # for possible group memberships. We will use this later to
220 # resolve potential conflicts.
231d71fc 221 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
d71100ed 222 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
c4a4fb1b 223 my $gst = wit_stringify( $g ); # This is the group name
224 my $reachable = { $g->[0] => 1 };
08e0fb85 225 # Copy the graph, and delete all non-members from the new graph.
c4a4fb1b 226 my $part = $graph->copy;
227 my $group_root;
228 $part->delete_vertices(
231d71fc 229 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
c4a4fb1b 230
231 # Now look to see if our group is connected.
232 if( $undirected ) { # For use with distance trees etc.
233 # Find all vertices reachable from the first (arbitrary) group
234 # member. If we are genealogical this should include them all.
235 map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
236 # TODO This is a terrible way to do distance trees, since all
237 # non-leaf nodes are included in every graph part now. We may
238 # have to go back to SPDP.
239 } else {
240 if( @$g > 1 ) {
241 # Dispense with the trivial case of one reading.
242 # We have to take directionality into account.
243 # How many root nodes do we have?
231d71fc 244 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
c4a4fb1b 245 $part->source_vertices;
246 # Assuming that @$g > 1, find the first root node that has at
247 # least one successor belonging to our group. If this reading
248 # is genealogical, there should be only one, but we will check
249 # that implicitly later.
250 my $nodes_in_subtree = 0;
251 foreach my $root ( @roots ) {
231d71fc 252 # Prune the tree to get rid of extraneous hypotheticals.
7f52eac8 253 $root = _prune_subtree( $part, $root, $contig );
c4a4fb1b 254 # Get all the successor nodes of our root.
255 my $tmp_reach = { $root => 1 };
256 map { $tmp_reach->{$_} = 1 } $part->all_successors( $root );
257 # Skip this root if none of our successors are in our group
258 # (e.g. isolated 'hypothetical' witnesses with no group)
231d71fc 259 next unless grep { $contig->{$_} } keys %$tmp_reach;
c4a4fb1b 260 if( keys %$tmp_reach > $nodes_in_subtree ) {
261 $nodes_in_subtree = keys %$tmp_reach;
262 $reachable = $tmp_reach;
263 $group_root = $root;
264 }
265 }
266 } # else it is a single-node group, nothing to calculate.
267 }
268
269 # None of the 'reachable' nodes should be marked as being in another
270 # group. Paint the 'hypotheticals' with our group while we are at it,
271 # unless there is a conflict present.
272 foreach ( keys %$reachable ) {
231d71fc 273 if( ref $contig->{$_} ) {
274 push( @{$contig->{$_}}, $gst );
275 } elsif( $contig->{$_} ne $gst ) {
276 $conflict->{$group_readings->{$gst}} = $group_readings->{$contig->{$_}};
c4a4fb1b 277 } # else it is an 'extant' node marked with our group already.
d71100ed 278 }
08e0fb85 279 # None of the unreachable nodes should be in our group either.
280 foreach ( $part->vertices ) {
c4a4fb1b 281 next if $reachable->{$_};
231d71fc 282 if( $contig->{$_} eq $gst ) {
c4a4fb1b 283 $conflict->{$group_readings->{$gst}} = $group_readings->{$gst};
284 last;
285 }
08e0fb85 286 }
287
c4a4fb1b 288 # Now, if we have a conflict, we can write the reading in full. If not,
289 # we have to save the subgraph so that we can resolve possible conflicts
290 # on hypothetical nodes.
291 $is_conflicted = 1 if exists $conflict->{$group_readings->{$gst}};
292
732152b1 293 # Write the reading.
294 my $reading = { 'text' => $group_readings->{$gst},
7f52eac8 295 'missing' => wit_stringify( \@lacunose ),
c4a4fb1b 296 'group' => $gst }; # This will change if we find no conflict
297 if( $is_conflicted ) {
298 $reading->{'conflict'} = $conflict->{$group_readings->{$gst}}
732152b1 299 } else {
c4a4fb1b 300 # Save the relevant subgraph.
231d71fc 301 $subgraph->{$gst} = { 'graph' => $part,
c4a4fb1b 302 'root' => $group_root,
303 'reachable' => $reachable };
732152b1 304 }
305 push( @{$variant_row->{'readings'}}, $reading );
d71100ed 306 }
c4a4fb1b 307
308 # Now that we have gone through all the rows, check the hypothetical
309 # readings for conflict if we haven't found one yet.
231d71fc 310 if( keys %$subgraph && !keys %$conflict ) {
c4a4fb1b 311 my @resolve;
231d71fc 312 foreach ( keys %$contig ) {
313 next unless ref $contig->{$_};
314 if( scalar @{$contig->{$_}} > 1 ) {
c4a4fb1b 315 push( @resolve, $_ );
316 } else {
231d71fc 317 $contig->{$_} = scalar @{$contig->{$_}} ? $contig->{$_}->[0] : '';
c4a4fb1b 318 }
319 }
320 # Do we still have a possible conflict?
231d71fc 321 my $still_contig = {};
c4a4fb1b 322 foreach my $h ( @resolve ) {
323 # For each of the hypothetical readings with more than one possibility,
324 # try deleting it from each of its member subgraphs in turn, and see
325 # if that breaks the contiguous grouping.
326 # TODO This can still break in a corner case where group A can use
327 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
328 # Revisit this if necessary; it could get brute-force nasty.
231d71fc 329 foreach my $gst ( @{$contig->{$h}} ) {
330 my $gpart = $subgraph->{$gst}->{'graph'}->copy;
331 my $reachable = $subgraph->{$gst}->{'reachable'};
c4a4fb1b 332 $gpart->delete_vertex( $h );
333 # Is everything else still reachable from the root?
334 # TODO If $h was the root, see if we still have a single root.
231d71fc 335 my %still_reachable = ( $subgraph->{$gst}->{'root'} => 1 );
c4a4fb1b 336 map { $still_reachable{$_} = 1 }
231d71fc 337 $gpart->all_successors( $subgraph->{$gst}->{'root'} );
c4a4fb1b 338 foreach my $v ( keys %$reachable ) {
339 next if $v eq $h;
340 if( !$still_reachable{$v}
231d71fc 341 && ( $contig->{$v} eq $gst
342 || ( exists $still_contig->{$v}
343 && $still_contig->{$v} eq $gst ) ) ) {
c4a4fb1b 344 # We need $h.
231d71fc 345 if( exists $still_contig->{$h} ) {
c4a4fb1b 346 # Conflict!
347 $conflict->{$group_readings->{$gst}} =
231d71fc 348 $group_readings->{$still_contig->{$h}};
c4a4fb1b 349 } else {
231d71fc 350 $still_contig->{$h} = $gst;
c4a4fb1b 351 }
352 last;
353 } # else we don't need $h in this group.
354 }
355 }
356 }
357
358 # Now, assuming no conflict, we have some hypothetical vertices in
359 # $still_contig that are the "real" group memberships. Replace these
360 # in $contig.
361 unless ( keys %$conflict ) {
231d71fc 362 foreach my $v ( keys %$contig ) {
363 next unless ref $contig->{$v};
364 $contig->{$v} = $still_contig->{$v};
c4a4fb1b 365 }
366 }
367 }
368
369 # Now write the group and conflict information into the respective rows.
7f52eac8 370 my %missing;
371 map { $missing{$_} = 1 } @lacunose; # quick lookup table
c4a4fb1b 372 foreach my $rdg ( @{$variant_row->{'readings'}} ) {
373 $rdg->{'conflict'} = $conflict->{$rdg->{'text'}};
374 next if $rdg->{'conflict'};
7f52eac8 375 my @members = grep { $contig->{$_} eq $rdg->{'group'} && !$missing{$_} }
231d71fc 376 keys %$contig;
c4a4fb1b 377 $rdg->{'group'} = wit_stringify( \@members );
378 }
379
08e0fb85 380 $variant_row->{'genealogical'} = !( keys %$conflict );
732152b1 381 return $variant_row;
d71100ed 382}
383
7f52eac8 384sub _prune_subtree {
231d71fc 385 my( $tree, $root, $contighash ) = @_;
386 # First, delete hypothetical leaves / orphans until there are none left.
387 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
388 $tree->successorless_vertices;
389 while( @orphan_hypotheticals ) {
390 $tree->delete_vertices( @orphan_hypotheticals );
391 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
392 $tree->successorless_vertices;
393 }
394 # Then delete a hypothetical root with only one successor, moving the
395 # root to the child.
396 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
397 my @nextroot = $tree->successors( $root );
398 $tree->delete_vertex( $root );
399 $root = $nextroot[0];
400 }
401 # The tree has been modified in place, but we need to know the new root.
402 return $root;
403}
d71100ed 404# Add the variant, subject to a.c. representation logic.
405# This assumes that we will see the 'main' version before the a.c. version.
406sub add_variant_wit {
407 my( $arr, $wit, $acstr ) = @_;
408 my $skip;
409 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
410 my $real = $1;
411 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
412 }
413 push( @$arr, $wit ) unless $skip;
414}
415
5be0cdeb 416sub _useful_variant {
417 my( $group_readings, $graph, $acstr ) = @_;
418
419 # TODO Decide what to do with AC witnesses
420
421 # Sort by group size and return
422 my $is_useful = 0;
423 my( @readings, @groups ); # The sorted groups for our answer.
424 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
425 keys %$group_readings ) {
426 push( @readings, $rdg );
427 push( @groups, $group_readings->{$rdg} );
428 if( @{$group_readings->{$rdg}} > 1 ) {
429 $is_useful++;
430 } else {
431 my( $wit ) = @{$group_readings->{$rdg}};
432 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
433 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
434 }
435 }
436 if( $is_useful > 1 ) {
437 return( \@readings, \@groups );
438 } else {
439 return( [], [] );
440 }
441}
442
7f52eac8 443=head2 wit_stringify( $groups )
444
445Takes an array of witness groupings and produces a string like
446['A','B'] / ['C','D','E'] / ['F']
d71100ed 447
7f52eac8 448=cut
d71100ed 449
450sub wit_stringify {
451 my $groups = shift;
452 my @gst;
453 # If we were passed an array of witnesses instead of an array of
454 # groupings, then "group" the witnesses first.
455 unless( ref( $groups->[0] ) ) {
456 my $mkgrp = [ $groups ];
457 $groups = $mkgrp;
458 }
459 foreach my $g ( @$groups ) {
460 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
461 }
462 return join( ' / ', @gst );
463}
7f52eac8 464
5be0cdeb 465# Helper function to ensure that X and X a.c. never appear in the same list.
466sub _add_to_witlist {
467 my( $wit, $list, $acstr ) = @_;
468 my %inlist;
469 my $idx = 0;
470 map { $inlist{$_} = $idx++ } @$list;
471 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
472 my $acwit = $1;
473 unless( exists $inlist{$acwit} ) {
474 push( @$list, $acwit.$acstr );
475 }
476 } else {
477 if( exists( $inlist{$wit.$acstr} ) ) {
478 # Replace the a.c. version with the main witness
479 my $i = $inlist{$wit.$acstr};
480 $list->[$i] = $wit;
481 } else {
482 push( @$list, $wit );
483 }
484 }
485}
486
7f52eac8 487sub _set {
488 my( $op, $lista, $listb ) = @_;
489 my %union;
490 my %scalars;
491 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
492 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
493 my @set;
494 if( $op eq 'intersection' ) {
495 @set = grep { $union{$_} == 2 } keys %union;
496 } elsif( $op eq 'symmdiff' ) {
497 @set = grep { $union{$_} == 1 } keys %union;
498 } elsif( $op eq 'union' ) {
499 @set = keys %union;
500 }
501 return map { $scalars{$_} } @set;
502}
503
5041;
505
506=head1 LICENSE
507
508This package is free software and is provided "as is" without express
509or implied warranty. You can redistribute it and/or modify it under
510the same terms as Perl itself.
511
512=head1 AUTHOR
513
514Tara L Andrews E<lt>aurum@cpan.orgE<gt>