better analysis with more information
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
e4386ba9 5use Benchmark;
d1348d38 6use Exporter 'import';
d71100ed 7use Text::Tradition;
8use Text::Tradition::Stemma;
9
d1348d38 10use vars qw/ @EXPORT_OK /;
a2cf85dd 11@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 12
7f52eac8 13=head1 NAME
14
15Text::Tradition::Analysis - functions for stemma analysis of a tradition
16
17=head1 SYNOPSIS
18
19 use Text::Tradition;
20 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
21 my $t = Text::Tradition->new(
22 'name' => 'this is a text',
23 'input' => 'TEI',
24 'file' => '/path/to/tei_parallel_seg_file.xml' );
25 $t->add_stemma( 'dotfile' => $stemmafile );
26
27 my $variant_data = run_analysis( $tradition );
28 # Recalculate rank $n treating all orthographic variants as equivalent
29 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
30
31=head1 DESCRIPTION
32
33Text::Tradition is a library for representation and analysis of collated
34texts, particularly medieval ones. The Collation is the central feature of
35a Tradition, where the text, its sequence of readings, and its relationships
36between readings are actually kept.
37
38=head1 SUBROUTINES
39
40=head2 run_analysis( $tradition, $stemma_id, @merge_relationship_types )
41
42Runs the analysis described in analyze_variant_location on every location
43in the collation of the given tradition, against the stemma specified in
44$stemma_id. If $stemma_id is not specified, it defaults to 0 (referencing
45the first stemma saved for the tradition.)
46
47The optional @merge_relationship_types contains a list of relationship types
48to treat as equivalent for the analysis.
49
50=begin testing
51
52use Text::Tradition;
53use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
54
55my $datafile = 't/data/florilegium_tei_ps.xml';
56my $tradition = Text::Tradition->new( 'input' => 'TEI',
57 'name' => 'test0',
58 'file' => $datafile );
59my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
60is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
61
62my $data = run_analysis( $tradition );
5be0cdeb 63# TODO Check genealogical count
64is( $data->{'genealogical_count'}, 13, "Got right genealogical count" );
65is( $data->{'conflict_count'}, 16, "Got right conflict count" );
66is( $data->{'variant_count'}, 28, "Got right total variant number" );
7f52eac8 67
68=end testing
69
70=cut
71
d71100ed 72sub run_analysis {
7f52eac8 73 my( $tradition, $stemma_id, @collapse ) = @_;
74 $stemma_id = 0 unless $stemma_id;
56cf65bd 75
7f52eac8 76 # Run the variant analysis on every rank in the graph that doesn't
77 # have a common reading. Return the results.
78 my @variants; # holds results from analyze_variant_location
79 my $genealogical; # counter of 'genealogical' variants
80 my $conflicts; # counter of conflicting readings
d71100ed 81
7f52eac8 82 # Find and mark 'common' ranks for exclusion.
83 my %common_rank;
84 foreach my $rdg ( $tradition->collation->common_readings ) {
85 $common_rank{$rdg->rank} = 1;
d71100ed 86 }
7f52eac8 87
88 foreach my $rank ( 1 .. $tradition->collation->end->rank-1 ) {
89 next if $common_rank{$rank};
90 my $variant_row = analyze_variant_location(
91 $tradition, $rank, $stemma_id, @collapse );
5be0cdeb 92 next unless $variant_row;
7f52eac8 93 push( @variants, $variant_row );
94 $genealogical++ if $variant_row->{'genealogical'};
95 $conflicts += grep { $_->{'conflict'} } @{$variant_row->{'readings'}};
d71100ed 96 }
97
7f52eac8 98 return {
99 'variants' => \@variants,
100 'variant_count' => scalar @variants, # TODO redundant
101 'conflict_count' => $conflicts,
102 'genealogical_count' => $genealogical,
103 };
d71100ed 104}
105
7f52eac8 106=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
107
108Groups the variants at the given $rank of the collation, treating any
109relationships in @merge_relationship_types as equivalent. $lacunose should
110be a reference to an array, to which the sigla of lacunose witnesses at this
111rank will be appended.
112
113Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
114by the witnesses listed in $groups->[$n].
115
116=cut
117
118# Return group_readings, groups, lacunose
d1348d38 119sub group_variants {
7f52eac8 120 my( $tradition, $rank, $lacunose, $collapse ) = @_;
121 my $c = $tradition->collation;
122 # Get the alignment table readings
123 my %readings_at_rank;
124 my @gap_wits;
125 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
126 my $rdg = $tablewit->{'tokens'}->[$rank-1];
127 if( $rdg && $rdg->{'t'}->is_lacuna ) {
5be0cdeb 128 _add_to_witlist( $tablewit->{'witness'}, $lacunose,
129 $tradition->collation->ac_label );
7f52eac8 130 } elsif( $rdg ) {
131 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
132 } else {
5be0cdeb 133 _add_to_witlist( $tablewit->{'witness'}, \@gap_wits,
134 $tradition->collation->ac_label );
7f52eac8 135 }
136 }
d1348d38 137
7f52eac8 138 # Group the readings, collapsing groups by relationship if needed
139 my %grouped_readings;
140 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
141 # Skip readings that have been collapsed into others.
142 next if exists $grouped_readings{$rdg->text} && !$grouped_readings{$rdg->text};
143 my @wits = $rdg->witnesses;
144 if( $collapse ) {
145 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
146 foreach my $other ( $rdg->related_readings( $filter ) ) {
147 push( @wits, $other->witnesses );
148 $grouped_readings{$other->text} = 0;
d1348d38 149 }
150 }
7f52eac8 151 $grouped_readings{$rdg->text} = \@wits;
152 }
153 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
154 # Get rid of our collapsed readings
155 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
156 keys %grouped_readings
157 if $collapse;
158
5be0cdeb 159 return \%grouped_readings;
d1348d38 160}
161
7f52eac8 162=head2 analyze_variant_location( $tradition, $rank, $stemma_id, @merge_relationship_types )
f6c8ea08 163
7f52eac8 164Runs an analysis of the given tradition, at the location given in $rank,
165against the graph of the stemma specified in $stemma_id. The argument
166@merge_relationship_types is an optional list of relationship types for
167which readings so related should be treated as equivalent.
f6c8ea08 168
7f52eac8 169Returns a data structure as follows:
170
171 { 'id' => $rank,
172 'genealogical' => boolean,
173 'readings => [ { text => $reading_text,
174 group => [ witnesses ],
175 conflict => [ conflicting ],
176 missing => [ excluded ] }, ... ]
177 }
178where 'conflicting' is the list of witnesses whose readings conflict with
179this group, and 'excluded' is the list of witnesses either not present in
180the stemma or lacunose at this location.
181
182=cut
732152b1 183
d71100ed 184sub analyze_variant_location {
7f52eac8 185 my( $tradition, $rank, $sid, @collapse ) = @_;
7f52eac8 186 # Get the readings in this tradition at this rank
187 my @rank_rdgs = grep { $_->rank == $rank } $tradition->collation->readings;
188 # Get the applicable stemma
189 my $undirected; # TODO Allow undirected distance tree analysis too
190 my $stemma = $tradition->stemma( $sid );
191 my $graph = $stemma->graph;
192 # Figure out which witnesses we are working with
bebec0e9 193 my @lacunose = $stemma->hypotheticals;
194 push( @lacunose, _symmdiff( [ $stemma->witnesses ],
195 [ map { $_->sigil } $tradition->witnesses ] ) );
7f52eac8 196
197 # Now group the readings
5be0cdeb 198 my( $readings, $groups ) = _useful_variant(
199 group_variants( $tradition, $rank, \@lacunose, \@collapse ),
200 $graph, $tradition->collation->ac_label );
201 return unless scalar @$readings;
7f52eac8 202 my $group_readings = {};
203 # Lookup table group string -> readings
204 foreach my $x ( 0 .. $#$groups ) {
205 $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
206 }
207
208 # Now do the work.
231d71fc 209 my $contig = {};
210 my $subgraph = {};
c4a4fb1b 211 my $is_conflicted;
d71100ed 212 my $conflict = {};
bebec0e9 213 my %reading_roots;
7f52eac8 214 my $variant_row = { 'id' => $rank, 'readings' => [] };
94a077d6 215 # Mark each ms as in its own group, first.
216 foreach my $g ( @$groups ) {
217 my $gst = wit_stringify( $g );
231d71fc 218 map { $contig->{$_} = $gst } @$g;
94a077d6 219 }
c4a4fb1b 220 # Now for each unmarked node in the graph, initialize an array
221 # for possible group memberships. We will use this later to
222 # resolve potential conflicts.
bebec0e9 223 $DB::single = 1 if $rank == 636;
231d71fc 224 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
d71100ed 225 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
c4a4fb1b 226 my $gst = wit_stringify( $g ); # This is the group name
08e0fb85 227 # Copy the graph, and delete all non-members from the new graph.
c4a4fb1b 228 my $part = $graph->copy;
bebec0e9 229 my @group_roots;
c4a4fb1b 230 $part->delete_vertices(
231d71fc 231 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
c4a4fb1b 232
233 # Now look to see if our group is connected.
234 if( $undirected ) { # For use with distance trees etc.
235 # Find all vertices reachable from the first (arbitrary) group
bebec0e9 236 # member. If we are genealogical this should include them all.
237 my $reachable = {};
c4a4fb1b 238 map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
239 # TODO This is a terrible way to do distance trees, since all
240 # non-leaf nodes are included in every graph part now. We may
241 # have to go back to SPDP.
242 } else {
243 if( @$g > 1 ) {
c4a4fb1b 244 # We have to take directionality into account.
245 # How many root nodes do we have?
231d71fc 246 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
bebec0e9 247 $part->predecessorless_vertices;
c4a4fb1b 248 # Assuming that @$g > 1, find the first root node that has at
249 # least one successor belonging to our group. If this reading
250 # is genealogical, there should be only one, but we will check
251 # that implicitly later.
c4a4fb1b 252 foreach my $root ( @roots ) {
231d71fc 253 # Prune the tree to get rid of extraneous hypotheticals.
7f52eac8 254 $root = _prune_subtree( $part, $root, $contig );
bebec0e9 255 next unless $root;
256 # Save this root for our group.
257 push( @group_roots, $root );
c4a4fb1b 258 # Get all the successor nodes of our root.
c4a4fb1b 259 }
bebec0e9 260 } else {
261 # Dispense with the trivial case of one reading.
262 @group_roots = @$g;
263 _prune_subtree( $part, @group_roots, $contig );
264 }
c4a4fb1b 265 }
266
bebec0e9 267 map { $reading_roots{$_} = 1 } @group_roots;
268 if( @group_roots > 1 ) {
269 $conflict->{$group_readings->{$gst}} = 1;
270 $is_conflicted = 1;
08e0fb85 271 }
bebec0e9 272 # Paint the 'hypotheticals' with our group.
273 foreach my $wit ( $part->vertices ) {
274 if( ref( $contig->{$wit} ) ) {
275 push( @{$contig->{$wit}}, $gst );
276 } elsif( $contig->{$wit} ne $gst ) {
277 warn "How did we get here?";
278 }
279 }
08e0fb85 280
c4a4fb1b 281
bebec0e9 282 # Start to write the reading, and save the group subgraph.
732152b1 283 my $reading = { 'text' => $group_readings->{$gst},
7f52eac8 284 'missing' => wit_stringify( \@lacunose ),
c4a4fb1b 285 'group' => $gst }; # This will change if we find no conflict
bebec0e9 286 # Save the relevant subgraph.
287 $subgraph->{$gst} = $part;
732152b1 288 push( @{$variant_row->{'readings'}}, $reading );
d71100ed 289 }
c4a4fb1b 290
bebec0e9 291 # For each of our hypothetical readings, flatten its 'contig' array if
292 # the array contains zero or one group. If we have any unflattened arrays,
293 # we may need to run the resolution process. If the reading is already known
294 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
295 # it.
296 my @resolve;
297 foreach my $wit ( keys %$contig ) {
298 next unless ref( $contig->{$wit} );
299 if( @{$contig->{$wit}} > 1 ) {
300 if( $is_conflicted ) {
301 $contig->{$wit} = ''; # We aren't going to decide.
302 } else {
303 push( @resolve, $wit );
304 }
305 } else {
306 my $gst = pop @{$contig->{$wit}};
307 $contig->{$wit} = $gst || '';
308 }
309 }
310
311 if( @resolve ) {
231d71fc 312 my $still_contig = {};
c4a4fb1b 313 foreach my $h ( @resolve ) {
314 # For each of the hypothetical readings with more than one possibility,
315 # try deleting it from each of its member subgraphs in turn, and see
316 # if that breaks the contiguous grouping.
317 # TODO This can still break in a corner case where group A can use
318 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
319 # Revisit this if necessary; it could get brute-force nasty.
231d71fc 320 foreach my $gst ( @{$contig->{$h}} ) {
bebec0e9 321 my $gpart = $subgraph->{$gst}->copy();
322 # If we have come this far, there is only one root and everything
323 # is reachable from it.
324 my( $root ) = $gpart->predecessorless_vertices;
325 my $reachable = {};
326 map { $reachable->{$_} = 1 } $gpart->vertices;
327
328 # Try deleting the hypothetical node.
c4a4fb1b 329 $gpart->delete_vertex( $h );
bebec0e9 330 if( $h eq $root ) {
331 # See if we still have a single root.
332 my @roots = $gpart->predecessorless_vertices;
333 warn "This shouldn't have happened" unless @roots;
334 if( @roots > 1 ) {
335 # $h is needed by this group.
336 if( exists( $still_contig->{$h} ) ) {
337 # Conflict!
338 $conflict->{$group_readings->{$gst}} = 1;
339 $still_contig->{$h} = '';
340 } else {
341 $still_contig->{$h} = $gst;
342 }
343 }
344 } else {
345 # $h is somewhere in the middle. See if everything
346 # else can still be reached from the root.
347 my %still_reachable = ( $root => 1 );
348 map { $still_reachable{$_} = 1 }
349 $gpart->all_successors( $root );
350 foreach my $v ( keys %$reachable ) {
351 next if $v eq $h;
352 if( !$still_reachable{$v}
353 && ( $contig->{$v} eq $gst
354 || ( exists $still_contig->{$v}
355 && $still_contig->{$v} eq $gst ) ) ) {
356 # We need $h.
357 if( exists $still_contig->{$h} ) {
358 # Conflict!
359 $conflict->{$group_readings->{$gst}} = 1;
360 $still_contig->{$h} = '';
361 } else {
362 $still_contig->{$h} = $gst;
363 }
364 last;
365 } # else we don't need $h in this group.
366 } # end foreach $v
367 } # endif $h eq $root
368 } # end foreach $gst
369 } # end foreach $h
c4a4fb1b 370
bebec0e9 371 # Now we have some hypothetical vertices in $still_contig that are the
372 # "real" group memberships. Replace these in $contig.
373 foreach my $v ( keys %$contig ) {
374 next unless ref $contig->{$v};
375 $contig->{$v} = $still_contig->{$v};
376 }
377 } # end if @resolve
378
379 # Now that we have all the node group memberships, calculate followed/
380 # non-followed/unknown values for each reading. Also figure out the
381 # reading's evident parent(s).
382 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
383 my $gst = $rdghash->{'group'};
384 my $part = $subgraph->{$gst};
385 my @roots = $part->predecessorless_vertices;
386 $rdghash->{'independent_occurrence'} = scalar @roots;
387 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
388 # Find the parent readings, if any, of this reading.
389 my @rdgparents;
390 foreach my $wit ( @roots ) {
391 # Look in the main stemma to find this witness's parent(s), and look
392 # up the reading that the parent holds.
393 foreach my $wparent( $graph->predecessors( $wit ) ) {
394 my $pgroup = $contig->{$wparent};
395 if( $pgroup ) {
396 push( @rdgparents, $group_readings->{$pgroup} );
397 }
398 }
399 }
400 $rdghash->{'reading_parents'} = \@rdgparents;
401
402 # Find the number of times this reading was altered, and the number of
403 # times we're not sure.
404 my( %nofollow, %unknownfollow );
405 foreach my $wit ( $part->vertices ) {
406 foreach my $wchild ( $graph->successors( $wit ) ) {
407 next if $part->has_vertex( $wchild );
408 if( $reading_roots{$wchild} && $contig->{$wchild} ) {
409 # It definitely changed here.
410 $nofollow{$wchild} = 1;
411 } elsif( !($contig->{$wchild}) ) {
412 # The child is a hypothetical node not definitely in
413 # any group. Answer is unknown.
414 $unknownfollow{$wchild} = 1;
415 } # else it's a non-root node in a known group, and therefore
416 # is presumed to have its reading from its group, not this link.
417 }
418 }
419 $rdghash->{'not_followed'} = keys %nofollow;
420 $rdghash->{'follow_unknown'} = keys %unknownfollow;
c4a4fb1b 421 }
bebec0e9 422
c4a4fb1b 423 # Now write the group and conflict information into the respective rows.
bebec0e9 424 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
425 $rdghash->{'conflict'} = $conflict->{$rdghash->{'text'}};
426 my @members = grep { $contig->{$_} eq $rdghash->{'group'} } keys %$contig;
427 $rdghash->{'group'} = wit_stringify( \@members );
c4a4fb1b 428 }
429
08e0fb85 430 $variant_row->{'genealogical'} = !( keys %$conflict );
732152b1 431 return $variant_row;
d71100ed 432}
433
7f52eac8 434sub _prune_subtree {
231d71fc 435 my( $tree, $root, $contighash ) = @_;
436 # First, delete hypothetical leaves / orphans until there are none left.
437 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
438 $tree->successorless_vertices;
439 while( @orphan_hypotheticals ) {
440 $tree->delete_vertices( @orphan_hypotheticals );
441 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
442 $tree->successorless_vertices;
443 }
444 # Then delete a hypothetical root with only one successor, moving the
bebec0e9 445 # root to the first child that has no other predecessors.
231d71fc 446 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
447 my @nextroot = $tree->successors( $root );
448 $tree->delete_vertex( $root );
bebec0e9 449 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
231d71fc 450 }
451 # The tree has been modified in place, but we need to know the new root.
bebec0e9 452 $root = undef unless $root && $tree->has_vertex( $root );
231d71fc 453 return $root;
454}
d71100ed 455# Add the variant, subject to a.c. representation logic.
456# This assumes that we will see the 'main' version before the a.c. version.
457sub add_variant_wit {
458 my( $arr, $wit, $acstr ) = @_;
459 my $skip;
460 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
461 my $real = $1;
462 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
463 }
464 push( @$arr, $wit ) unless $skip;
465}
466
5be0cdeb 467sub _useful_variant {
468 my( $group_readings, $graph, $acstr ) = @_;
469
470 # TODO Decide what to do with AC witnesses
471
472 # Sort by group size and return
473 my $is_useful = 0;
474 my( @readings, @groups ); # The sorted groups for our answer.
475 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
476 keys %$group_readings ) {
477 push( @readings, $rdg );
478 push( @groups, $group_readings->{$rdg} );
479 if( @{$group_readings->{$rdg}} > 1 ) {
480 $is_useful++;
481 } else {
482 my( $wit ) = @{$group_readings->{$rdg}};
483 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
484 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
485 }
486 }
487 if( $is_useful > 1 ) {
488 return( \@readings, \@groups );
489 } else {
490 return( [], [] );
491 }
492}
493
7f52eac8 494=head2 wit_stringify( $groups )
495
496Takes an array of witness groupings and produces a string like
497['A','B'] / ['C','D','E'] / ['F']
d71100ed 498
7f52eac8 499=cut
d71100ed 500
501sub wit_stringify {
502 my $groups = shift;
503 my @gst;
504 # If we were passed an array of witnesses instead of an array of
505 # groupings, then "group" the witnesses first.
506 unless( ref( $groups->[0] ) ) {
507 my $mkgrp = [ $groups ];
508 $groups = $mkgrp;
509 }
510 foreach my $g ( @$groups ) {
511 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
512 }
513 return join( ' / ', @gst );
514}
7f52eac8 515
5be0cdeb 516# Helper function to ensure that X and X a.c. never appear in the same list.
517sub _add_to_witlist {
518 my( $wit, $list, $acstr ) = @_;
519 my %inlist;
520 my $idx = 0;
521 map { $inlist{$_} = $idx++ } @$list;
522 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
523 my $acwit = $1;
524 unless( exists $inlist{$acwit} ) {
525 push( @$list, $acwit.$acstr );
526 }
527 } else {
528 if( exists( $inlist{$wit.$acstr} ) ) {
529 # Replace the a.c. version with the main witness
530 my $i = $inlist{$wit.$acstr};
531 $list->[$i] = $wit;
532 } else {
533 push( @$list, $wit );
534 }
535 }
536}
537
bebec0e9 538sub _symmdiff {
539 my( $lista, $listb ) = @_;
7f52eac8 540 my %union;
541 my %scalars;
542 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
543 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 544 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 545 return map { $scalars{$_} } @set;
546}
547
5481;
549
550=head1 LICENSE
551
552This package is free software and is provided "as is" without express
553or implied warranty. You can redistribute it and/or modify it under
554the same terms as Perl itself.
555
556=head1 AUTHOR
557
558Tara L Andrews E<lt>aurum@cpan.orgE<gt>