1 package Text::Tradition::Analysis;
8 use Text::Tradition::Stemma;
10 use vars qw/ @EXPORT_OK /;
11 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
15 Text::Tradition::Analysis - functions for stemma analysis of a tradition
20 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
21 my $t = Text::Tradition->new(
22 'name' => 'this is a text',
24 'file' => '/path/to/tei_parallel_seg_file.xml' );
25 $t->add_stemma( 'dotfile' => $stemmafile );
27 my $variant_data = run_analysis( $tradition );
28 # Recalculate rank $n treating all orthographic variants as equivalent
29 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
33 Text::Tradition is a library for representation and analysis of collated
34 texts, particularly medieval ones. The Collation is the central feature of
35 a Tradition, where the text, its sequence of readings, and its relationships
36 between readings are actually kept.
40 =head2 run_analysis( $tradition, $stemma_id, @merge_relationship_types )
42 Runs the analysis described in analyze_variant_location on every location
43 in the collation of the given tradition, against the stemma specified in
44 $stemma_id. If $stemma_id is not specified, it defaults to 0 (referencing
45 the first stemma saved for the tradition.)
47 The optional @merge_relationship_types contains a list of relationship types
48 to treat as equivalent for the analysis.
53 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
55 my $datafile = 't/data/florilegium_tei_ps.xml';
56 my $tradition = Text::Tradition->new( 'input' => 'TEI',
58 'file' => $datafile );
59 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
60 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
62 my %expected_genealogical = (
93 my $data = run_analysis( $tradition );
94 foreach my $row ( @{$data->{'variants'}} ) {
95 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
96 "Got correct genealogical flag for row " . $row->{'id'} );
98 is( $data->{'conflict_count'}, 16, "Got right conflict count" );
99 is( $data->{'variant_count'}, 28, "Got right total variant number" );
106 my( $tradition, $stemma_id, @collapse ) = @_;
107 my $c = $tradition->collation;
108 $stemma_id = 0 unless $stemma_id;
110 # Run the variant analysis on every rank in the graph that doesn't
111 # have a common reading. Return the results.
112 my @variants; # holds results from analyze_variant_location
113 my $genealogical; # counter of 'genealogical' variants
114 my $conflicts; # counter of conflicting readings
116 # Find and mark 'common' ranks for exclusion.
118 foreach my $rdg ( $tradition->collation->common_readings ) {
119 $common_rank{$rdg->rank} = 1;
122 foreach my $rank ( 1 .. $tradition->collation->end->rank-1 ) {
123 next if $common_rank{$rank};
124 my $variant_row = analyze_variant_location(
125 $tradition, $rank, $stemma_id, @collapse );
126 next unless $variant_row;
127 # Add the reading text to the readings in variant_row
128 foreach my $rh ( @{$variant_row->{'readings'}} ) {
129 if( $c->reading( $rh->{'readingid'} ) ) {
130 $rh->{'text'} = $c->reading( $rh->{'readingid'} )->text;
132 $rh->{'text'} = $rh->{'readingid'};
135 push( @variants, $variant_row );
136 $genealogical++ if $variant_row->{'genealogical'};
137 $conflicts += grep { $_->{'conflict'} } @{$variant_row->{'readings'}};
142 'variants' => \@variants,
143 'variant_count' => scalar @variants, # TODO redundant
144 'conflict_count' => $conflicts,
145 'genealogical_count' => $genealogical,
149 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
151 Groups the variants at the given $rank of the collation, treating any
152 relationships in @merge_relationship_types as equivalent. $lacunose should
153 be a reference to an array, to which the sigla of lacunose witnesses at this
154 rank will be appended.
156 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
157 by the witnesses listed in $groups->[$n].
161 # Return group_readings, groups, lacunose
163 my( $tradition, $rank, $lacunose, $collapse ) = @_;
164 my $c = $tradition->collation;
165 # Get the alignment table readings
166 my %readings_at_rank;
168 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
169 my $rdg = $tablewit->{'tokens'}->[$rank-1];
170 if( $rdg && $rdg->{'t'}->is_lacuna ) {
171 _add_to_witlist( $tablewit->{'witness'}, $lacunose,
172 $tradition->collation->ac_label );
174 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
176 _add_to_witlist( $tablewit->{'witness'}, \@gap_wits,
177 $tradition->collation->ac_label );
181 # Group the readings, collapsing groups by relationship if needed
182 my %grouped_readings;
183 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
184 # Skip readings that have been collapsed into others.
185 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
186 my @wits = $rdg->witnesses;
188 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
189 foreach my $other ( $rdg->related_readings( $filter ) ) {
190 push( @wits, $other->witnesses );
191 $grouped_readings{$other->id} = 0;
194 $grouped_readings{$rdg->id} = \@wits;
196 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
197 # Get rid of our collapsed readings
198 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
199 keys %grouped_readings
202 return \%grouped_readings;
205 =head2 analyze_variant_location( $tradition, $rank, $stemma_id, @merge_relationship_types )
207 Runs an analysis of the given tradition, at the location given in $rank,
208 against the graph of the stemma specified in $stemma_id. The argument
209 @merge_relationship_types is an optional list of relationship types for
210 which readings so related should be treated as equivalent.
212 Returns a data structure as follows:
215 'genealogical' => boolean,
216 'readings => [ { readingid => $reading_id,
217 group => [ witnesses ],
218 conflict => [ conflicting ],
219 missing => [ excluded ] }, ... ]
221 where 'conflicting' is the list of witnesses whose readings conflict with
222 this group, and 'excluded' is the list of witnesses either not present in
223 the stemma or lacunose at this location.
227 sub analyze_variant_location {
228 my( $tradition, $rank, $sid, @collapse ) = @_;
229 # Get the readings in this tradition at this rank
230 my @rank_rdgs = grep { $_->rank == $rank } $tradition->collation->readings;
231 # Get the applicable stemma
232 my $undirected; # TODO Allow undirected distance tree analysis too
233 my $stemma = $tradition->stemma( $sid );
234 my $graph = $stemma->graph;
235 # Figure out which witnesses we are working with
236 my @lacunose = $stemma->hypotheticals;
237 push( @lacunose, _symmdiff( [ $stemma->witnesses ],
238 [ map { $_->sigil } $tradition->witnesses ] ) );
240 # Now group the readings
241 my( $readings, $groups ) = _useful_variant(
242 group_variants( $tradition, $rank, \@lacunose, \@collapse ),
243 $graph, $tradition->collation->ac_label );
244 return unless scalar @$readings;
245 my $group_readings = {};
246 # Lookup table group string -> readings
247 foreach my $x ( 0 .. $#$groups ) {
248 $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
257 my $variant_row = { 'id' => $rank, 'readings' => [] };
258 # Mark each ms as in its own group, first.
259 $DB::single = 1 if $rank == 81;
260 foreach my $g ( @$groups ) {
261 my $gst = wit_stringify( $g );
262 map { $contig->{$_} = $gst } @$g;
264 # Now for each unmarked node in the graph, initialize an array
265 # for possible group memberships. We will use this later to
266 # resolve potential conflicts.
267 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
268 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
269 my $gst = wit_stringify( $g ); # This is the group name
270 # Copy the graph, and delete all non-members from the new graph.
271 my $part = $graph->copy;
273 $part->delete_vertices(
274 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
276 # Now look to see if our group is connected.
277 if( $undirected ) { # For use with distance trees etc.
278 # Find all vertices reachable from the first (arbitrary) group
279 # member. If we are genealogical this should include them all.
281 map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
282 # TODO This is a terrible way to do distance trees, since all
283 # non-leaf nodes are included in every graph part now. We may
284 # have to go back to SPDP.
287 # We have to take directionality into account.
288 # How many root nodes do we have?
289 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
290 $part->predecessorless_vertices;
291 # Assuming that @$g > 1, find the first root node that has at
292 # least one successor belonging to our group. If this reading
293 # is genealogical, there should be only one, but we will check
294 # that implicitly later.
295 foreach my $root ( @roots ) {
296 # Prune the tree to get rid of extraneous hypotheticals.
297 $root = _prune_subtree( $part, $root, $contig );
299 # Save this root for our group.
300 push( @group_roots, $root );
301 # Get all the successor nodes of our root.
304 # Dispense with the trivial case of one reading.
306 @group_roots = ( $wit );
307 foreach my $v ( $part->vertices ) {
308 $part->delete_vertex( $v ) unless $v eq $wit;
313 map { $reading_roots{$_} = 1 } @group_roots;
314 if( @group_roots > 1 ) {
315 $conflict->{$group_readings->{$gst}} = 1;
318 # Paint the 'hypotheticals' with our group.
319 foreach my $wit ( $part->vertices ) {
320 if( ref( $contig->{$wit} ) ) {
321 push( @{$contig->{$wit}}, $gst );
322 } elsif( $contig->{$wit} ne $gst ) {
323 warn "How did we get here?";
328 # Start to write the reading, and save the group subgraph.
329 my $reading = { 'readingid' => $group_readings->{$gst},
330 'missing' => wit_stringify( \@lacunose ),
331 'group' => $gst }; # This will change if we find no conflict
332 # Save the relevant subgraph.
333 $subgraph->{$gst} = $part;
334 push( @{$variant_row->{'readings'}}, $reading );
337 # For each of our hypothetical readings, flatten its 'contig' array if
338 # the array contains zero or one group. If we have any unflattened arrays,
339 # we may need to run the resolution process. If the reading is already known
340 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
343 foreach my $wit ( keys %$contig ) {
344 next unless ref( $contig->{$wit} );
345 if( @{$contig->{$wit}} > 1 ) {
346 if( $is_conflicted ) {
347 $contig->{$wit} = ''; # We aren't going to decide.
349 push( @resolve, $wit );
352 my $gst = pop @{$contig->{$wit}};
353 $contig->{$wit} = $gst || '';
358 my $still_contig = {};
359 foreach my $h ( @resolve ) {
360 # For each of the hypothetical readings with more than one possibility,
361 # try deleting it from each of its member subgraphs in turn, and see
362 # if that breaks the contiguous grouping.
363 # TODO This can still break in a corner case where group A can use
364 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
365 # Revisit this if necessary; it could get brute-force nasty.
366 foreach my $gst ( @{$contig->{$h}} ) {
367 my $gpart = $subgraph->{$gst}->copy();
368 # If we have come this far, there is only one root and everything
369 # is reachable from it.
370 my( $root ) = $gpart->predecessorless_vertices;
372 map { $reachable->{$_} = 1 } $gpart->vertices;
374 # Try deleting the hypothetical node.
375 $gpart->delete_vertex( $h );
377 # See if we still have a single root.
378 my @roots = $gpart->predecessorless_vertices;
379 warn "This shouldn't have happened" unless @roots;
381 # $h is needed by this group.
382 if( exists( $still_contig->{$h} ) ) {
384 $conflict->{$group_readings->{$gst}} = 1;
385 $still_contig->{$h} = '';
387 $still_contig->{$h} = $gst;
391 # $h is somewhere in the middle. See if everything
392 # else can still be reached from the root.
393 my %still_reachable = ( $root => 1 );
394 map { $still_reachable{$_} = 1 }
395 $gpart->all_successors( $root );
396 foreach my $v ( keys %$reachable ) {
398 if( !$still_reachable{$v}
399 && ( $contig->{$v} eq $gst
400 || ( exists $still_contig->{$v}
401 && $still_contig->{$v} eq $gst ) ) ) {
403 if( exists $still_contig->{$h} ) {
405 $conflict->{$group_readings->{$gst}} = 1;
406 $still_contig->{$h} = '';
408 $still_contig->{$h} = $gst;
411 } # else we don't need $h in this group.
413 } # endif $h eq $root
417 # Now we have some hypothetical vertices in $still_contig that are the
418 # "real" group memberships. Replace these in $contig.
419 foreach my $v ( keys %$contig ) {
420 next unless ref $contig->{$v};
421 $contig->{$v} = $still_contig->{$v};
425 # Now that we have all the node group memberships, calculate followed/
426 # non-followed/unknown values for each reading. Also figure out the
427 # reading's evident parent(s).
428 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
429 my $gst = $rdghash->{'group'};
430 my $part = $subgraph->{$gst};
431 my @roots = $part->predecessorless_vertices;
432 $rdghash->{'independent_occurrence'} = scalar @roots;
433 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
434 # Find the parent readings, if any, of this reading.
436 foreach my $wit ( @roots ) {
437 # Look in the main stemma to find this witness's extant or known-reading
438 # immediate ancestor(s), and look up the reading that each ancestor olds.
439 my @check = $graph->predecessors( $wit );
442 foreach my $wparent( @check ) {
443 my $pgroup = $contig->{$wparent};
445 $rdgparents{$group_readings->{$pgroup}} = 1;
447 push( @next, $graph->predecessors( $wparent ) );
453 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
455 # Find the number of times this reading was altered, and the number of
456 # times we're not sure.
457 my( %nofollow, %unknownfollow );
458 foreach my $wit ( $part->vertices ) {
459 foreach my $wchild ( $graph->successors( $wit ) ) {
460 next if $part->has_vertex( $wchild );
461 if( $reading_roots{$wchild} && $contig->{$wchild} ) {
462 # It definitely changed here.
463 $nofollow{$wchild} = 1;
464 } elsif( !($contig->{$wchild}) ) {
465 # The child is a hypothetical node not definitely in
466 # any group. Answer is unknown.
467 $unknownfollow{$wchild} = 1;
468 } # else it's a non-root node in a known group, and therefore
469 # is presumed to have its reading from its group, not this link.
472 $rdghash->{'not_followed'} = keys %nofollow;
473 $rdghash->{'follow_unknown'} = keys %unknownfollow;
476 # Now write the group and conflict information into the respective rows.
477 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
478 $rdghash->{'conflict'} = $conflict->{$rdghash->{'readingid'}};
479 my @members = grep { $contig->{$_} eq $rdghash->{'group'} } keys %$contig;
480 $rdghash->{'group'} = wit_stringify( \@members );
483 $variant_row->{'genealogical'} = !( keys %$conflict );
488 my( $tree, $root, $contighash ) = @_;
489 # First, delete hypothetical leaves / orphans until there are none left.
490 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
491 $tree->successorless_vertices;
492 while( @orphan_hypotheticals ) {
493 $tree->delete_vertices( @orphan_hypotheticals );
494 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
495 $tree->successorless_vertices;
497 # Then delete a hypothetical root with only one successor, moving the
498 # root to the first child that has no other predecessors.
499 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
500 my @nextroot = $tree->successors( $root );
501 $tree->delete_vertex( $root );
502 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
504 # The tree has been modified in place, but we need to know the new root.
505 $root = undef unless $root && $tree->has_vertex( $root );
508 # Add the variant, subject to a.c. representation logic.
509 # This assumes that we will see the 'main' version before the a.c. version.
510 sub add_variant_wit {
511 my( $arr, $wit, $acstr ) = @_;
513 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
515 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
517 push( @$arr, $wit ) unless $skip;
520 sub _useful_variant {
521 my( $group_readings, $graph, $acstr ) = @_;
523 # TODO Decide what to do with AC witnesses
525 # Sort by group size and return
527 my( @readings, @groups ); # The sorted groups for our answer.
528 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
529 keys %$group_readings ) {
530 push( @readings, $rdg );
531 push( @groups, $group_readings->{$rdg} );
532 if( @{$group_readings->{$rdg}} > 1 ) {
535 my( $wit ) = @{$group_readings->{$rdg}};
536 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
537 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
540 if( $is_useful > 1 ) {
541 return( \@readings, \@groups );
547 =head2 wit_stringify( $groups )
549 Takes an array of witness groupings and produces a string like
550 ['A','B'] / ['C','D','E'] / ['F']
557 # If we were passed an array of witnesses instead of an array of
558 # groupings, then "group" the witnesses first.
559 unless( ref( $groups->[0] ) ) {
560 my $mkgrp = [ $groups ];
563 foreach my $g ( @$groups ) {
564 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
566 return join( ' / ', @gst );
569 # Helper function to ensure that X and X a.c. never appear in the same list.
570 sub _add_to_witlist {
571 my( $wit, $list, $acstr ) = @_;
574 map { $inlist{$_} = $idx++ } @$list;
575 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
577 unless( exists $inlist{$acwit} ) {
578 push( @$list, $acwit.$acstr );
581 if( exists( $inlist{$wit.$acstr} ) ) {
582 # Replace the a.c. version with the main witness
583 my $i = $inlist{$wit.$acstr};
586 push( @$list, $wit );
592 my( $lista, $listb ) = @_;
595 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
596 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
597 my @set = grep { $union{$_} == 1 } keys %union;
598 return map { $scalars{$_} } @set;
605 This package is free software and is provided "as is" without express
606 or implied warranty. You can redistribute it and/or modify it under
607 the same terms as Perl itself.
611 Tara L Andrews E<lt>aurum@cpan.orgE<gt>