1 package Text::Tradition::Analysis;
8 use Text::Tradition::Stemma;
10 use vars qw/ @EXPORT_OK /;
11 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
15 Text::Tradition::Analysis - functions for stemma analysis of a tradition
20 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
21 my $t = Text::Tradition->new(
22 'name' => 'this is a text',
24 'file' => '/path/to/tei_parallel_seg_file.xml' );
25 $t->add_stemma( 'dotfile' => $stemmafile );
27 my $variant_data = run_analysis( $tradition );
28 # Recalculate rank $n treating all orthographic variants as equivalent
29 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
33 Text::Tradition is a library for representation and analysis of collated
34 texts, particularly medieval ones. The Collation is the central feature of
35 a Tradition, where the text, its sequence of readings, and its relationships
36 between readings are actually kept.
40 =head2 run_analysis( $tradition, $stemma_id, @merge_relationship_types )
42 Runs the analysis described in analyze_variant_location on every location
43 in the collation of the given tradition, against the stemma specified in
44 $stemma_id. If $stemma_id is not specified, it defaults to 0 (referencing
45 the first stemma saved for the tradition.)
47 The optional @merge_relationship_types contains a list of relationship types
48 to treat as equivalent for the analysis.
53 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
55 my $datafile = 't/data/florilegium_tei_ps.xml';
56 my $tradition = Text::Tradition->new( 'input' => 'TEI',
58 'file' => $datafile );
59 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
60 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
62 my $data = run_analysis( $tradition );
63 # TODO Check genealogical count
64 is( $data->{'genealogical_count'}, 13, "Got right genealogical count" );
65 is( $data->{'conflict_count'}, 16, "Got right conflict count" );
66 is( $data->{'variant_count'}, 28, "Got right total variant number" );
73 my( $tradition, $stemma_id, @collapse ) = @_;
74 $stemma_id = 0 unless $stemma_id;
76 # Run the variant analysis on every rank in the graph that doesn't
77 # have a common reading. Return the results.
78 my @variants; # holds results from analyze_variant_location
79 my $genealogical; # counter of 'genealogical' variants
80 my $conflicts; # counter of conflicting readings
82 # Find and mark 'common' ranks for exclusion.
84 foreach my $rdg ( $tradition->collation->common_readings ) {
85 $common_rank{$rdg->rank} = 1;
88 foreach my $rank ( 1 .. $tradition->collation->end->rank-1 ) {
89 next if $common_rank{$rank};
90 my $variant_row = analyze_variant_location(
91 $tradition, $rank, $stemma_id, @collapse );
92 next unless $variant_row;
93 push( @variants, $variant_row );
94 $genealogical++ if $variant_row->{'genealogical'};
95 $conflicts += grep { $_->{'conflict'} } @{$variant_row->{'readings'}};
99 'variants' => \@variants,
100 'variant_count' => scalar @variants, # TODO redundant
101 'conflict_count' => $conflicts,
102 'genealogical_count' => $genealogical,
106 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
108 Groups the variants at the given $rank of the collation, treating any
109 relationships in @merge_relationship_types as equivalent. $lacunose should
110 be a reference to an array, to which the sigla of lacunose witnesses at this
111 rank will be appended.
113 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
114 by the witnesses listed in $groups->[$n].
118 # Return group_readings, groups, lacunose
120 my( $tradition, $rank, $lacunose, $collapse ) = @_;
121 my $c = $tradition->collation;
122 # Get the alignment table readings
123 my %readings_at_rank;
125 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
126 my $rdg = $tablewit->{'tokens'}->[$rank-1];
127 if( $rdg && $rdg->{'t'}->is_lacuna ) {
128 _add_to_witlist( $tablewit->{'witness'}, $lacunose,
129 $tradition->collation->ac_label );
131 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
133 _add_to_witlist( $tablewit->{'witness'}, \@gap_wits,
134 $tradition->collation->ac_label );
138 # Group the readings, collapsing groups by relationship if needed
139 my %grouped_readings;
140 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
141 # Skip readings that have been collapsed into others.
142 next if exists $grouped_readings{$rdg->text} && !$grouped_readings{$rdg->text};
143 my @wits = $rdg->witnesses;
145 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
146 foreach my $other ( $rdg->related_readings( $filter ) ) {
147 push( @wits, $other->witnesses );
148 $grouped_readings{$other->text} = 0;
151 $grouped_readings{$rdg->text} = \@wits;
153 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
154 # Get rid of our collapsed readings
155 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
156 keys %grouped_readings
159 return \%grouped_readings;
162 =head2 analyze_variant_location( $tradition, $rank, $stemma_id, @merge_relationship_types )
164 Runs an analysis of the given tradition, at the location given in $rank,
165 against the graph of the stemma specified in $stemma_id. The argument
166 @merge_relationship_types is an optional list of relationship types for
167 which readings so related should be treated as equivalent.
169 Returns a data structure as follows:
172 'genealogical' => boolean,
173 'readings => [ { text => $reading_text,
174 group => [ witnesses ],
175 conflict => [ conflicting ],
176 missing => [ excluded ] }, ... ]
178 where 'conflicting' is the list of witnesses whose readings conflict with
179 this group, and 'excluded' is the list of witnesses either not present in
180 the stemma or lacunose at this location.
184 sub analyze_variant_location {
185 my( $tradition, $rank, $sid, @collapse ) = @_;
186 # Get the readings in this tradition at this rank
187 my @rank_rdgs = grep { $_->rank == $rank } $tradition->collation->readings;
188 # Get the applicable stemma
189 my $undirected; # TODO Allow undirected distance tree analysis too
190 my $stemma = $tradition->stemma( $sid );
191 my $graph = $stemma->graph;
192 # Figure out which witnesses we are working with
193 my @lacunose = $stemma->hypotheticals;
194 push( @lacunose, _symmdiff( [ $stemma->witnesses ],
195 [ map { $_->sigil } $tradition->witnesses ] ) );
197 # Now group the readings
198 my( $readings, $groups ) = _useful_variant(
199 group_variants( $tradition, $rank, \@lacunose, \@collapse ),
200 $graph, $tradition->collation->ac_label );
201 return unless scalar @$readings;
202 my $group_readings = {};
203 # Lookup table group string -> readings
204 foreach my $x ( 0 .. $#$groups ) {
205 $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
214 my $variant_row = { 'id' => $rank, 'readings' => [] };
215 # Mark each ms as in its own group, first.
216 foreach my $g ( @$groups ) {
217 my $gst = wit_stringify( $g );
218 map { $contig->{$_} = $gst } @$g;
220 # Now for each unmarked node in the graph, initialize an array
221 # for possible group memberships. We will use this later to
222 # resolve potential conflicts.
223 $DB::single = 1 if $rank == 636;
224 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
225 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
226 my $gst = wit_stringify( $g ); # This is the group name
227 # Copy the graph, and delete all non-members from the new graph.
228 my $part = $graph->copy;
230 $part->delete_vertices(
231 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
233 # Now look to see if our group is connected.
234 if( $undirected ) { # For use with distance trees etc.
235 # Find all vertices reachable from the first (arbitrary) group
236 # member. If we are genealogical this should include them all.
238 map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
239 # TODO This is a terrible way to do distance trees, since all
240 # non-leaf nodes are included in every graph part now. We may
241 # have to go back to SPDP.
244 # We have to take directionality into account.
245 # How many root nodes do we have?
246 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
247 $part->predecessorless_vertices;
248 # Assuming that @$g > 1, find the first root node that has at
249 # least one successor belonging to our group. If this reading
250 # is genealogical, there should be only one, but we will check
251 # that implicitly later.
252 foreach my $root ( @roots ) {
253 # Prune the tree to get rid of extraneous hypotheticals.
254 $root = _prune_subtree( $part, $root, $contig );
256 # Save this root for our group.
257 push( @group_roots, $root );
258 # Get all the successor nodes of our root.
261 # Dispense with the trivial case of one reading.
263 _prune_subtree( $part, @group_roots, $contig );
267 map { $reading_roots{$_} = 1 } @group_roots;
268 if( @group_roots > 1 ) {
269 $conflict->{$group_readings->{$gst}} = 1;
272 # Paint the 'hypotheticals' with our group.
273 foreach my $wit ( $part->vertices ) {
274 if( ref( $contig->{$wit} ) ) {
275 push( @{$contig->{$wit}}, $gst );
276 } elsif( $contig->{$wit} ne $gst ) {
277 warn "How did we get here?";
282 # Start to write the reading, and save the group subgraph.
283 my $reading = { 'text' => $group_readings->{$gst},
284 'missing' => wit_stringify( \@lacunose ),
285 'group' => $gst }; # This will change if we find no conflict
286 # Save the relevant subgraph.
287 $subgraph->{$gst} = $part;
288 push( @{$variant_row->{'readings'}}, $reading );
291 # For each of our hypothetical readings, flatten its 'contig' array if
292 # the array contains zero or one group. If we have any unflattened arrays,
293 # we may need to run the resolution process. If the reading is already known
294 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
297 foreach my $wit ( keys %$contig ) {
298 next unless ref( $contig->{$wit} );
299 if( @{$contig->{$wit}} > 1 ) {
300 if( $is_conflicted ) {
301 $contig->{$wit} = ''; # We aren't going to decide.
303 push( @resolve, $wit );
306 my $gst = pop @{$contig->{$wit}};
307 $contig->{$wit} = $gst || '';
312 my $still_contig = {};
313 foreach my $h ( @resolve ) {
314 # For each of the hypothetical readings with more than one possibility,
315 # try deleting it from each of its member subgraphs in turn, and see
316 # if that breaks the contiguous grouping.
317 # TODO This can still break in a corner case where group A can use
318 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
319 # Revisit this if necessary; it could get brute-force nasty.
320 foreach my $gst ( @{$contig->{$h}} ) {
321 my $gpart = $subgraph->{$gst}->copy();
322 # If we have come this far, there is only one root and everything
323 # is reachable from it.
324 my( $root ) = $gpart->predecessorless_vertices;
326 map { $reachable->{$_} = 1 } $gpart->vertices;
328 # Try deleting the hypothetical node.
329 $gpart->delete_vertex( $h );
331 # See if we still have a single root.
332 my @roots = $gpart->predecessorless_vertices;
333 warn "This shouldn't have happened" unless @roots;
335 # $h is needed by this group.
336 if( exists( $still_contig->{$h} ) ) {
338 $conflict->{$group_readings->{$gst}} = 1;
339 $still_contig->{$h} = '';
341 $still_contig->{$h} = $gst;
345 # $h is somewhere in the middle. See if everything
346 # else can still be reached from the root.
347 my %still_reachable = ( $root => 1 );
348 map { $still_reachable{$_} = 1 }
349 $gpart->all_successors( $root );
350 foreach my $v ( keys %$reachable ) {
352 if( !$still_reachable{$v}
353 && ( $contig->{$v} eq $gst
354 || ( exists $still_contig->{$v}
355 && $still_contig->{$v} eq $gst ) ) ) {
357 if( exists $still_contig->{$h} ) {
359 $conflict->{$group_readings->{$gst}} = 1;
360 $still_contig->{$h} = '';
362 $still_contig->{$h} = $gst;
365 } # else we don't need $h in this group.
367 } # endif $h eq $root
371 # Now we have some hypothetical vertices in $still_contig that are the
372 # "real" group memberships. Replace these in $contig.
373 foreach my $v ( keys %$contig ) {
374 next unless ref $contig->{$v};
375 $contig->{$v} = $still_contig->{$v};
379 # Now that we have all the node group memberships, calculate followed/
380 # non-followed/unknown values for each reading. Also figure out the
381 # reading's evident parent(s).
382 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
383 my $gst = $rdghash->{'group'};
384 my $part = $subgraph->{$gst};
385 my @roots = $part->predecessorless_vertices;
386 $rdghash->{'independent_occurrence'} = scalar @roots;
387 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
388 # Find the parent readings, if any, of this reading.
390 foreach my $wit ( @roots ) {
391 # Look in the main stemma to find this witness's parent(s), and look
392 # up the reading that the parent holds.
393 foreach my $wparent( $graph->predecessors( $wit ) ) {
394 my $pgroup = $contig->{$wparent};
396 push( @rdgparents, $group_readings->{$pgroup} );
400 $rdghash->{'reading_parents'} = \@rdgparents;
402 # Find the number of times this reading was altered, and the number of
403 # times we're not sure.
404 my( %nofollow, %unknownfollow );
405 foreach my $wit ( $part->vertices ) {
406 foreach my $wchild ( $graph->successors( $wit ) ) {
407 next if $part->has_vertex( $wchild );
408 if( $reading_roots{$wchild} && $contig->{$wchild} ) {
409 # It definitely changed here.
410 $nofollow{$wchild} = 1;
411 } elsif( !($contig->{$wchild}) ) {
412 # The child is a hypothetical node not definitely in
413 # any group. Answer is unknown.
414 $unknownfollow{$wchild} = 1;
415 } # else it's a non-root node in a known group, and therefore
416 # is presumed to have its reading from its group, not this link.
419 $rdghash->{'not_followed'} = keys %nofollow;
420 $rdghash->{'follow_unknown'} = keys %unknownfollow;
423 # Now write the group and conflict information into the respective rows.
424 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
425 $rdghash->{'conflict'} = $conflict->{$rdghash->{'text'}};
426 my @members = grep { $contig->{$_} eq $rdghash->{'group'} } keys %$contig;
427 $rdghash->{'group'} = wit_stringify( \@members );
430 $variant_row->{'genealogical'} = !( keys %$conflict );
435 my( $tree, $root, $contighash ) = @_;
436 # First, delete hypothetical leaves / orphans until there are none left.
437 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
438 $tree->successorless_vertices;
439 while( @orphan_hypotheticals ) {
440 $tree->delete_vertices( @orphan_hypotheticals );
441 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
442 $tree->successorless_vertices;
444 # Then delete a hypothetical root with only one successor, moving the
445 # root to the first child that has no other predecessors.
446 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
447 my @nextroot = $tree->successors( $root );
448 $tree->delete_vertex( $root );
449 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
451 # The tree has been modified in place, but we need to know the new root.
452 $root = undef unless $root && $tree->has_vertex( $root );
455 # Add the variant, subject to a.c. representation logic.
456 # This assumes that we will see the 'main' version before the a.c. version.
457 sub add_variant_wit {
458 my( $arr, $wit, $acstr ) = @_;
460 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
462 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
464 push( @$arr, $wit ) unless $skip;
467 sub _useful_variant {
468 my( $group_readings, $graph, $acstr ) = @_;
470 # TODO Decide what to do with AC witnesses
472 # Sort by group size and return
474 my( @readings, @groups ); # The sorted groups for our answer.
475 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
476 keys %$group_readings ) {
477 push( @readings, $rdg );
478 push( @groups, $group_readings->{$rdg} );
479 if( @{$group_readings->{$rdg}} > 1 ) {
482 my( $wit ) = @{$group_readings->{$rdg}};
483 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
484 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
487 if( $is_useful > 1 ) {
488 return( \@readings, \@groups );
494 =head2 wit_stringify( $groups )
496 Takes an array of witness groupings and produces a string like
497 ['A','B'] / ['C','D','E'] / ['F']
504 # If we were passed an array of witnesses instead of an array of
505 # groupings, then "group" the witnesses first.
506 unless( ref( $groups->[0] ) ) {
507 my $mkgrp = [ $groups ];
510 foreach my $g ( @$groups ) {
511 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
513 return join( ' / ', @gst );
516 # Helper function to ensure that X and X a.c. never appear in the same list.
517 sub _add_to_witlist {
518 my( $wit, $list, $acstr ) = @_;
521 map { $inlist{$_} = $idx++ } @$list;
522 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
524 unless( exists $inlist{$acwit} ) {
525 push( @$list, $acwit.$acstr );
528 if( exists( $inlist{$wit.$acstr} ) ) {
529 # Replace the a.c. version with the main witness
530 my $i = $inlist{$wit.$acstr};
533 push( @$list, $wit );
539 my( $lista, $listb ) = @_;
542 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
543 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
544 my @set = grep { $union{$_} == 1 } keys %union;
545 return map { $scalars{$_} } @set;
552 This package is free software and is provided "as is" without express
553 or implied warranty. You can redistribute it and/or modify it under
554 the same terms as Perl itself.
558 Tara L Andrews E<lt>aurum@cpan.orgE<gt>