1 package Text::Tradition::Analysis;
8 use Text::Tradition::Stemma;
10 use vars qw/ @EXPORT_OK /;
11 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
15 Text::Tradition::Analysis - functions for stemma analysis of a tradition
20 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
21 my $t = Text::Tradition->new(
22 'name' => 'this is a text',
24 'file' => '/path/to/tei_parallel_seg_file.xml' );
25 $t->add_stemma( 'dotfile' => $stemmafile );
27 my $variant_data = run_analysis( $tradition );
28 # Recalculate rank $n treating all orthographic variants as equivalent
29 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
33 Text::Tradition is a library for representation and analysis of collated
34 texts, particularly medieval ones. The Collation is the central feature of
35 a Tradition, where the text, its sequence of readings, and its relationships
36 between readings are actually kept.
40 =head2 run_analysis( $tradition, $stemma_id, @merge_relationship_types )
42 Runs the analysis described in analyze_variant_location on every location
43 in the collation of the given tradition, against the stemma specified in
44 $stemma_id. If $stemma_id is not specified, it defaults to 0 (referencing
45 the first stemma saved for the tradition.)
47 The optional @merge_relationship_types contains a list of relationship types
48 to treat as equivalent for the analysis.
53 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
55 my $datafile = 't/data/florilegium_tei_ps.xml';
56 my $tradition = Text::Tradition->new( 'input' => 'TEI',
58 'file' => $datafile );
59 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
60 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
62 my $data = run_analysis( $tradition );
63 # TODO Check genealogical count
64 is( $data->{'genealogical_count'}, 13, "Got right genealogical count" );
65 is( $data->{'conflict_count'}, 16, "Got right conflict count" );
66 is( $data->{'variant_count'}, 28, "Got right total variant number" );
73 my( $tradition, $stemma_id, @collapse ) = @_;
74 $stemma_id = 0 unless $stemma_id;
76 # Run the variant analysis on every rank in the graph that doesn't
77 # have a common reading. Return the results.
78 my @variants; # holds results from analyze_variant_location
79 my $genealogical; # counter of 'genealogical' variants
80 my $conflicts; # counter of conflicting readings
82 # Find and mark 'common' ranks for exclusion.
84 foreach my $rdg ( $tradition->collation->common_readings ) {
85 $common_rank{$rdg->rank} = 1;
88 foreach my $rank ( 1 .. $tradition->collation->end->rank-1 ) {
89 next if $common_rank{$rank};
90 my $variant_row = analyze_variant_location(
91 $tradition, $rank, $stemma_id, @collapse );
92 next unless $variant_row;
93 push( @variants, $variant_row );
94 $genealogical++ if $variant_row->{'genealogical'};
95 $conflicts += grep { $_->{'conflict'} } @{$variant_row->{'readings'}};
99 'variants' => \@variants,
100 'variant_count' => scalar @variants, # TODO redundant
101 'conflict_count' => $conflicts,
102 'genealogical_count' => $genealogical,
106 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
108 Groups the variants at the given $rank of the collation, treating any
109 relationships in @merge_relationship_types as equivalent. $lacunose should
110 be a reference to an array, to which the sigla of lacunose witnesses at this
111 rank will be appended.
113 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
114 by the witnesses listed in $groups->[$n].
118 # Return group_readings, groups, lacunose
120 my( $tradition, $rank, $lacunose, $collapse ) = @_;
121 my $c = $tradition->collation;
122 # Get the alignment table readings
123 my %readings_at_rank;
125 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
126 my $rdg = $tablewit->{'tokens'}->[$rank-1];
127 if( $rdg && $rdg->{'t'}->is_lacuna ) {
128 _add_to_witlist( $tablewit->{'witness'}, $lacunose,
129 $tradition->collation->ac_label );
131 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
133 _add_to_witlist( $tablewit->{'witness'}, \@gap_wits,
134 $tradition->collation->ac_label );
138 # Group the readings, collapsing groups by relationship if needed
139 my %grouped_readings;
140 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
141 # Skip readings that have been collapsed into others.
142 next if exists $grouped_readings{$rdg->text} && !$grouped_readings{$rdg->text};
143 my @wits = $rdg->witnesses;
145 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
146 foreach my $other ( $rdg->related_readings( $filter ) ) {
147 push( @wits, $other->witnesses );
148 $grouped_readings{$other->text} = 0;
151 $grouped_readings{$rdg->text} = \@wits;
153 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
154 # Get rid of our collapsed readings
155 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
156 keys %grouped_readings
159 return \%grouped_readings;
162 =head2 analyze_variant_location( $tradition, $rank, $stemma_id, @merge_relationship_types )
164 Runs an analysis of the given tradition, at the location given in $rank,
165 against the graph of the stemma specified in $stemma_id. The argument
166 @merge_relationship_types is an optional list of relationship types for
167 which readings so related should be treated as equivalent.
169 Returns a data structure as follows:
172 'genealogical' => boolean,
173 'readings => [ { text => $reading_text,
174 group => [ witnesses ],
175 conflict => [ conflicting ],
176 missing => [ excluded ] }, ... ]
178 where 'conflicting' is the list of witnesses whose readings conflict with
179 this group, and 'excluded' is the list of witnesses either not present in
180 the stemma or lacunose at this location.
184 sub analyze_variant_location {
185 my( $tradition, $rank, $sid, @collapse ) = @_;
186 # Get the readings in this tradition at this rank
187 my @rank_rdgs = grep { $_->rank == $rank } $tradition->collation->readings;
188 # Get the applicable stemma
189 my $undirected; # TODO Allow undirected distance tree analysis too
190 my $stemma = $tradition->stemma( $sid );
191 my $graph = $stemma->graph;
192 # Figure out which witnesses we are working with
193 my @lacunose = _set( 'symmdiff', [ $stemma->witnesses ],
194 [ map { $_->sigil } $tradition->witnesses ] );
196 # Now group the readings
197 my( $readings, $groups ) = _useful_variant(
198 group_variants( $tradition, $rank, \@lacunose, \@collapse ),
199 $graph, $tradition->collation->ac_label );
200 return unless scalar @$readings;
201 my $group_readings = {};
202 # Lookup table group string -> readings
203 foreach my $x ( 0 .. $#$groups ) {
204 $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
212 my $variant_row = { 'id' => $rank, 'readings' => [] };
213 # Mark each ms as in its own group, first.
214 foreach my $g ( @$groups ) {
215 my $gst = wit_stringify( $g );
216 map { $contig->{$_} = $gst } @$g;
218 # Now for each unmarked node in the graph, initialize an array
219 # for possible group memberships. We will use this later to
220 # resolve potential conflicts.
221 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
222 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
223 my $gst = wit_stringify( $g ); # This is the group name
224 my $reachable = { $g->[0] => 1 };
225 # Copy the graph, and delete all non-members from the new graph.
226 my $part = $graph->copy;
228 $part->delete_vertices(
229 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
231 # Now look to see if our group is connected.
232 if( $undirected ) { # For use with distance trees etc.
233 # Find all vertices reachable from the first (arbitrary) group
234 # member. If we are genealogical this should include them all.
235 map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
236 # TODO This is a terrible way to do distance trees, since all
237 # non-leaf nodes are included in every graph part now. We may
238 # have to go back to SPDP.
241 # Dispense with the trivial case of one reading.
242 # We have to take directionality into account.
243 # How many root nodes do we have?
244 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
245 $part->source_vertices;
246 # Assuming that @$g > 1, find the first root node that has at
247 # least one successor belonging to our group. If this reading
248 # is genealogical, there should be only one, but we will check
249 # that implicitly later.
250 my $nodes_in_subtree = 0;
251 foreach my $root ( @roots ) {
252 # Prune the tree to get rid of extraneous hypotheticals.
253 $root = _prune_subtree( $part, $root, $contig );
254 # Get all the successor nodes of our root.
255 my $tmp_reach = { $root => 1 };
256 map { $tmp_reach->{$_} = 1 } $part->all_successors( $root );
257 # Skip this root if none of our successors are in our group
258 # (e.g. isolated 'hypothetical' witnesses with no group)
259 next unless grep { $contig->{$_} } keys %$tmp_reach;
260 if( keys %$tmp_reach > $nodes_in_subtree ) {
261 $nodes_in_subtree = keys %$tmp_reach;
262 $reachable = $tmp_reach;
266 } # else it is a single-node group, nothing to calculate.
269 # None of the 'reachable' nodes should be marked as being in another
270 # group. Paint the 'hypotheticals' with our group while we are at it,
271 # unless there is a conflict present.
272 foreach ( keys %$reachable ) {
273 if( ref $contig->{$_} ) {
274 push( @{$contig->{$_}}, $gst );
275 } elsif( $contig->{$_} ne $gst ) {
276 $conflict->{$group_readings->{$gst}} = $group_readings->{$contig->{$_}};
277 } # else it is an 'extant' node marked with our group already.
279 # None of the unreachable nodes should be in our group either.
280 foreach ( $part->vertices ) {
281 next if $reachable->{$_};
282 if( $contig->{$_} eq $gst ) {
283 $conflict->{$group_readings->{$gst}} = $group_readings->{$gst};
288 # Now, if we have a conflict, we can write the reading in full. If not,
289 # we have to save the subgraph so that we can resolve possible conflicts
290 # on hypothetical nodes.
291 $is_conflicted = 1 if exists $conflict->{$group_readings->{$gst}};
294 my $reading = { 'text' => $group_readings->{$gst},
295 'missing' => wit_stringify( \@lacunose ),
296 'group' => $gst }; # This will change if we find no conflict
297 if( $is_conflicted ) {
298 $reading->{'conflict'} = $conflict->{$group_readings->{$gst}}
300 # Save the relevant subgraph.
301 $subgraph->{$gst} = { 'graph' => $part,
302 'root' => $group_root,
303 'reachable' => $reachable };
305 push( @{$variant_row->{'readings'}}, $reading );
308 # Now that we have gone through all the rows, check the hypothetical
309 # readings for conflict if we haven't found one yet.
310 if( keys %$subgraph && !keys %$conflict ) {
312 foreach ( keys %$contig ) {
313 next unless ref $contig->{$_};
314 if( scalar @{$contig->{$_}} > 1 ) {
315 push( @resolve, $_ );
317 $contig->{$_} = scalar @{$contig->{$_}} ? $contig->{$_}->[0] : '';
320 # Do we still have a possible conflict?
321 my $still_contig = {};
322 foreach my $h ( @resolve ) {
323 # For each of the hypothetical readings with more than one possibility,
324 # try deleting it from each of its member subgraphs in turn, and see
325 # if that breaks the contiguous grouping.
326 # TODO This can still break in a corner case where group A can use
327 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
328 # Revisit this if necessary; it could get brute-force nasty.
329 foreach my $gst ( @{$contig->{$h}} ) {
330 my $gpart = $subgraph->{$gst}->{'graph'}->copy;
331 my $reachable = $subgraph->{$gst}->{'reachable'};
332 $gpart->delete_vertex( $h );
333 # Is everything else still reachable from the root?
334 # TODO If $h was the root, see if we still have a single root.
335 my %still_reachable = ( $subgraph->{$gst}->{'root'} => 1 );
336 map { $still_reachable{$_} = 1 }
337 $gpart->all_successors( $subgraph->{$gst}->{'root'} );
338 foreach my $v ( keys %$reachable ) {
340 if( !$still_reachable{$v}
341 && ( $contig->{$v} eq $gst
342 || ( exists $still_contig->{$v}
343 && $still_contig->{$v} eq $gst ) ) ) {
345 if( exists $still_contig->{$h} ) {
347 $conflict->{$group_readings->{$gst}} =
348 $group_readings->{$still_contig->{$h}};
350 $still_contig->{$h} = $gst;
353 } # else we don't need $h in this group.
358 # Now, assuming no conflict, we have some hypothetical vertices in
359 # $still_contig that are the "real" group memberships. Replace these
361 unless ( keys %$conflict ) {
362 foreach my $v ( keys %$contig ) {
363 next unless ref $contig->{$v};
364 $contig->{$v} = $still_contig->{$v};
369 # Now write the group and conflict information into the respective rows.
371 map { $missing{$_} = 1 } @lacunose; # quick lookup table
372 foreach my $rdg ( @{$variant_row->{'readings'}} ) {
373 $rdg->{'conflict'} = $conflict->{$rdg->{'text'}};
374 next if $rdg->{'conflict'};
375 my @members = grep { $contig->{$_} eq $rdg->{'group'} && !$missing{$_} }
377 $rdg->{'group'} = wit_stringify( \@members );
380 $variant_row->{'genealogical'} = !( keys %$conflict );
385 my( $tree, $root, $contighash ) = @_;
386 # First, delete hypothetical leaves / orphans until there are none left.
387 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
388 $tree->successorless_vertices;
389 while( @orphan_hypotheticals ) {
390 $tree->delete_vertices( @orphan_hypotheticals );
391 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
392 $tree->successorless_vertices;
394 # Then delete a hypothetical root with only one successor, moving the
396 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
397 my @nextroot = $tree->successors( $root );
398 $tree->delete_vertex( $root );
399 $root = $nextroot[0];
401 # The tree has been modified in place, but we need to know the new root.
404 # Add the variant, subject to a.c. representation logic.
405 # This assumes that we will see the 'main' version before the a.c. version.
406 sub add_variant_wit {
407 my( $arr, $wit, $acstr ) = @_;
409 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
411 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
413 push( @$arr, $wit ) unless $skip;
416 sub _useful_variant {
417 my( $group_readings, $graph, $acstr ) = @_;
419 # TODO Decide what to do with AC witnesses
421 # Sort by group size and return
423 my( @readings, @groups ); # The sorted groups for our answer.
424 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
425 keys %$group_readings ) {
426 push( @readings, $rdg );
427 push( @groups, $group_readings->{$rdg} );
428 if( @{$group_readings->{$rdg}} > 1 ) {
431 my( $wit ) = @{$group_readings->{$rdg}};
432 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
433 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
436 if( $is_useful > 1 ) {
437 return( \@readings, \@groups );
443 =head2 wit_stringify( $groups )
445 Takes an array of witness groupings and produces a string like
446 ['A','B'] / ['C','D','E'] / ['F']
453 # If we were passed an array of witnesses instead of an array of
454 # groupings, then "group" the witnesses first.
455 unless( ref( $groups->[0] ) ) {
456 my $mkgrp = [ $groups ];
459 foreach my $g ( @$groups ) {
460 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
462 return join( ' / ', @gst );
465 # Helper function to ensure that X and X a.c. never appear in the same list.
466 sub _add_to_witlist {
467 my( $wit, $list, $acstr ) = @_;
470 map { $inlist{$_} = $idx++ } @$list;
471 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
473 unless( exists $inlist{$acwit} ) {
474 push( @$list, $acwit.$acstr );
477 if( exists( $inlist{$wit.$acstr} ) ) {
478 # Replace the a.c. version with the main witness
479 my $i = $inlist{$wit.$acstr};
482 push( @$list, $wit );
488 my( $op, $lista, $listb ) = @_;
491 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
492 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
494 if( $op eq 'intersection' ) {
495 @set = grep { $union{$_} == 2 } keys %union;
496 } elsif( $op eq 'symmdiff' ) {
497 @set = grep { $union{$_} == 1 } keys %union;
498 } elsif( $op eq 'union' ) {
501 return map { $scalars{$_} } @set;
508 This package is free software and is provided "as is" without express
509 or implied warranty. You can redistribute it and/or modify it under
510 the same terms as Perl itself.
514 Tara L Andrews E<lt>aurum@cpan.orgE<gt>