1 package Text::Tradition::Analysis;
8 use Text::Tradition::Stemma;
10 use vars qw/ @EXPORT_OK /;
11 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
15 Text::Tradition::Analysis - functions for stemma analysis of a tradition
20 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
21 my $t = Text::Tradition->new(
22 'name' => 'this is a text',
24 'file' => '/path/to/tei_parallel_seg_file.xml' );
25 $t->add_stemma( 'dotfile' => $stemmafile );
27 my $variant_data = run_analysis( $tradition );
28 # Recalculate rank $n treating all orthographic variants as equivalent
29 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
33 Text::Tradition is a library for representation and analysis of collated
34 texts, particularly medieval ones. The Collation is the central feature of
35 a Tradition, where the text, its sequence of readings, and its relationships
36 between readings are actually kept.
40 =head2 run_analysis( $tradition, $stemma_id, @merge_relationship_types )
42 Runs the analysis described in analyze_variant_location on every location
43 in the collation of the given tradition, against the stemma specified in
44 $stemma_id. If $stemma_id is not specified, it defaults to 0 (referencing
45 the first stemma saved for the tradition.)
47 The optional @merge_relationship_types contains a list of relationship types
48 to treat as equivalent for the analysis.
53 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
55 my $datafile = 't/data/florilegium_tei_ps.xml';
56 my $tradition = Text::Tradition->new( 'input' => 'TEI',
58 'file' => $datafile );
59 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
60 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
62 my $data = run_analysis( $tradition );
64 is( $data->{'genealogical_count'}, 42, "Got right genealogical count" );
65 is( $data->{'conflict_count'}, 17, "Got right conflict count" );
66 is( $data->{'variant_count'}, 58, "Got right total variant number" );
73 my( $tradition, $stemma_id, @collapse ) = @_;
74 $stemma_id = 0 unless $stemma_id;
76 # Run the variant analysis on every rank in the graph that doesn't
77 # have a common reading. Return the results.
78 my @variants; # holds results from analyze_variant_location
79 my $genealogical; # counter of 'genealogical' variants
80 my $conflicts; # counter of conflicting readings
82 # Find and mark 'common' ranks for exclusion.
84 foreach my $rdg ( $tradition->collation->common_readings ) {
85 $common_rank{$rdg->rank} = 1;
88 foreach my $rank ( 1 .. $tradition->collation->end->rank-1 ) {
89 next if $common_rank{$rank};
90 my $variant_row = analyze_variant_location(
91 $tradition, $rank, $stemma_id, @collapse );
92 push( @variants, $variant_row );
93 $genealogical++ if $variant_row->{'genealogical'};
94 $conflicts += grep { $_->{'conflict'} } @{$variant_row->{'readings'}};
98 'variants' => \@variants,
99 'variant_count' => scalar @variants, # TODO redundant
100 'conflict_count' => $conflicts,
101 'genealogical_count' => $genealogical,
105 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
107 Groups the variants at the given $rank of the collation, treating any
108 relationships in @merge_relationship_types as equivalent. $lacunose should
109 be a reference to an array, to which the sigla of lacunose witnesses at this
110 rank will be appended.
112 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
113 by the witnesses listed in $groups->[$n].
117 # Return group_readings, groups, lacunose
119 my( $tradition, $rank, $lacunose, $collapse ) = @_;
120 my $c = $tradition->collation;
121 # Get the alignment table readings
122 my %readings_at_rank;
124 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
125 my $rdg = $tablewit->{'tokens'}->[$rank-1];
126 if( $rdg && $rdg->{'t'}->is_lacuna ) {
127 push( @$lacunose, $tablewit->{'witness'} );
129 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
131 push( @gap_wits, $tablewit->{'witness'} );
135 # Group the readings, collapsing groups by relationship if needed
136 my %grouped_readings;
137 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
138 # Skip readings that have been collapsed into others.
139 next if exists $grouped_readings{$rdg->text} && !$grouped_readings{$rdg->text};
140 my @wits = $rdg->witnesses;
142 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
143 foreach my $other ( $rdg->related_readings( $filter ) ) {
144 push( @wits, $other->witnesses );
145 $grouped_readings{$other->text} = 0;
148 $grouped_readings{$rdg->text} = \@wits;
150 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
151 # Get rid of our collapsed readings
152 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
153 keys %grouped_readings
156 # Return the readings and groups, sorted by size
157 my( @readings, @groups );
158 foreach my $r ( sort { @{$grouped_readings{$b}} <=> @{$grouped_readings{$a}} }
159 keys %grouped_readings ) {
160 push( @readings, $r );
161 push( @groups, $grouped_readings{$r} );
163 return( \@readings, \@groups );
166 =head2 analyze_variant_location( $tradition, $rank, $stemma_id, @merge_relationship_types )
168 Runs an analysis of the given tradition, at the location given in $rank,
169 against the graph of the stemma specified in $stemma_id. The argument
170 @merge_relationship_types is an optional list of relationship types for
171 which readings so related should be treated as equivalent.
173 Returns a data structure as follows:
176 'genealogical' => boolean,
177 'readings => [ { text => $reading_text,
178 group => [ witnesses ],
179 conflict => [ conflicting ],
180 missing => [ excluded ] }, ... ]
182 where 'conflicting' is the list of witnesses whose readings conflict with
183 this group, and 'excluded' is the list of witnesses either not present in
184 the stemma or lacunose at this location.
188 sub analyze_variant_location {
189 my( $tradition, $rank, $sid, @collapse ) = @_;
190 $DB::single = 1 if @collapse;
191 # Get the readings in this tradition at this rank
192 my @rank_rdgs = grep { $_->rank == $rank } $tradition->collation->readings;
193 # Get the applicable stemma
194 my $undirected; # TODO Allow undirected distance tree analysis too
195 my $stemma = $tradition->stemma( $sid );
196 my $graph = $stemma->graph;
197 # Figure out which witnesses we are working with
198 my @lacunose = _set( 'symmdiff', [ $stemma->witnesses ],
199 [ map { $_->sigil } $tradition->witnesses ] );
201 # Now group the readings
202 my( $readings, $groups ) =
203 group_variants( $tradition, $rank, \@lacunose, \@collapse );
204 my $group_readings = {};
205 # Lookup table group string -> readings
206 foreach my $x ( 0 .. $#$groups ) {
207 $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
215 my $variant_row = { 'id' => $rank, 'readings' => [] };
216 # Mark each ms as in its own group, first.
217 foreach my $g ( @$groups ) {
218 my $gst = wit_stringify( $g );
219 map { $contig->{$_} = $gst } @$g;
221 # Now for each unmarked node in the graph, initialize an array
222 # for possible group memberships. We will use this later to
223 # resolve potential conflicts.
224 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
225 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
226 my $gst = wit_stringify( $g ); # This is the group name
227 my $reachable = { $g->[0] => 1 };
228 # Copy the graph, and delete all non-members from the new graph.
229 my $part = $graph->copy;
231 $part->delete_vertices(
232 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
234 # Now look to see if our group is connected.
235 if( $undirected ) { # For use with distance trees etc.
236 # Find all vertices reachable from the first (arbitrary) group
237 # member. If we are genealogical this should include them all.
238 map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
239 # TODO This is a terrible way to do distance trees, since all
240 # non-leaf nodes are included in every graph part now. We may
241 # have to go back to SPDP.
244 # Dispense with the trivial case of one reading.
245 # We have to take directionality into account.
246 # How many root nodes do we have?
247 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
248 $part->source_vertices;
249 # Assuming that @$g > 1, find the first root node that has at
250 # least one successor belonging to our group. If this reading
251 # is genealogical, there should be only one, but we will check
252 # that implicitly later.
253 my $nodes_in_subtree = 0;
254 foreach my $root ( @roots ) {
255 # Prune the tree to get rid of extraneous hypotheticals.
256 $root = _prune_subtree( $part, $root, $contig );
257 # Get all the successor nodes of our root.
258 my $tmp_reach = { $root => 1 };
259 map { $tmp_reach->{$_} = 1 } $part->all_successors( $root );
260 # Skip this root if none of our successors are in our group
261 # (e.g. isolated 'hypothetical' witnesses with no group)
262 next unless grep { $contig->{$_} } keys %$tmp_reach;
263 if( keys %$tmp_reach > $nodes_in_subtree ) {
264 $nodes_in_subtree = keys %$tmp_reach;
265 $reachable = $tmp_reach;
269 } # else it is a single-node group, nothing to calculate.
272 # None of the 'reachable' nodes should be marked as being in another
273 # group. Paint the 'hypotheticals' with our group while we are at it,
274 # unless there is a conflict present.
275 foreach ( keys %$reachable ) {
276 if( ref $contig->{$_} ) {
277 push( @{$contig->{$_}}, $gst );
278 } elsif( $contig->{$_} ne $gst ) {
279 $conflict->{$group_readings->{$gst}} = $group_readings->{$contig->{$_}};
280 } # else it is an 'extant' node marked with our group already.
282 # None of the unreachable nodes should be in our group either.
283 foreach ( $part->vertices ) {
284 next if $reachable->{$_};
285 if( $contig->{$_} eq $gst ) {
286 $conflict->{$group_readings->{$gst}} = $group_readings->{$gst};
291 # Now, if we have a conflict, we can write the reading in full. If not,
292 # we have to save the subgraph so that we can resolve possible conflicts
293 # on hypothetical nodes.
294 $is_conflicted = 1 if exists $conflict->{$group_readings->{$gst}};
297 my $reading = { 'text' => $group_readings->{$gst},
298 'missing' => wit_stringify( \@lacunose ),
299 'group' => $gst }; # This will change if we find no conflict
300 if( $is_conflicted ) {
301 $reading->{'conflict'} = $conflict->{$group_readings->{$gst}}
303 # Save the relevant subgraph.
304 $subgraph->{$gst} = { 'graph' => $part,
305 'root' => $group_root,
306 'reachable' => $reachable };
308 push( @{$variant_row->{'readings'}}, $reading );
311 # Now that we have gone through all the rows, check the hypothetical
312 # readings for conflict if we haven't found one yet.
313 if( keys %$subgraph && !keys %$conflict ) {
315 foreach ( keys %$contig ) {
316 next unless ref $contig->{$_};
317 if( scalar @{$contig->{$_}} > 1 ) {
318 push( @resolve, $_ );
320 $contig->{$_} = scalar @{$contig->{$_}} ? $contig->{$_}->[0] : '';
323 # Do we still have a possible conflict?
324 my $still_contig = {};
325 foreach my $h ( @resolve ) {
326 # For each of the hypothetical readings with more than one possibility,
327 # try deleting it from each of its member subgraphs in turn, and see
328 # if that breaks the contiguous grouping.
329 # TODO This can still break in a corner case where group A can use
330 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
331 # Revisit this if necessary; it could get brute-force nasty.
332 foreach my $gst ( @{$contig->{$h}} ) {
333 my $gpart = $subgraph->{$gst}->{'graph'}->copy;
334 my $reachable = $subgraph->{$gst}->{'reachable'};
335 $gpart->delete_vertex( $h );
336 # Is everything else still reachable from the root?
337 # TODO If $h was the root, see if we still have a single root.
338 my %still_reachable = ( $subgraph->{$gst}->{'root'} => 1 );
339 map { $still_reachable{$_} = 1 }
340 $gpart->all_successors( $subgraph->{$gst}->{'root'} );
341 foreach my $v ( keys %$reachable ) {
343 if( !$still_reachable{$v}
344 && ( $contig->{$v} eq $gst
345 || ( exists $still_contig->{$v}
346 && $still_contig->{$v} eq $gst ) ) ) {
348 if( exists $still_contig->{$h} ) {
350 $conflict->{$group_readings->{$gst}} =
351 $group_readings->{$still_contig->{$h}};
353 $still_contig->{$h} = $gst;
356 } # else we don't need $h in this group.
361 # Now, assuming no conflict, we have some hypothetical vertices in
362 # $still_contig that are the "real" group memberships. Replace these
364 unless ( keys %$conflict ) {
365 foreach my $v ( keys %$contig ) {
366 next unless ref $contig->{$v};
367 $contig->{$v} = $still_contig->{$v};
372 # Now write the group and conflict information into the respective rows.
374 map { $missing{$_} = 1 } @lacunose; # quick lookup table
375 foreach my $rdg ( @{$variant_row->{'readings'}} ) {
376 $rdg->{'conflict'} = $conflict->{$rdg->{'text'}};
377 next if $rdg->{'conflict'};
378 my @members = grep { $contig->{$_} eq $rdg->{'group'} && !$missing{$_} }
380 $rdg->{'group'} = wit_stringify( \@members );
383 $variant_row->{'genealogical'} = !( keys %$conflict );
388 my( $tree, $root, $contighash ) = @_;
389 # First, delete hypothetical leaves / orphans until there are none left.
390 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
391 $tree->successorless_vertices;
392 while( @orphan_hypotheticals ) {
393 $tree->delete_vertices( @orphan_hypotheticals );
394 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
395 $tree->successorless_vertices;
397 # Then delete a hypothetical root with only one successor, moving the
399 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
400 my @nextroot = $tree->successors( $root );
401 $tree->delete_vertex( $root );
402 $root = $nextroot[0];
404 # The tree has been modified in place, but we need to know the new root.
407 # Add the variant, subject to a.c. representation logic.
408 # This assumes that we will see the 'main' version before the a.c. version.
409 sub add_variant_wit {
410 my( $arr, $wit, $acstr ) = @_;
412 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
414 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
416 push( @$arr, $wit ) unless $skip;
419 =head2 wit_stringify( $groups )
421 Takes an array of witness groupings and produces a string like
422 ['A','B'] / ['C','D','E'] / ['F']
429 # If we were passed an array of witnesses instead of an array of
430 # groupings, then "group" the witnesses first.
431 unless( ref( $groups->[0] ) ) {
432 my $mkgrp = [ $groups ];
435 foreach my $g ( @$groups ) {
436 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
438 return join( ' / ', @gst );
442 my( $op, $lista, $listb ) = @_;
445 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
446 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
448 if( $op eq 'intersection' ) {
449 @set = grep { $union{$_} == 2 } keys %union;
450 } elsif( $op eq 'symmdiff' ) {
451 @set = grep { $union{$_} == 1 } keys %union;
452 } elsif( $op eq 'union' ) {
455 return map { $scalars{$_} } @set;
462 This package is free software and is provided "as is" without express
463 or implied warranty. You can redistribute it and/or modify it under
464 the same terms as Perl itself.
468 Tara L Andrews E<lt>aurum@cpan.orgE<gt>