1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
8 use JSON qw/ encode_json decode_json /;
11 use Text::Tradition::Stemma;
13 use vars qw/ @EXPORT_OK /;
14 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
18 Text::Tradition::Analysis - functions for stemma analysis of a tradition
23 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
24 my $t = Text::Tradition->new(
25 'name' => 'this is a text',
27 'file' => '/path/to/tei_parallel_seg_file.xml' );
28 $t->add_stemma( 'dotfile' => $stemmafile );
30 my $variant_data = run_analysis( $tradition );
31 # Recalculate rank $n treating all orthographic variants as equivalent
32 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
36 Text::Tradition is a library for representation and analysis of collated
37 texts, particularly medieval ones. The Collation is the central feature of
38 a Tradition, where the text, its sequence of readings, and its relationships
39 between readings are actually kept.
43 =head2 run_analysis( $tradition, %opts )
45 Runs the analysis described in analyze_variant_location on every location in the
46 collation of the given tradition, with the given options. These include:
50 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
51 is 0 (i.e. the first).
53 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55 =item * merge_types - Specify a list of relationship types, where related readings
56 should be treated as identical for the purposes of analysis.
63 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
65 my $datafile = 't/data/florilegium_tei_ps.xml';
66 my $tradition = Text::Tradition->new( 'input' => 'TEI',
68 'file' => $datafile );
69 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
70 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
72 my %expected_genealogical = (
103 my $data = run_analysis( $tradition );
104 foreach my $row ( @{$data->{'variants'}} ) {
105 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
106 "Got correct genealogical flag for row " . $row->{'id'} );
108 is( $data->{'conflict_count'}, 16, "Got right conflict count" );
109 is( $data->{'variant_count'}, 28, "Got right total variant number" );
116 my( $tradition, %opts ) = @_;
117 my $c = $tradition->collation;
119 my $stemma_id = $opts{'stemma_id'} || 0;
120 my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
121 my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
124 my $stemma = $tradition->stemma( $stemma_id );
125 # Figure out which witnesses we are working with
126 my @lacunose = $stemma->hypotheticals;
127 push( @lacunose, _symmdiff( [ $stemma->witnesses ],
128 [ map { $_->sigil } $tradition->witnesses ] ) );
130 # Find and mark 'common' ranks for exclusion, unless they were
131 # explicitly specified.
134 foreach my $rdg ( $tradition->collation->common_readings ) {
135 $common_rank{$rdg->rank} = 1;
137 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
140 # Group the variants to send to the solver
142 foreach my $rank ( @ranks ) {
143 push( @groups, group_variants( $tradition, $rank, \@lacunose, \@collapse ) );
147 my $answer = solve_variants( $stemma->editable( ' ' ), @groups );
149 # Do further analysis on the answer
150 foreach my $idx ( 0 .. $#ranks ) {
151 my $location = $answer->{'variants'}->[$idx];
152 # Add the rank back in
153 $location->{'id'} = $ranks[$idx];
154 # Run the extra analysis we need.
155 # For each reading we need missing, conflict, reading_parents,
156 # independent_occurrence, followed, not_followed, follow_unknown
157 analyze_location( $tradition, $stemma->graph, $location );
163 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
165 Groups the variants at the given $rank of the collation, treating any
166 relationships in @merge_relationship_types as equivalent. $lacunose should
167 be a reference to an array, to which the sigla of lacunose witnesses at this
168 rank will be appended.
170 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
171 by the witnesses listed in $groups->[$n].
175 # Return group_readings, groups, lacunose
177 my( $tradition, $rank, $lacunose, $collapse ) = @_;
178 my $c = $tradition->collation;
179 # Get the alignment table readings
180 my %readings_at_rank;
182 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
183 my $rdg = $tablewit->{'tokens'}->[$rank-1];
184 if( $rdg && $rdg->{'t'}->is_lacuna ) {
185 _add_to_witlist( $tablewit->{'witness'}, $lacunose,
186 $tradition->collation->ac_label );
188 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
190 _add_to_witlist( $tablewit->{'witness'}, \@gap_wits,
191 $tradition->collation->ac_label );
195 # Group the readings, collapsing groups by relationship if needed
196 my %grouped_readings;
197 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
198 # Skip readings that have been collapsed into others.
199 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
200 my @wits = $rdg->witnesses;
202 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
203 foreach my $other ( $rdg->related_readings( $filter ) ) {
204 push( @wits, $other->witnesses );
205 $grouped_readings{$other->id} = 0;
208 $grouped_readings{$rdg->id} = \@wits;
210 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
211 # Get rid of our collapsed readings
212 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
213 keys %grouped_readings
216 return \%grouped_readings;
219 =head2 solve_variants( $graph, @groups )
221 Sends the set of groups to the external graph solver service and returns
222 a cleaned-up answer, adding the rank IDs back where they belong.
224 The JSON has the form
225 { "graph": [ stemmagraph DOT string without newlines ],
226 "groupings": [ array of arrays of groups, one per rank ] }
228 The answer has the form
229 { "variants" => [ array of variant location structures ],
230 "variant_count" => total,
231 "conflict_count" => number of conflicts detected,
232 "genealogical_count" => number of solutions found }
237 my( $graph, @groups ) = @_;
239 # Make the json with stemma + groups
240 my $jsonstruct = { 'graph' => $graph, 'groupings' => [] };
241 foreach my $ghash ( @groups ) {
243 foreach my $k ( sort keys %$ghash ) {
244 push( @grouping, $ghash->{$k} );
246 push( @{$jsonstruct->{'groupings'}}, \@grouping );
248 my $json = encode_json( $jsonstruct );
250 # Send it off and get the result
251 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
252 my $ua = LWP::UserAgent->new();
253 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
254 'Content' => $json );
257 if( $resp->is_success ) {
258 $answer = decode_json( $resp->content );
260 # Either throw an error or fall back to the old method.
261 die "Solver returned " . $resp->status_line . " / " . $resp->content;
264 # Fold the result back into what we know about the groups.
266 my $genealogical = 0;
267 foreach my $idx ( 0 .. $#groups ) {
268 my( $calc_groups, $result ) = @{$answer->[$idx]};
269 $genealogical++ if $result;
270 my $input_group = $groups[$idx];
271 foreach my $k ( sort keys %$input_group ) {
272 my $cg = shift @$calc_groups;
273 $input_group->{$k} = $cg;
276 'genealogical' => $result,
279 foreach my $k ( keys %$input_group ) {
280 push( @{$vstruct->{'readings'}},
281 { 'readingid' => $k, 'group' => $dg } );
283 push( @$variants, $vstruct );
286 return { 'variants' => $variants,
287 'variant_count' => scalar @$variants,
288 'genealogical_count' => $genealogical };
291 =head2 analyze_variant_location( $tradition, $rank, $stemma_id, @merge_relationship_types )
293 Runs an analysis of the given tradition, at the location given in $rank,
294 against the graph of the stemma specified in $stemma_id. The argument
295 @merge_relationship_types is an optional list of relationship types for
296 which readings so related should be treated as equivalent.
298 Returns a data structure as follows:
301 'genealogical' => boolean,
302 'readings => [ { readingid => $reading_id,
303 group => [ witnesses ],
304 conflict => [ conflicting ],
305 missing => [ excluded ] }, ... ]
307 where 'conflicting' is the list of witnesses whose readings conflict with
308 this group, and 'excluded' is the list of witnesses either not present in
309 the stemma or lacunose at this location.
313 sub analyze_variant_location {
314 my( $tradition, $rank, $sid, @collapse ) = @_;
315 # Get the readings in this tradition at this rank
316 my @rank_rdgs = grep { $_->rank == $rank } $tradition->collation->readings;
317 # Get the applicable stemma
318 my $undirected; # TODO Allow undirected distance tree analysis too
319 my $stemma = $tradition->stemma( $sid );
320 my $graph = $stemma->graph;
321 # Figure out which witnesses we are working with
322 my @lacunose = $stemma->hypotheticals;
323 push( @lacunose, _symmdiff( [ $stemma->witnesses ],
324 [ map { $_->sigil } $tradition->witnesses ] ) );
326 # Now group the readings
327 my( $readings, $groups ) = _useful_variant(
328 group_variants( $tradition, $rank, \@lacunose, \@collapse ),
329 $graph, $tradition->collation->ac_label );
330 return unless scalar @$readings;
331 my $group_readings = {};
332 # Lookup table group string -> readings
333 foreach my $x ( 0 .. $#$groups ) {
334 $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
343 my $variant_row = { 'id' => $rank, 'readings' => [] };
344 # Mark each ms as in its own group, first.
345 $DB::single = 1 if $rank == 81;
346 foreach my $g ( @$groups ) {
347 my $gst = wit_stringify( $g );
348 map { $contig->{$_} = $gst } @$g;
350 # Now for each unmarked node in the graph, initialize an array
351 # for possible group memberships. We will use this later to
352 # resolve potential conflicts.
353 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
354 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
355 my $gst = wit_stringify( $g ); # This is the group name
356 # Copy the graph, and delete all non-members from the new graph.
357 my $part = $graph->copy;
359 $part->delete_vertices(
360 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
362 # Now look to see if our group is connected.
363 if( $undirected ) { # For use with distance trees etc.
364 # Find all vertices reachable from the first (arbitrary) group
365 # member. If we are genealogical this should include them all.
367 map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
368 # TODO This is a terrible way to do distance trees, since all
369 # non-leaf nodes are included in every graph part now. We may
370 # have to go back to SPDP.
373 # We have to take directionality into account.
374 # How many root nodes do we have?
375 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
376 $part->predecessorless_vertices;
377 # Assuming that @$g > 1, find the first root node that has at
378 # least one successor belonging to our group. If this reading
379 # is genealogical, there should be only one, but we will check
380 # that implicitly later.
381 foreach my $root ( @roots ) {
382 # Prune the tree to get rid of extraneous hypotheticals.
383 $root = _prune_subtree( $part, $root, $contig );
385 # Save this root for our group.
386 push( @group_roots, $root );
387 # Get all the successor nodes of our root.
390 # Dispense with the trivial case of one reading.
392 @group_roots = ( $wit );
393 foreach my $v ( $part->vertices ) {
394 $part->delete_vertex( $v ) unless $v eq $wit;
399 map { $reading_roots{$_} = 1 } @group_roots;
400 if( @group_roots > 1 ) {
401 $conflict->{$group_readings->{$gst}} = 1;
404 # Paint the 'hypotheticals' with our group.
405 foreach my $wit ( $part->vertices ) {
406 if( ref( $contig->{$wit} ) ) {
407 push( @{$contig->{$wit}}, $gst );
408 } elsif( $contig->{$wit} ne $gst ) {
409 warn "How did we get here?";
414 # Start to write the reading, and save the group subgraph.
415 my $reading = { 'readingid' => $group_readings->{$gst},
416 'missing' => wit_stringify( \@lacunose ),
417 'group' => $gst }; # This will change if we find no conflict
418 # Save the relevant subgraph.
419 $subgraph->{$gst} = $part;
420 push( @{$variant_row->{'readings'}}, $reading );
423 # For each of our hypothetical readings, flatten its 'contig' array if
424 # the array contains zero or one group. If we have any unflattened arrays,
425 # we may need to run the resolution process. If the reading is already known
426 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
429 foreach my $wit ( keys %$contig ) {
430 next unless ref( $contig->{$wit} );
431 if( @{$contig->{$wit}} > 1 ) {
432 if( $is_conflicted ) {
433 $contig->{$wit} = ''; # We aren't going to decide.
435 push( @resolve, $wit );
438 my $gst = pop @{$contig->{$wit}};
439 $contig->{$wit} = $gst || '';
444 my $still_contig = {};
445 foreach my $h ( @resolve ) {
446 # For each of the hypothetical readings with more than one possibility,
447 # try deleting it from each of its member subgraphs in turn, and see
448 # if that breaks the contiguous grouping.
449 # TODO This can still break in a corner case where group A can use
450 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
451 # Revisit this if necessary; it could get brute-force nasty.
452 foreach my $gst ( @{$contig->{$h}} ) {
453 my $gpart = $subgraph->{$gst}->copy();
454 # If we have come this far, there is only one root and everything
455 # is reachable from it.
456 my( $root ) = $gpart->predecessorless_vertices;
458 map { $reachable->{$_} = 1 } $gpart->vertices;
460 # Try deleting the hypothetical node.
461 $gpart->delete_vertex( $h );
463 # See if we still have a single root.
464 my @roots = $gpart->predecessorless_vertices;
465 warn "This shouldn't have happened" unless @roots;
467 # $h is needed by this group.
468 if( exists( $still_contig->{$h} ) ) {
470 $conflict->{$group_readings->{$gst}} = 1;
471 $still_contig->{$h} = '';
473 $still_contig->{$h} = $gst;
477 # $h is somewhere in the middle. See if everything
478 # else can still be reached from the root.
479 my %still_reachable = ( $root => 1 );
480 map { $still_reachable{$_} = 1 }
481 $gpart->all_successors( $root );
482 foreach my $v ( keys %$reachable ) {
484 if( !$still_reachable{$v}
485 && ( $contig->{$v} eq $gst
486 || ( exists $still_contig->{$v}
487 && $still_contig->{$v} eq $gst ) ) ) {
489 if( exists $still_contig->{$h} ) {
491 $conflict->{$group_readings->{$gst}} = 1;
492 $still_contig->{$h} = '';
494 $still_contig->{$h} = $gst;
497 } # else we don't need $h in this group.
499 } # endif $h eq $root
503 # Now we have some hypothetical vertices in $still_contig that are the
504 # "real" group memberships. Replace these in $contig.
505 foreach my $v ( keys %$contig ) {
506 next unless ref $contig->{$v};
507 $contig->{$v} = $still_contig->{$v};
511 # Now that we have all the node group memberships, calculate followed/
512 # non-followed/unknown values for each reading. Also figure out the
513 # reading's evident parent(s).
514 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
515 my $gst = $rdghash->{'group'};
516 my $part = $subgraph->{$gst};
517 my @roots = $part->predecessorless_vertices;
518 $rdghash->{'independent_occurrence'} = scalar @roots;
519 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
520 # Find the parent readings, if any, of this reading.
522 foreach my $wit ( @roots ) {
523 # Look in the main stemma to find this witness's extant or known-reading
524 # immediate ancestor(s), and look up the reading that each ancestor olds.
525 my @check = $graph->predecessors( $wit );
528 foreach my $wparent( @check ) {
529 my $pgroup = $contig->{$wparent};
531 $rdgparents{$group_readings->{$pgroup}} = 1;
533 push( @next, $graph->predecessors( $wparent ) );
539 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
541 # Find the number of times this reading was altered, and the number of
542 # times we're not sure.
543 my( %nofollow, %unknownfollow );
544 foreach my $wit ( $part->vertices ) {
545 foreach my $wchild ( $graph->successors( $wit ) ) {
546 next if $part->has_vertex( $wchild );
547 if( $reading_roots{$wchild} && $contig->{$wchild} ) {
548 # It definitely changed here.
549 $nofollow{$wchild} = 1;
550 } elsif( !($contig->{$wchild}) ) {
551 # The child is a hypothetical node not definitely in
552 # any group. Answer is unknown.
553 $unknownfollow{$wchild} = 1;
554 } # else it's a non-root node in a known group, and therefore
555 # is presumed to have its reading from its group, not this link.
558 $rdghash->{'not_followed'} = keys %nofollow;
559 $rdghash->{'follow_unknown'} = keys %unknownfollow;
562 # Now write the group and conflict information into the respective rows.
563 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
564 $rdghash->{'conflict'} = $conflict->{$rdghash->{'readingid'}};
565 my @members = grep { $contig->{$_} eq $rdghash->{'group'} } keys %$contig;
566 $rdghash->{'group'} = wit_stringify( \@members );
569 $variant_row->{'genealogical'} = !( keys %$conflict );
574 my( $tree, $root, $contighash ) = @_;
575 # First, delete hypothetical leaves / orphans until there are none left.
576 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
577 $tree->successorless_vertices;
578 while( @orphan_hypotheticals ) {
579 $tree->delete_vertices( @orphan_hypotheticals );
580 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
581 $tree->successorless_vertices;
583 # Then delete a hypothetical root with only one successor, moving the
584 # root to the first child that has no other predecessors.
585 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
586 my @nextroot = $tree->successors( $root );
587 $tree->delete_vertex( $root );
588 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
590 # The tree has been modified in place, but we need to know the new root.
591 $root = undef unless $root && $tree->has_vertex( $root );
594 # Add the variant, subject to a.c. representation logic.
595 # This assumes that we will see the 'main' version before the a.c. version.
596 sub add_variant_wit {
597 my( $arr, $wit, $acstr ) = @_;
599 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
601 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
603 push( @$arr, $wit ) unless $skip;
606 sub _useful_variant {
607 my( $group_readings, $graph, $acstr ) = @_;
609 # TODO Decide what to do with AC witnesses
611 # Sort by group size and return
613 my( @readings, @groups ); # The sorted groups for our answer.
614 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
615 keys %$group_readings ) {
616 push( @readings, $rdg );
617 push( @groups, $group_readings->{$rdg} );
618 if( @{$group_readings->{$rdg}} > 1 ) {
621 my( $wit ) = @{$group_readings->{$rdg}};
622 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
623 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
626 if( $is_useful > 1 ) {
627 return( \@readings, \@groups );
633 =head2 wit_stringify( $groups )
635 Takes an array of witness groupings and produces a string like
636 ['A','B'] / ['C','D','E'] / ['F']
643 # If we were passed an array of witnesses instead of an array of
644 # groupings, then "group" the witnesses first.
645 unless( ref( $groups->[0] ) ) {
646 my $mkgrp = [ $groups ];
649 foreach my $g ( @$groups ) {
650 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
652 return join( ' / ', @gst );
655 # Helper function to ensure that X and X a.c. never appear in the same list.
656 sub _add_to_witlist {
657 my( $wit, $list, $acstr ) = @_;
660 map { $inlist{$_} = $idx++ } @$list;
661 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
663 unless( exists $inlist{$acwit} ) {
664 push( @$list, $acwit.$acstr );
667 if( exists( $inlist{$wit.$acstr} ) ) {
668 # Replace the a.c. version with the main witness
669 my $i = $inlist{$wit.$acstr};
672 push( @$list, $wit );
678 my( $lista, $listb ) = @_;
681 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
682 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
683 my @set = grep { $union{$_} == 1 } keys %union;
684 return map { $scalars{$_} } @set;
691 This package is free software and is provided "as is" without express
692 or implied warranty. You can redistribute it and/or modify it under
693 the same terms as Perl itself.
697 Tara L Andrews E<lt>aurum@cpan.orgE<gt>