1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
8 use JSON qw/ encode_json decode_json /;
11 use Text::Tradition::Stemma;
13 use vars qw/ @EXPORT_OK /;
14 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
18 Text::Tradition::Analysis - functions for stemma analysis of a tradition
23 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
24 my $t = Text::Tradition->new(
25 'name' => 'this is a text',
27 'file' => '/path/to/tei_parallel_seg_file.xml' );
28 $t->add_stemma( 'dotfile' => $stemmafile );
30 my $variant_data = run_analysis( $tradition );
31 # Recalculate rank $n treating all orthographic variants as equivalent
32 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
36 Text::Tradition is a library for representation and analysis of collated
37 texts, particularly medieval ones. The Collation is the central feature of
38 a Tradition, where the text, its sequence of readings, and its relationships
39 between readings are actually kept.
43 =head2 run_analysis( $tradition, %opts )
45 Runs the analysis described in analyze_variant_location on every location in the
46 collation of the given tradition, with the given options. These include:
50 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
51 is 0 (i.e. the first).
53 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55 =item * merge_types - Specify a list of relationship types, where related readings
56 should be treated as identical for the purposes of analysis.
63 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
65 my $datafile = 't/data/florilegium_tei_ps.xml';
66 my $tradition = Text::Tradition->new( 'input' => 'TEI',
68 'file' => $datafile );
69 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
70 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
72 my %expected_genealogical = (
103 my $data = run_analysis( $tradition );
104 foreach my $row ( @{$data->{'variants'}} ) {
105 # Account for rows that used to be "not useful"
106 unless( exists $expected_genealogical{$row->{'id'}} ) {
107 $expected_genealogical{$row->{'id'}} = 1;
109 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
110 "Got correct genealogical flag for row " . $row->{'id'} );
112 is( $data->{'conflict_count'}, 34, "Got right conflict count" );
113 is( $data->{'variant_count'}, 58, "Got right total variant number" );
120 my( $tradition, %opts ) = @_;
121 my $c = $tradition->collation;
123 my $stemma_id = $opts{'stemma_id'} || 0;
124 my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
125 my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
128 my $stemma = $tradition->stemma( $stemma_id );
129 # Figure out which witnesses we are working with
130 my @lacunose = $stemma->hypotheticals;
131 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
132 map { push( @tradition_wits, $_->sigil."_ac" ) if $_->is_layered }
133 $tradition->witnesses;
134 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
136 # Find and mark 'common' ranks for exclusion, unless they were
137 # explicitly specified.
140 foreach my $rdg ( $c->common_readings ) {
141 $common_rank{$rdg->rank} = 1;
143 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
146 # Group the variants to send to the solver
149 foreach my $rank ( @ranks ) {
150 my $missing = [ @lacunose ];
151 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
152 $lacunae{$rank} = $missing;
156 my $answer = solve_variants( $stemma, @groups );
158 # Do further analysis on the answer
159 my $conflict_count = 0;
160 foreach my $idx ( 0 .. $#ranks ) {
161 my $location = $answer->{'variants'}->[$idx];
162 # Add the rank back in
163 $location->{'id'} = $ranks[$idx];
164 # Add the lacunae back in
165 $location->{'missing'} = $lacunae{$ranks[$idx]};
166 # Run the extra analysis we need.
167 analyze_location( $tradition, $stemma->graph, $location );
168 # Add the reading text back in
169 foreach my $rdghash ( @{$location->{'readings'}} ) {
171 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
172 my $rdg = $c->reading( $rdghash->{'readingid'} );
173 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
176 $answer->{'conflict_count'} = $conflict_count;
181 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
183 Groups the variants at the given $rank of the collation, treating any
184 relationships in @merge_relationship_types as equivalent. $lacunose should
185 be a reference to an array, to which the sigla of lacunose witnesses at this
186 rank will be appended.
188 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
189 by the witnesses listed in $groups->[$n].
193 # Return group_readings, groups, lacunose
195 my( $tradition, $rank, $lacunose, $collapse ) = @_;
196 my $c = $tradition->collation;
197 # All the regexps here are to get rid of space characters in witness names.
198 my $aclabel = $c->ac_label;
199 $aclabel =~ s/\s/_/g;
200 # Get the alignment table readings
201 my %readings_at_rank;
203 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
204 my $rdg = $tablewit->{'tokens'}->[$rank-1];
205 my $wit = $tablewit->{'witness'};
207 if( $rdg && $rdg->{'t'}->is_lacuna ) {
208 _add_to_witlist( $wit, $lacunose, $aclabel );
210 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
212 _add_to_witlist( $wit, \@gap_wits, $aclabel );
216 # Group the readings, collapsing groups by relationship if needed
217 my %grouped_readings;
218 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
219 # Skip readings that have been collapsed into others.
220 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
221 my @wits = $rdg->witnesses;
222 map { s/\s/_/g } @wits;
224 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
225 foreach my $other ( $rdg->related_readings( $filter ) ) {
226 my @otherwits = $other->witnesses;
227 map { s/\s/_/g } @otherwits;
228 push( @wits, @otherwits );
229 $grouped_readings{$other->id} = 0;
232 $grouped_readings{$rdg->id} = \@wits;
234 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
235 # Get rid of our collapsed readings
236 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
237 keys %grouped_readings
240 return \%grouped_readings;
243 =head2 solve_variants( $graph, @groups )
245 Sends the set of groups to the external graph solver service and returns
246 a cleaned-up answer, adding the rank IDs back where they belong.
248 The JSON has the form
249 { "graph": [ stemmagraph DOT string without newlines ],
250 "groupings": [ array of arrays of groups, one per rank ] }
252 The answer has the form
253 { "variants" => [ array of variant location structures ],
254 "variant_count" => total,
255 "conflict_count" => number of conflicts detected,
256 "genealogical_count" => number of solutions found }
261 my( $stemma, @groups ) = @_;
263 # Make the json with stemma + groups
264 my $jsonstruct = { 'graph' => $stemma->editable( ' ' ), 'groupings' => [] };
265 foreach my $ghash ( @groups ) {
267 foreach my $k ( sort keys %$ghash ) {
268 push( @grouping, $ghash->{$k} );
270 push( @{$jsonstruct->{'groupings'}}, \@grouping );
272 my $json = encode_json( $jsonstruct );
274 # Send it off and get the result
275 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
276 my $ua = LWP::UserAgent->new();
277 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
278 'Content' => $json );
281 if( $resp->is_success ) {
282 $answer = decode_json( $resp->content );
284 # Fall back to the old method.
285 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
286 . "; falling back to perl method";
287 $answer = perl_solver( $stemma, @{$jsonstruct->{'groupings'}} );
290 # Fold the result back into what we know about the groups.
292 my $genealogical = 0;
293 foreach my $idx ( 0 .. $#groups ) {
294 my( $calc_groups, $result ) = @{$answer->[$idx]};
295 $genealogical++ if $result;
296 my $input_group = $groups[$idx];
297 foreach my $k ( sort keys %$input_group ) {
298 my $cg = shift @$calc_groups;
299 $input_group->{$k} = $cg;
302 'genealogical' => $result,
305 foreach my $k ( keys %$input_group ) {
306 push( @{$vstruct->{'readings'}},
307 { 'readingid' => $k, 'group' => $input_group->{$k}} );
309 push( @$variants, $vstruct );
312 return { 'variants' => $variants,
313 'variant_count' => scalar @$variants,
314 'genealogical_count' => $genealogical };
317 =head2 analyze_location ( $tradition, $graph, $location_hash )
319 Given the tradition, its stemma graph, and the solution from the graph solver,
320 work out the rest of the information we want. For each reading we need missing,
321 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
325 sub analyze_location {
326 my ( $tradition, $graph, $variant_row ) = @_;
328 # Make a hash of all known node memberships, and make the subgraphs.
330 my $reading_roots = {};
332 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
333 my $rid = $rdghash->{'readingid'};
334 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
337 my $part = $graph->copy;
339 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
340 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
341 $subgraph->{$rid} = $part;
342 # Get the reading roots.
343 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
346 # Now that we have all the node group memberships, calculate followed/
347 # non-followed/unknown values for each reading. Also figure out the
348 # reading's evident parent(s).
349 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
350 # Group string key - TODO do we need this?
351 my $gst = wit_stringify( $rdghash->{'group'} );
352 my $rid = $rdghash->{'readingid'};
354 my $part = $subgraph->{$rid};
356 # Start figuring things out.
357 my @roots = $part->predecessorless_vertices;
358 $rdghash->{'independent_occurrence'} = scalar @roots;
359 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
360 # Find the parent readings, if any, of this reading.
362 foreach my $wit ( @roots ) {
363 # Look in the main stemma to find this witness's extant or known-reading
364 # immediate ancestor(s), and look up the reading that each ancestor olds.
365 my @check = $graph->predecessors( $wit );
368 foreach my $wparent( @check ) {
369 my $preading = $contig->{$wparent};
371 $rdgparents{$preading} = 1;
373 push( @next, $graph->predecessors( $wparent ) );
379 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
381 # Find the number of times this reading was altered, and the number of
382 # times we're not sure.
383 my( %nofollow, %unknownfollow );
384 foreach my $wit ( $part->vertices ) {
385 foreach my $wchild ( $graph->successors( $wit ) ) {
386 next if $part->has_vertex( $wchild );
387 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
388 # It definitely changed here.
389 $nofollow{$wchild} = 1;
390 } elsif( !($contig->{$wchild}) ) {
391 # The child is a hypothetical node not definitely in
392 # any group. Answer is unknown.
393 $unknownfollow{$wchild} = 1;
394 } # else it's a non-root node in a known group, and therefore
395 # is presumed to have its reading from its group, not this link.
398 $rdghash->{'not_followed'} = keys %nofollow;
399 $rdghash->{'follow_unknown'} = keys %unknownfollow;
401 # Now say whether this reading represents a conflict.
402 unless( $variant_row->{'genealogical'} ) {
403 $rdghash->{'conflict'} = @roots != 1;
409 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
411 ** NOTE ** This method should hopefully not be called - it is not guaranteed
412 to be correct. Serves as a backup for the real solver.
414 Runs an analysis of the given tradition, at the location given in $rank,
415 against the graph of the stemma specified in $stemma_id. The argument
416 @merge_relationship_types is an optional list of relationship types for
417 which readings so related should be treated as equivalent.
419 Returns a nested array data structure as follows:
421 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
423 where the group list is the array of arrays passed in for each element of @groups,
424 possibly with the addition of hypothetical readings.
430 my( $stemma, @groups ) = @_;
431 my $graph = $stemma->graph;
433 foreach my $g ( @groups ) {
434 push( @answer, _solve_variant_location( $graph, $g ) );
439 sub _solve_variant_location {
440 my( $graph, $groups ) = @_;
447 # Mark each ms as in its own group, first.
448 foreach my $g ( @$groups ) {
449 my $gst = wit_stringify( $g );
450 map { $contig->{$_} = $gst } @$g;
453 # Now for each unmarked node in the graph, initialize an array
454 # for possible group memberships. We will use this later to
455 # resolve potential conflicts.
456 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
457 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
458 my $gst = wit_stringify( $g ); # This is the group name
459 # Copy the graph, and delete all non-members from the new graph.
460 my $part = $graph->copy;
462 $part->delete_vertices(
463 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
465 # Now look to see if our group is connected.
467 # We have to take directionality into account.
468 # How many root nodes do we have?
469 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
470 $part->predecessorless_vertices;
471 # Assuming that @$g > 1, find the first root node that has at
472 # least one successor belonging to our group. If this reading
473 # is genealogical, there should be only one, but we will check
474 # that implicitly later.
475 foreach my $root ( @roots ) {
476 # Prune the tree to get rid of extraneous hypotheticals.
477 $root = _prune_subtree( $part, $root, $contig );
479 # Save this root for our group.
480 push( @group_roots, $root );
481 # Get all the successor nodes of our root.
484 # Dispense with the trivial case of one reading.
486 @group_roots = ( $wit );
487 foreach my $v ( $part->vertices ) {
488 $part->delete_vertex( $v ) unless $v eq $wit;
492 if( @group_roots > 1 ) {
493 $conflict->{$gst} = 1;
496 # Paint the 'hypotheticals' with our group.
497 foreach my $wit ( $part->vertices ) {
498 if( ref( $contig->{$wit} ) ) {
499 push( @{$contig->{$wit}}, $gst );
500 } elsif( $contig->{$wit} ne $gst ) {
501 warn "How did we get here?";
506 # Save the relevant subgraph.
507 $subgraph->{$gst} = $part;
510 # For each of our hypothetical readings, flatten its 'contig' array if
511 # the array contains zero or one group. If we have any unflattened arrays,
512 # we may need to run the resolution process. If the reading is already known
513 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
516 foreach my $wit ( keys %$contig ) {
517 next unless ref( $contig->{$wit} );
518 if( @{$contig->{$wit}} > 1 ) {
519 if( $is_conflicted ) {
520 $contig->{$wit} = ''; # We aren't going to decide.
522 push( @resolve, $wit );
525 my $gst = pop @{$contig->{$wit}};
526 $contig->{$wit} = $gst || '';
531 my $still_contig = {};
532 foreach my $h ( @resolve ) {
533 # For each of the hypothetical readings with more than one possibility,
534 # try deleting it from each of its member subgraphs in turn, and see
535 # if that breaks the contiguous grouping.
536 # TODO This can still break in a corner case where group A can use
537 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
538 # Revisit this if necessary; it could get brute-force nasty.
539 foreach my $gst ( @{$contig->{$h}} ) {
540 my $gpart = $subgraph->{$gst}->copy();
541 # If we have come this far, there is only one root and everything
542 # is reachable from it.
543 my( $root ) = $gpart->predecessorless_vertices;
545 map { $reachable->{$_} = 1 } $gpart->vertices;
547 # Try deleting the hypothetical node.
548 $gpart->delete_vertex( $h );
550 # See if we still have a single root.
551 my @roots = $gpart->predecessorless_vertices;
552 warn "This shouldn't have happened" unless @roots;
554 # $h is needed by this group.
555 if( exists( $still_contig->{$h} ) ) {
557 $conflict->{$gst} = 1;
558 $still_contig->{$h} = '';
560 $still_contig->{$h} = $gst;
564 # $h is somewhere in the middle. See if everything
565 # else can still be reached from the root.
566 my %still_reachable = ( $root => 1 );
567 map { $still_reachable{$_} = 1 }
568 $gpart->all_successors( $root );
569 foreach my $v ( keys %$reachable ) {
571 if( !$still_reachable{$v}
572 && ( $contig->{$v} eq $gst
573 || ( exists $still_contig->{$v}
574 && $still_contig->{$v} eq $gst ) ) ) {
576 if( exists $still_contig->{$h} ) {
578 $conflict->{$gst} = 1;
579 $still_contig->{$h} = '';
581 $still_contig->{$h} = $gst;
584 } # else we don't need $h in this group.
586 } # endif $h eq $root
590 # Now we have some hypothetical vertices in $still_contig that are the
591 # "real" group memberships. Replace these in $contig.
592 foreach my $v ( keys %$contig ) {
593 next unless ref $contig->{$v};
594 $contig->{$v} = $still_contig->{$v};
598 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
599 my $variant_row = [ [], $is_genealogical ];
600 # Fill in the groupings from $contig.
601 foreach my $g ( @$groups ) {
602 my $gst = wit_stringify( $g );
603 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
604 push( @{$variant_row->[0]}, \@realgroup );
610 my( $tree, $root, $contighash ) = @_;
611 # First, delete hypothetical leaves / orphans until there are none left.
612 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
613 $tree->successorless_vertices;
614 while( @orphan_hypotheticals ) {
615 $tree->delete_vertices( @orphan_hypotheticals );
616 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
617 $tree->successorless_vertices;
619 # Then delete a hypothetical root with only one successor, moving the
620 # root to the first child that has no other predecessors.
621 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
622 my @nextroot = $tree->successors( $root );
623 $tree->delete_vertex( $root );
624 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
626 # The tree has been modified in place, but we need to know the new root.
627 $root = undef unless $root && $tree->has_vertex( $root );
630 # Add the variant, subject to a.c. representation logic.
631 # This assumes that we will see the 'main' version before the a.c. version.
632 sub add_variant_wit {
633 my( $arr, $wit, $acstr ) = @_;
635 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
637 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
639 push( @$arr, $wit ) unless $skip;
642 sub _useful_variant {
643 my( $group_readings, $graph, $acstr ) = @_;
645 # TODO Decide what to do with AC witnesses
647 # Sort by group size and return
649 my( @readings, @groups ); # The sorted groups for our answer.
650 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
651 keys %$group_readings ) {
652 push( @readings, $rdg );
653 push( @groups, $group_readings->{$rdg} );
654 if( @{$group_readings->{$rdg}} > 1 ) {
657 my( $wit ) = @{$group_readings->{$rdg}};
658 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
659 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
662 if( $is_useful > 1 ) {
663 return( \@readings, \@groups );
669 =head2 wit_stringify( $groups )
671 Takes an array of witness groupings and produces a string like
672 ['A','B'] / ['C','D','E'] / ['F']
679 # If we were passed an array of witnesses instead of an array of
680 # groupings, then "group" the witnesses first.
681 unless( ref( $groups->[0] ) ) {
682 my $mkgrp = [ $groups ];
685 foreach my $g ( @$groups ) {
686 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
688 return join( ' / ', @gst );
691 # Helper function to ensure that X and X a.c. never appear in the same list.
692 sub _add_to_witlist {
693 my( $wit, $list, $acstr ) = @_;
696 map { $inlist{$_} = $idx++ } @$list;
697 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
699 unless( exists $inlist{$acwit} ) {
700 push( @$list, $acwit.$acstr );
703 if( exists( $inlist{$wit.$acstr} ) ) {
704 # Replace the a.c. version with the main witness
705 my $i = $inlist{$wit.$acstr};
708 push( @$list, $wit );
714 my( $lista, $listb ) = @_;
717 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
718 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
719 my @set = grep { $union{$_} == 1 } keys %union;
720 return map { $scalars{$_} } @set;
727 This package is free software and is provided "as is" without express
728 or implied warranty. You can redistribute it and/or modify it under
729 the same terms as Perl itself.
733 Tara L Andrews E<lt>aurum@cpan.orgE<gt>